Commit ff69d574 authored by Marek Ostaszewski's avatar Marek Ostaszewski
Browse files

updated code for text mining/omnipath network

parent 3ee7e4c1
......@@ -2,4 +2,4 @@ param value
output_file output.txt
n 50
source stringdb
score 0
score 0.3
......@@ -7,7 +7,7 @@
# n: the number of new proteins in the extended network. By default 50
# score: is the string score (between 0,1). By default 0. it filters a posteriori because
# I was not able to find documentation on how to add the score in the API url request
# dbsource: the source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
# dbsource: the source of the protein-protein interaction data. Currently, text_mining, stringdb or omnipathdb.
# By default, stringdb, and then overlaps omnipathdb data to provide information about directionality.
# When source = "omnipathdb" it will retrieve all information for the genes from omnipath
# the output file name. It will contain the pairs gene A, gene B with their corresponding HGNC identifiers,
......@@ -22,7 +22,6 @@ if(length(args) > 0) { infile <- args[1] }
if(!file.exists(infile)) { message("No correct input given!")}
input <- readLines(infile)
config <- "config.txt"
if(length(args) > 1) { config <- args[2] }
if(!file.exists(config)) { message("No correct config given!")}
......@@ -33,7 +32,7 @@ n <-config[config$param == "n", "value"]
score <-config[config$param == "score", "value"]
dbsource <-config[config$param == "source", "value"]
if( !dbsource %in% c("stringdb", "omnipathdb")) {
if( !dbsource %in% c("stringdb", "omnipathdb", "text_mining")) {
stop("Invalid argument 'dbsource'. It should be omnipathdb or stringdb ")
}
if( ! ( score >= 0 & score <= 1 )) {
......@@ -50,22 +49,11 @@ if ("biomaRt" %in% installed.packages()[,1] == FALSE){
library(biomaRt)
message(paste0("there are ", length(unique(input))), " genes in the file")
if (dbsource == "stringdb") {
message( "mapping the hgnc identifiers to ensemble proteins used by stringdb with biomart" )
ensembl <- useMart("ensembl")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
ensprotein_idest<- getBM(attributes=c('ensembl_peptide_id', 'hgnc_symbol'),
filters = 'hgnc_symbol',
values = input,
mart = ensembl)
ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
message(paste0("doing the query to stringdb and retrieving n ",n, " neighbors" ))
### Function to retrieve StringDB interactors
retrieve_stringdb_net <- function(finput, fn, fscore) {
message(paste0("doing the query to stringdb and retrieving n ", fn, " neighbors"))
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
......@@ -75,124 +63,196 @@ if (dbsource == "stringdb") {
body = paste0("entities=", entities, "&additional=", n),
httr::verbose())
json <- jsonlite::fromJSON(httr::content(resp, as = "text"), flatten = FALSE)
protein_info <- json[[2]]
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('9606.', "", protein_info$protein )
protein_info_2ensp <- getBM(attributes=c('ensembl_peptide_id', "hgnc_symbol", 'entrezgene_id'),
filters = 'ensembl_peptide_id',
values = protein_info$protein,
mart = ensembl)
biomartCacheClear()
protein_info_2ensp <- subset(protein_info_2ensp, ensembl_peptide_id != "")
protein_info_2ensp$ensembl_peptide_id <- paste0("9606.", protein_info_2ensp$ensembl_peptide_id)
json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
ppi_network <- json[[1]]
ppi_network$string_score <- ppi_network$scores$`stringdb::score`
message("the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- subset(ppi_network, string_score > score)
### May be important for later, when node-level info can be used for filtering
# protein_info <- json[[2]]
# protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
# protein_info$protein <- gsub('9606.', "", protein_info$protein )
# protein_info <- protein_info$protein
ppi_network <- data.frame(source = json[[1]]$source,
target = json[[1]]$target,
score = json[[1]]$scores$`stringdb::score`,
stringsAsFactors = F)
ppi_network <- ppi_network[ppi_network$score >= fscore,]
ppi_network$source <- gsub('9606.', "", ppi_network$source)
ppi_network$target <- gsub('9606.', "", ppi_network$target)
ppi_network
ppi_network <- unique(ppi_network)
message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "source", by.y = "ensembl_peptide_id" )
colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "source_hgnc_symbol"
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "source_entrezgene_id"
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "target", by.y = "ensembl_peptide_id" )
colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "target_hgnc_symbol"
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "target_entrezgene_id"
ppi_network <- unique(ppi_network[c("source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol","target_entrezgene_id", "string_score")])
message("after converting to gene symbols, the network contains ", dim(ppi_network)[1], " interactions ")
return(ppi_network)
}
### Function to retrieve OmniPathDB interactors, by default only between input genes
retrieve_omnipath <- function(finput, only_between_inputs = T) {
message("Retrieving data from Omnipath")
url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=",
paste(finput, collapse = ","),"&fields=sources,references")
url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=", paste(protein_info_2ensp$hgnc_symbol, collapse = ","), "&fields=sources,references")
dataTsv <- RCurl::getURLContent( url )
myTextConnection <- textConnection( dataTsv )
result <- read.csv( myTextConnection, header = TRUE, sep = "\t", colClasses=c("character"))
resp <- httr::GET(url = url,
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
httr::verbose())
result <- read.table(text = httr::content(resp, as = "text", encoding = "UTF-8"),
sep = "\t", header = T)
result <- result[result$is_directed == 1,]
colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
filters = 'hgnc_symbol',
values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
mart = ensembl)
biomartCacheClear()
gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
ppi_network$pair <-apply(ppi_network,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
message(paste0("from ",length(unique(ppi_network$pair))," pairs of interacting proteins from stringdb"))
message(paste0("we found ", length(intersect(ppi_network$pair, result$pair) ), " interacting pairs in Omnipath"))
final_results <- unique(subset(result, pair %in% ppi_network$pair)[c( "source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references", "pair" )])
ppi_network <- subset(ppi_network, !pair %in% final_results$pair)
ppi_network <- within(ppi_network , rm(string_score, pair))
final_results <- merge( final_results[, -grep("pair", colnames(final_results))],
ppi_network, all = T )
final_results$is_directed <- ifelse(is.na(final_results$is_directed), 0, as.character(final_results$is_directed))
message(paste0(" And from those ",dim(subset(final_results, is_directed == 1) )[1], " interacting pairs have direction in Omnipath"))
if(only_between_inputs) {
result <- result[(result$source_hgnc_symbol %in% finput) & (result$target_hgnc_symbol %in% finput),]
}
### Restrict the number of columns, sign still missing
result <- result[,c("source_hgnc_symbol","target_hgnc_symbol",
"is_directed", "consensus_direction", "references")]
return(result)
}
### Load ensembl mart, will be used in id translation
ensembl <- useMart("ensembl", dataset="hsapiens_gene_ensembl")
ppi_network <- NULL
if (dbsource == "stringdb" | dbsource == "text_mining") {
message( "mapping the hgnc identifiers to ensemble proteins")
ensprotein_idest <- getBM(attributes=c('ensembl_peptide_id', 'hgnc_symbol'),
filters = 'hgnc_symbol',
values = input,
mart = ensembl)
ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
if( nrow( final_results ) == 0 ){
warning("None of the pairs of interacting proteins havee pased the filters")
if (dbsource == "stringdb"){
ppi_network <- retrieve_stringdb_net(ensprotein_idest$ensembl_peptide_id, n, score)
} else {
write.table(final_results, outputFile, sep = "\t", quote = F, row.names = F)
### This will remain a stub while no bulk request to text mining is possible
}
message("the network contains ", dim(ppi_network)[1], " interactions ")
if (length(ppi_network) > 0) {
all_proteins <- grep("^ENSP", unique(c(ppi_network$source, ppi_network$target)), value = T)
all_proteins_ids <- getBM(attributes=c('ensembl_peptide_id', "hgnc_symbol"),
filters = 'ensembl_peptide_id',
values = all_proteins,
mart = ensembl)
all_proteins_ids <- subset(all_proteins_ids, hgnc_symbol != "")
rownames(all_proteins_ids) <- all_proteins_ids[,1]
ppi_network <- data.frame(all_proteins_ids[ppi_network$source, c("hgnc_symbol")],
all_proteins_ids[ppi_network$target, c("hgnc_symbol")],
ppi_network$score,
stringsAsFactors = F)
colnames(ppi_network) <- c("source_hgnc_symbol", "target_hgnc_symbol", "score")
ppi_network <- unique(ppi_network)
### Remove HGNC NAs
ppi_network <- ppi_network[!(is.na(ppi_network$source_hgnc_symbol) | is.na(ppi_network$target_hgnc_symbol)),]
message("after converting to gene symbols, the network contains ", dim(ppi_network)[1], " interactions ")
}
}
if(length(ppi_network) == 0) {
message("No text mining network, relying only on OmniPathDB")
ppi_network <- retrieve_omnipath(input)
} else {
message("Retrieving data from Omnipath")
url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=", paste(input, collapse = ","), "&fields=sources,references")
dataTsv <- RCurl::getURLContent( url )
myTextConnection <- textConnection( dataTsv )
result <- read.csv( myTextConnection, header = TRUE, sep = "\t", colClasses=c("character"))
colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
message(paste0("there are ",length(unique(result$pair))," pairs of interacting proteins from omnipath"))
ensembl <- useMart("ensembl")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
omnipath_network <- retrieve_omnipath(unique(c(ppi_network$source_hgnc_symbol,ppi_network$target_hgnc_symbol)))
### Check for Omnipath interactions that are uniquely between the text mining pairs
### Combine source-target and target source identifiers (text mining interactions are reversible/non-directional)
### and check if Omnipath interactions are within this set
ppi_pairs <- c(paste(ppi_network$source_hgnc_symbol,",",ppi_network$target_hgnc_symbol),
paste(ppi_network$target_hgnc_symbol,",",ppi_network$source_hgnc_symbol))
omnipath_pairs <- paste(omnipath_network$source_hgnc_symbol,",",omnipath_network$target_hgnc_symbol)
ppi_network <- omnipath_network[omnipath_pairs %in% ppi_pairs,]
message("after OmniPathDB filtering, the network contains ", dim(ppi_network)[1], " interactions ")
}
if(nrow(result) == 0) {
warning("None of the pairs of interacting proteins have pased the filters")
} else {
### Add entrez at the end, if any results remain
gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
filters = 'hgnc_symbol',
values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
values = unique(c(ppi_network$source_hgnc_symbol, ppi_network$target_hgnc_symbol)),
mart = ensembl)
gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
rownames(gene_info_2entrez) <- gene_info_2entrez$hgnc_symbol
result <- unique(result[c( "source_hgnc_symbol", "source_entrezgene_id",
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references" )])
ppi_network <- cbind(ppi_network,
source_entrezgene_id = gene_info_2entrez[ppi_network$source_hgnc_symbol,"entrezgene_id"],
target_entrezgene_id = gene_info_2entrez[ppi_network$source_hgnc_symbol,"entrezgene_id"])
ppi_network <- ppi_network[,c("source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references")]
if( nrow( result ) == 0 ){
warning("None of the pairs of interacting proteins have pased the filters")
} else {
write.table(result, outputFile, sep = "\t", quote = F, row.names = F)
}
write.table(ppi_network, outputFile, sep = "\t", quote = F, row.names = F, col.names = T)
}
# gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
#
# result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
# colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
# result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
# colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
#
# result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
# ppi_network$pair <-apply(ppi_network,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
#
# message(paste0("from ",length(unique(ppi_network$pair))," pairs of interacting proteins from stringdb"))
# message(paste0("we found ", length(intersect(ppi_network$pair, result$pair) ), " interacting pairs in Omnipath"))
#
# final_results <- unique(subset(result, pair %in% ppi_network$pair)[c( "source_hgnc_symbol","source_entrezgene_id",
# "target_hgnc_symbol", "target_entrezgene_id",
# "is_directed", "consensus_direction", "references", "pair" )])
# ppi_network <- subset(ppi_network, !pair %in% final_results$pair)
# ppi_network <- within(ppi_network , rm(score, pair))
#
# final_results <- merge( final_results[, -grep("pair", colnames(final_results))],
# ppi_network, all = T )
# final_results$is_directed <- ifelse(is.na(final_results$is_directed), 0, as.character(final_results$is_directed))
#
# message(paste0(" And from those ",dim(subset(final_results, is_directed == 1) )[1], " interacting pairs have direction in Omnipath"))
#
# if( nrow( final_results ) == 0 ){
# warning("None of the pairs of interacting proteins havee pased the filters")
# } else {
# write.table(final_results, outputFile, sep = "\t", quote = F, row.names = F)
# }
# } else {
# message("Retrieving data from Omnipath")
#
# url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=", paste(input, collapse = ","), "&fields=sources,references")
# dataTsv <- RCurl::getURLContent( url )
# myTextConnection <- textConnection( dataTsv )
# result <- read.csv( myTextConnection, header = TRUE, sep = "\t", colClasses=c("character"))
#
# colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
# colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
# result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
# message(paste0("there are ",length(unique(result$pair))," pairs of interacting proteins from omnipath"))
# ensembl <- useMart("ensembl")
# ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
#
# gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
# filters = 'hgnc_symbol',
# values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
# mart = ensembl)
# gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
#
# result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
# colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
# result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
# colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
#
#
# result <- unique(result[c( "source_hgnc_symbol", "source_entrezgene_id",
# "target_hgnc_symbol", "target_entrezgene_id",
# "is_directed", "consensus_direction", "references" )])
#
#
# if( nrow( result ) == 0 ){
# warning("None of the pairs of interacting proteins have pased the filters")
# } else {
# write.table(result, outputFile, sep = "\t", quote = F, row.names = F)
# }
#
# }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment