Commit 40671fc5 authored by David Hoksza's avatar David Hoksza
Browse files

extending network script added

parent a656e288
......@@ -59,10 +59,11 @@ def main():
args = parser.parse_args()
with open(args.f) as myf:
allrsId = myf.readlines()
allrsId = myf.read().split("\n")
#thislist=["rs12720452", "rs1060501130"]
maxNumbRsIdForRequest=200 # there is a limit in the VEP rest
while len(allrsId) > 0 :
print("{} remaining".format(len(allrsId)))
sublist=allrsId[0:maxNumbRsIdForRequest]
allrsId=allrsId[maxNumbRsIdForRequest:]
result = getfreqfromVEPbulck(sublist)
......
......@@ -11,3 +11,24 @@ The script uses "config.txt" file, containing the list of minerva instances and
It provides output with the list of enriched map areas and pathways.
Statistical tests used for maps and pathways are different.
# **get_extended_network.R** Script for extending the original list of disease genes using PPI data
Should be run as `Rscript --vanilla get_extended_network.R <input_filename> <config_file>`
`input_filename` should contain list of genes as HGNC symbols, one entry per line. Default is "input.txt" in the script folder.
`config_file` contains the several parameters to perform the expansion of the data. Default is "config.txt" in the script folder.
The script uses "config.txt" file, containing the following parameters
- outputFile: the output file name containing the extended network and some edge attributes (see below).
- n: the number of new genes in the extended network. By default 50.
- score: is the string score (between 0,1). By default 0. It is used to filter a posteriori because
I was not able to find documentation on how to add the score in the API url request
- dbsource: the database source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
By default stringdb. When source = "stringdb"", the script will overlap omnipathdb data to provide information about directionality. When source = "omnipathdb" it will expand the network by retrieven all information for the query genes annotated in omnipath.
The script provides an output file with the pairs gene A - gene B (HGNC identifiers, and also the ncbi entrez identifiers). If available in omnipath, it will contain directionality (values = 1, there is direction. value=0 means no direction ). The column consensus_directionality reflects the fact that some evidence for a pair might be contradictory. It can be ignored by now. The column references contains the publications retrieved from omnipath.
param value
output_file output.txt
n 50
source stringdb
score 0
# Given an file (input.txt) with a set of genes (identified by their HGNC gene symbol), one per line
# returns an extended network with the n most close interacting genes, plus the original genes
# the mandatory arguments are two:
# input.txt the input file that should be in the same folder with the script
# config.txt the configuration file contains the values for other parameters, such as:
# outputFile the output file name. By default, output.txt
# n: the number of new proteins in the extended network. By default 50
# score: is the string score (between 0,1). By default 0. it filters a posteriori because
# I was not able to find documentation on how to add the score in the API url request
# dbsource: the source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
# By default, stringdb, and then overlaps omnipathdb data to provide information about directionality.
# When source = "omnipathdb" it will retrieve all information for the genes from omnipath
# the output file name. It will contain the pairs gene A, gene B with their corresponding HGNC identifiers,
# and also the ncbi entrez identifiers.
# if available from omnipath it will contain directionality (values = 1, there is direction. value=0 means no direction )
# consensus directionality reflects the fact that some evidence for a pair might be contradictory. It can be ignored by now
args = commandArgs(trailingOnly=TRUE)
infile <- "input.txt"
if(length(args) > 0) { infile <- args[1] }
if(!file.exists(infile)) { message("No correct input given!")}
input <- readLines(infile)
config <- "config.txt"
if(length(args) > 1) { config <- args[2] }
if(!file.exists(config)) { message("No correct config given!")}
config <- read.table(config, sep = "\t", stringsAsFactors = F, header = T)
outputFile <-config[config$param == "output_file", "value"]
n <-config[config$param == "n", "value"]
score <-config[config$param == "score", "value"]
dbsource <-config[config$param == "source", "value"]
if( !dbsource %in% c("stringdb", "omnipathdb")) {
stop("Invalid argument 'dbsource'. It should be omnipathdb or stringdb ")
}
if( ! ( score >= 0 & score <= 1 )) {
stop("Invalid argument 'score'. It should be between 0 and 1")
}
required.packages <- c("biomaRt", "RCurl", "jsonlite", "httr")
new.packages <- required.packages[!(required.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
library(biomaRt)
message(paste0("there are ", length(unique(input))), " genes in the file")
if (dbsource == "stringdb") {
message( "mapping the hgnc identifiers to ensemble proteins used by stringdb with biomart" )
ensembl <- useMart("ensembl")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
ensprotein_idest<- getBM(attributes=c('ensembl_peptide_id', 'hgnc_symbol'),
filters = 'hgnc_symbol',
values = input,
mart = ensembl)
ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
message(paste0("doing the query to stringdb and retrieving n ",n, " neighbors" ))
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
resp <- httr::GET(url = paste0("api.jensenlab.org/network?entities=", entities, "&additional=", n),
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
#, httr::verbose()
)
json <- jsonlite::fromJSON(httr::content(resp, as = "text"), flatten = FALSE)
protein_info <- json[[2]]
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('9606.', "", protein_info$protein )
protein_info_2ensp <- getBM(attributes=c('ensembl_peptide_id', "hgnc_symbol", 'entrezgene_id'),
filters = 'ensembl_peptide_id',
values = protein_info$protein,
mart = ensembl)
protein_info_2ensp <- subset(protein_info_2ensp, ensembl_peptide_id != "")
protein_info_2ensp$ensembl_peptide_id <- paste0("9606.", protein_info_2ensp$ensembl_peptide_id)
ppi_network <- json[[1]]
ppi_network$string_score <- ppi_network$scores$`stringdb::score`
message("the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- subset(ppi_network, string_score > score)
message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "source", by.y = "ensembl_peptide_id" )
colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "source_hgnc_symbol"
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "source_entrezgene_id"
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "target", by.y = "ensembl_peptide_id" )
colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "target_hgnc_symbol"
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "target_entrezgene_id"
ppi_network <- unique(ppi_network[c("source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol","target_entrezgene_id", "string_score")])
message("after converting to gene symbols, the network contains ", dim(ppi_network)[1], " interactions ")
message("Retrieving data from Omnipath")
url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=", paste(protein_info_2ensp$hgnc_symbol, collapse = ","), "&fields=sources,references")
dataTsv <- RCurl::getURLContent( url )
myTextConnection <- textConnection( dataTsv )
result <- read.csv( myTextConnection, header = TRUE, sep = "\t", colClasses=c("character"))
colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
filters = 'hgnc_symbol',
values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
mart = ensembl)
gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
ppi_network$pair <-apply(ppi_network,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
message(paste0("from ",length(unique(ppi_network$pair))," pairs of interacting proteins from stringdb"))
message(paste0("we found ", length(intersect(ppi_network$pair, result$pair) ), " interacting pairs in Omnipath"))
final_results <- unique(subset(result, pair %in% ppi_network$pair)[c( "source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references", "pair" )])
ppi_network <- subset(ppi_network, !pair %in% final_results$pair)
ppi_network <- within(ppi_network , rm(string_score, pair))
final_results <- merge( final_results[, -grep("pair", colnames(final_results))],
ppi_network, all = T )
final_results$is_directed <- ifelse(is.na(final_results$is_directed), 0, as.character(final_results$is_directed))
message(paste0(" And from those ",dim(subset(final_results, is_directed == 1) )[1], " interacting pairs have direction in Omnipath"))
if( nrow( final_results ) == 0 ){
warning("None of the pairs of interacting proteins havee pased the filters")
} else {
write.table(final_results, outputFile, sep = "\t", quote = F, row.names = F)
}
} else {
message("Retrieving data from Omnipath")
url <- paste0("http://omnipathdb.org/interactions/?genesymbols=1&partners=", paste(input, collapse = ","), "&fields=sources,references")
dataTsv <- RCurl::getURLContent( url )
myTextConnection <- textConnection( dataTsv )
result <- read.csv( myTextConnection, header = TRUE, sep = "\t", colClasses=c("character"))
colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
result$pair <-apply(result,1,function(x){paste(sort(x[c("source_hgnc_symbol","target_hgnc_symbol")]),collapse=":")})
message(paste0("there are ",length(unique(result$pair))," pairs of interacting proteins from omnipath"))
ensembl <- useMart("ensembl")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
filters = 'hgnc_symbol',
values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
mart = ensembl)
gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "source_entrezgene_id"
result <- merge(result, gene_info_2entrez, by.x = "target_hgnc_symbol", by.y = "hgnc_symbol" )
colnames(result)[which(names(result) == "entrezgene_id")] <- "target_entrezgene_id"
result <- unique(result[c( "source_hgnc_symbol", "source_entrezgene_id",
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references" )])
if( nrow( result ) == 0 ){
warning("None of the pairs of interacting proteins have pased the filters")
} else {
write.table(result, outputFile, sep = "\t", quote = F, row.names = F)
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment