Commit 6edb2f58 authored by David Hoksza's avatar David Hoksza
Browse files

stop pipeline after unsuccesfull command

parent ba7ed61d
......@@ -28,19 +28,30 @@ mkdir $RES_DIR
# First we copy the parameters into the result directory
cp parameters.sh ${RES_DIR}
check_exit_code(){
if [ $? -ne 0 ]; then
tput setaf 1; echo "The last command failed. Can't continue..."
exit
fi
}
# Get associations from DisGeNET
disgenet_out_path=${RES_DIR}/01-disgenet-n_${DISGENET_CNT_THRESHOLD}-s_${DISGENET_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} -s ${DISGENET_ASSOCIATION_SCORE_THRESHOLD} > ${disgenet_out_path}
check_exit_code
echo "Disgenet gene and variant associations stored in ${disgenet_out_path}"
# Get associations from OpenTargets
opentargets_out_path=${RES_DIR}/01-opentargets-n_${DISGENET_CNT_THRESHOLD}-s_${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/opentargets.py -o ${ORPHANET_IDS} -n ${OPENTARGETS_CNT_THRESHOLD} -s ${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD} > ${opentargets_out_path}
check_exit_code
echo "Opentargets gene and variant associations stored in ${opentargets_out_path}"
# Merge with ClinVar
genes_variants_out_path=${RES_DIR}/02-genes_variants.log
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path,$opentargets_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
check_exit_code
echo "Integration with ClinVar stored in ${genes_variants_out_path}"
genes_line=`cat ${genes_variants_out_path} | grep "genes in total"`
......@@ -52,11 +63,13 @@ echo "Genes stored in ${genes_out_path}"
echo "Extending genes list"
text_mining_out_path=${genes_out_path/03-genes/03-text-mining}
echo Rscript --vanilla ${EXTEND_DIR}/get_extended_network.R ${genes_out_path} ${EXTEND_CONFIG}
check_exit_code
cp ${EXTEND_DIR}/output.txt ${text_mining_out_path}
echo "Genes list extended"
minerva_genes_out_path=${RES_DIR}/04-minerva-genes.txt
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_genes.py -f ${genes_out_path} > ${minerva_genes_out_path}
check_exit_code
var_line=`cat ${genes_variants_out_path} | grep "variants in total"`
variants_out_path=${genes_variants_out_path/02-genes_variants/03-variants}
......@@ -66,6 +79,7 @@ echo "Variants stored in ${variants_out_path}"
minerva_variants_out_path=${RES_DIR}/04-minerva-variants.txt
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_variants.py -f ${variants_out_path} > ${minerva_variants_out_path}
check_exit_code
if [ ${STOP_AFTER_STAGE} = 1 ]; then
echo "Exiting after stage ${STOP_AFTER_STAGE}"
......@@ -81,6 +95,8 @@ sed -E -i "s/(max_areas_per_map)(.*)/\1\t${ENRICH_MAX_AREAS_PER_MAP}/g" ${ENRICH
sed -E -i "s/(max_areas_per_pathway_db)(.*)/\1\t${MAX_AREAS_PER_PATHWAY_DB}/g" ${ENRICHMENT_CONFIG_TMP}
Rscript --vanilla ${ENRICHMENT_DIR}/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG_TMP}
check_exit_code
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways.txt
mv enriched_disease_maps.txt ${enriched_maps_out_path}
......@@ -107,6 +123,8 @@ echo "Assembling the map from pathways ..."
map_out_path=${RES_DIR}/06-minerva_map.xml
java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --text-mining-file ${text_mining_out_path} --output-file ${map_out_path}
#java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --output-file ${map_out_path}
check_exit_code
echo "Pathwaygs assembled into ${map_out_path}"
echo "Combining map with overlays"
......
......@@ -7,7 +7,7 @@
# n: the number of new proteins in the extended network. By default 50
# score: is the string score (between 0,1). By default 0. it filters a posteriori because
# I was not able to find documentation on how to add the score in the API url request
# dbsource: the source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
# dbsource: the source of the protein-protein interaction data. Currently, text_mining, stringdb or omnipathdb.
# By default, stringdb, and then overlaps omnipathdb data to provide information about directionality.
# When source = "omnipathdb" it will retrieve all information for the genes from omnipath
# the output file name. It will contain the pairs gene A, gene B with their corresponding HGNC identifiers,
......@@ -33,7 +33,7 @@ n <-config[config$param == "n", "value"]
score <-config[config$param == "score", "value"]
dbsource <-config[config$param == "source", "value"]
if( !dbsource %in% c("stringdb", "omnipathdb")) {
if( !dbsource %in% c("stringdb", "omnipathdb", "text_mining")) {
stop("Invalid argument 'dbsource'. It should be omnipathdb or stringdb ")
}
if( ! ( score >= 0 & score <= 1 )) {
......@@ -53,7 +53,7 @@ library(biomaRt)
message(paste0("there are ", length(unique(input))), " genes in the file")
if (dbsource == "stringdb") {
if (dbsource == "stringdb" | dbsource == "text_mining") {
message( "mapping the hgnc identifiers to ensemble proteins used by stringdb with biomart" )
ensembl <- useMart("ensembl")
ensembl <- useMart("ensembl",dataset="hsapiens_gene_ensembl")
......@@ -61,42 +61,81 @@ if (dbsource == "stringdb") {
filters = 'hgnc_symbol',
values = input,
mart = ensembl)
biomartCacheClear()
ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
message(paste0("doing the query to stringdb and retrieving n ",n, " neighbors" ))
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
if (dbsource == "stringdb"){
message(paste0("doing the query to stringdb and retrieving n ",n, " neighbors" ))
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
resp <- httr::POST(url = "api.jensenlab.org/network",
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
body = paste0("entities=", entities, "&additional=", n),
httr::verbose())
json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
protein_info <- json[[2]]
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('9606.', "", protein_info$protein )
protein_info <- protein_info$protein
ppi_network <- json[[1]]
ppi_network$source <- gsub('9606.', "", ppi_network$source )
ppi_network$target <- gsub('9606.', "", ppi_network$target )
ppi_network$score <- ppi_network$scores$`stringdb::score`
message("the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- ppi_network[ ppi_network$score > score, ]
ppi_network <- unique(ppi_network[c("source", "target", "score")])
message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
} else {
ppi_network <- data.frame(source=character(), target=character(), score=double())
for (entity in ensprotein_idest$ensembl_peptide_id ){
# print(paste0("http://7fccb2cc.ngrok.io/getCooccurrence/", entity, "?type=9606"))
# http://172.16.4.230:5000/
# resp <- httr::GET(url = paste0("http://7fccb2cc.ngrok.io/getCooccurrence/", entity, "?type=9606"),
# httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
# #, httr::verbose()
# )
resp <- httr::GET(url = paste0("http://172.16.4.230:5000/getCooccurrence/", entity, "?type=9606&limit=", n),
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
#, httr::verbose()
)
json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
if (length(json)!= 0) {
json$target <- json$info$entity
json <- unique(subset(json, grepl("ENSP", target) )[c("target", "score")])
json$source <- entity
ppi_network <- rbind(ppi_network, json)
}
}
message("the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- ppi_network[ppi_network$score > 0, ]
ppi_network <- unique(ppi_network[c("source", "target", "score")])
message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
if (dim(ppi_network)[1] > 0) {
protein_info <- unique( c(ppi_network$source, ppi_network$target) )
}
else{
stop("there is no info in text mining for the query genes")
}
}
resp <- httr::POST(url = "api.jensenlab.org/network",
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
body = paste0("entities=", entities, "&additional=", n),
httr::verbose())
json <- jsonlite::fromJSON(httr::content(resp, as = "text"), flatten = FALSE)
protein_info <- json[[2]]
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
protein_info$protein <- gsub('9606.', "", protein_info$protein )
protein_info_2ensp <- getBM(attributes=c('ensembl_peptide_id', "hgnc_symbol", 'entrezgene_id'),
filters = 'ensembl_peptide_id',
values = protein_info$protein,
values = protein_info,
mart = ensembl)
biomartCacheClear()
protein_info_2ensp <- subset(protein_info_2ensp, ensembl_peptide_id != "")
protein_info_2ensp$ensembl_peptide_id <- paste0("9606.", protein_info_2ensp$ensembl_peptide_id)
ppi_network <- json[[1]]
ppi_network$string_score <- ppi_network$scores$`stringdb::score`
message("the network contains ", dim(ppi_network)[1], " interactions ")
ppi_network <- subset(ppi_network, string_score > score)
message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
protein_info_2ensp <- subset(protein_info_2ensp, hgnc_symbol != "")
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "source", by.y = "ensembl_peptide_id" )
colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "source_hgnc_symbol"
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "source_entrezgene_id"
ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "target", by.y = "ensembl_peptide_id" )
......@@ -104,7 +143,7 @@ if (dbsource == "stringdb") {
colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "target_entrezgene_id"
ppi_network <- unique(ppi_network[c("source_hgnc_symbol","source_entrezgene_id",
"target_hgnc_symbol","target_entrezgene_id", "string_score")])
"target_hgnc_symbol","target_entrezgene_id", "score")])
message("after converting to gene symbols, the network contains ", dim(ppi_network)[1], " interactions ")
......@@ -118,12 +157,11 @@ if (dbsource == "stringdb") {
colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
gene_info_2entrez <- getBM(attributes=c('hgnc_symbol', 'entrezgene_id'),
filters = 'hgnc_symbol',
values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
mart = ensembl)
biomartCacheClear()
gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol" )
......@@ -141,7 +179,7 @@ if (dbsource == "stringdb") {
"target_hgnc_symbol", "target_entrezgene_id",
"is_directed", "consensus_direction", "references", "pair" )])
ppi_network <- subset(ppi_network, !pair %in% final_results$pair)
ppi_network <- within(ppi_network , rm(string_score, pair))
ppi_network <- within(ppi_network , rm(score, pair))
final_results <- merge( final_results[, -grep("pair", colnames(final_results))],
ppi_network, all = T )
......
......@@ -126,7 +126,7 @@ def get_dbsnp(ids: List[str]) -> List[Dict]:
"allele_frequency": allele_frequency,
"variant_identifier": variant_identifier,
"amino_acid_change": amino_acid_change,
"uniprot_acc": uniprot_acc
"identifier_uniprot": uniprot_acc
})
logging.info("Skipped {} variants out of {}".format(cnt - len(snps), cnt))
......@@ -139,7 +139,7 @@ def remove_snps_with_multiple_uniprot_ids(db_snps: List[Dict]) -> List[Dict]:
id = snp["variant_identifier"]
if id not in snp_cnt:
snp_cnt[id] = set()
snp_cnt[id].add(snp["uniprot_acc"])
snp_cnt[id].add(snp["identifier_uniprot"])
id_to_keep = [id for id in snp_cnt if len(snp_cnt[id]) == 1]
new_db_snps: List[Dict] = []
......@@ -158,10 +158,6 @@ def get_minerva_format(db_snps: List[Dict]) -> str:
if len(db_snps) == 0:
return out
#remove uniprot accesion numbers since these don't currently go to Minerva
for snp in db_snps:
del snp['uniprot_acc']
out += "#NAME=DISEASE_ASSOCIATED_VARIANTS\n"
out += "#TYPE=GENETIC_VARIANT\n"
out += "#GENOME_TYPE=UCSC\n"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment