Commit 96764ee7 authored by David Hoksza's avatar David Hoksza
Browse files

gene extension included and prepared for pipeline

parent 580ad3e7
......@@ -15,6 +15,8 @@ STOP_AFTER_STAGE=-1
ASSOCIATIONS_DIR=associations/
ASSOCIATIONS_DATA_DIR=$ASSOCIATIONS_DIR/data/
EXTEND_DIR=${ASSOCIATIONS_DIR}/extend_network/
EXTEND_CONFIG=${EXTEND_DIR}/config.txt
RES_DIR=results
ENRICHMENT_DIR=enrichment/
ENRICHMENT_CONFIG=${ENRICHMENT_DIR}/config.txt
......@@ -50,6 +52,12 @@ genes_out_path=${genes_variants_out_path/02-genes_variants/03-genes}
echo ${genes_line#*:} | tr ',' '\n' > ${genes_out_path}
echo "Genes stored in ${genes_out_path}"
#echo "Extending genes list"
#text_mining_out_path=${genes_out_path/03-genes/03-text-mining}
#echo Rscript --vanilla ${EXTEND_DIR}/get_extended_network.R ${genes_out_path} ${EXTEND_CONFIG}
#cp ${EXTEND_DIR}/output.txt ${text_mining_out_path}
#echo "Genes list extended"
minerva_genes_out_path=${RES_DIR}/04-minerva-genes-id_${ORPHANET_IDS_UNDERSCORE}.txt
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_genes.py -f ${genes_out_path} > ${minerva_genes_out_path}
......@@ -68,7 +76,7 @@ if [ ${STOP_AFTER_STAGE} = 1 ]; then
fi
# ------------------------------ 2. Obtain pathways ------------------------------
echo "Retrieving enriched pathways"
Rscript --vanilla enrichment/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}
Rscript --vanilla ${ENRICHMENT_DIR}/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps-id_${ORPHANET_IDS_UNDERSCORE}.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways-id_${ORPHANET_IDS_UNDERSCORE}.txt
mv enriched_disease_maps.txt ${enriched_maps_out_path}
......@@ -90,6 +98,7 @@ fi
echo "Map generator built"
echo "Assembling the map from pathways ..."
map_out_path=${RES_DIR}/06-minerva_map-id_${ORPHANET_IDS_UNDERSCORE}.xml
#java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --text-mining-file ${text_mining_out_path} --output-file ${map_out_path}
java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --output-file ${map_out_path}
echo "Pathways assembled into ${map_out_path}"
......
# **get_extended_network.R** Script for extending the original list of disease genes using PPI data
Should be run as `Rscript --vanilla get_extended_network.R <input_filename> <config_file>`
`input_filename` should contain a list of genes as HGNC symbols, one entry per line. Default is "input.txt" in the script folder.
`config_file` contains the parameters used to perform the expansion of the data. Default is "config.txt" in the script folder.
The script uses the "config.txt" file, which contains the following parameters:
- outputFile: the output file name containing the extended network and some edge attributes (see below).
- n: the number of new genes in the extended network. By default 50.
- score: the STRING score (between 0 and 1). By default 0. It is used to filter the results a posteriori, because
I was not able to find documentation on how to add the score to the API URL request.
- dbsource: the database source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
By default stringdb. When dbsource = "stringdb", the script will overlap omnipathdb data to provide information about directionality. When dbsource = "omnipathdb", it will expand the network by retrieving all information for the query genes annotated in omnipath.
The script provides an output file with the pairs gene A - gene B (HGNC identifiers, and also the NCBI Entrez identifiers). If available in omnipath, it will contain directionality (value = 1 means there is direction; value = 0 means no direction). The column consensus_directionality reflects the fact that some evidence for a pair might be contradictory. It can be ignored for now. The column references contains the publications retrieved from omnipath.
......@@ -45,7 +45,7 @@ new.packages <- required.packages[!(required.packages %in% installed.packages()[
if(length(new.packages)) install.packages(new.packages, repos='http://cran.us.r-project.org')
if ("biomaRt" %in% installed.packages()[,1] == FALSE){
BiocManager::install("biomaRt", update=TRUE, ask=FALSE)
BiocManager::install("biomaRt", update=TRUE, ask=FALSE)
}
library(biomaRt)
......@@ -70,10 +70,11 @@ if (dbsource == "stringdb") {
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
resp <- httr::GET(url = paste0("api.jensenlab.org/network?entities=", entities, "&additional=", n),
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
#, httr::verbose()
)
resp <- httr::POST(url = "api.jensenlab.org/network",
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
body = paste0("entities=", entities, "&additional=", n),
httr::verbose())
json <- jsonlite::fromJSON(httr::content(resp, as = "text"), flatten = FALSE)
protein_info <- json[[2]]
protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment