Commit 0658937b authored by David Hoksza's avatar David Hoksza
Browse files

minerva and opentargets integrated

parent 8ae48a32
......@@ -4,8 +4,10 @@ Code and comments to [https://r3.pages.uni.lu/biocuration/resources/biohackathon
#### Requirements
- Java Runtime
- Python 3.x
- R
- zip utility
If the pipeline is run on a clean Linux installation, you might need to install the following libraries
(`sudo apt-get install` on Ubuntu) prior to running the code:
......
......@@ -3,6 +3,11 @@
# ------------------------- PARAMETERS TO SET -------------------------
ORPHANET_IDS="130"
DISGENET_CNT_THRESHOLD=50
DISGENET_ASSOCIATION_SCORE_THRESHOLD=0
OPENTARGETS_CNT_THRESHOLD=50
OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD=0
BUILD_MAP_GENERATOR=0 #1 for building it
STOP_AFTER_STAGE=1
# ------------------------- PARAMETERS TO SET -------------------------
ASSOCIATIONS_DIR=associations/
......@@ -10,6 +15,7 @@ ASSOCIATIONS_DATA_DIR=$ASSOCIATIONS_DIR/data/
RES_DIR=results
ENRICHMENT_DIR=enrichment/
ENRICHMENT_CONFIG=${ENRICHMENT_DIR}/config.txt
MAP_GENERATOR_DIR=map_generator/
PYTHON_BIN=python3
#ORPHANET_IDS="33,67046,79327,79321,86309"
......@@ -18,12 +24,16 @@ ORPHANET_IDS_UNDERSCORE=${ORPHANET_IDS//,/_}
# Create the results directory; -p keeps reruns from failing when it already exists.
mkdir -p "$RES_DIR"
# -------------- 1. Extract genes and variants associated with a disease --------------
disgenet_out_path=${RES_DIR}/01-disgenet-id_${ORPHANET_IDS_UNDERSCORE}-n_${DISGENET_CNT_THRESHOLD}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} > ${disgenet_out_path}
disgenet_out_path=${RES_DIR}/01-disgenet-id_${ORPHANET_IDS_UNDERSCORE}-n_${DISGENET_CNT_THRESHOLD}-s_${DISGENET_ASSOCIATION_SCORE_THRESHOLD}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} -s ${DISGENET_ASSOCIATION_SCORE_THRESHOLD} > ${disgenet_out_path}
echo "Disgenet gene and variant associations stored in ${disgenet_out_path}"
# Name the Open Targets output after the Open Targets thresholds that are passed to opentargets.py below.
opentargets_out_path=${RES_DIR}/01-opentargets-id_${ORPHANET_IDS_UNDERSCORE}-n_${OPENTARGETS_CNT_THRESHOLD}-s_${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/opentargets.py -o ${ORPHANET_IDS} -n ${OPENTARGETS_CNT_THRESHOLD} -s ${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD} > ${opentargets_out_path}
echo "Opentargets gene and variant associations stored in ${opentargets_out_path}"
genes_variants_out_path=${RES_DIR}/02-genes_variants-id_${ORPHANET_IDS_UNDERSCORE}.log
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path,$opentargets_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
echo "Integration with ClinVar stored in ${genes_variants_out_path}"
genes_line=`cat ${genes_variants_out_path} | grep "genes in total"`
......@@ -42,14 +52,54 @@ echo "Variants stored in ${variants_out_path}"
minerva_variants_out_path=${RES_DIR}/04-minerva-variants-id_${ORPHANET_IDS_UNDERSCORE}.txt
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_variants.py -f ${variants_out_path} > ${minerva_variants_out_path}
if [ ${STOP_AFTER_STAGE} = 1 ]; then
echo "Exiting after stage ${STOP_AFTER_STAGE}"
exit 0
fi
# ------------------------------ 2. Obtain pathways ------------------------------
echo "Retrieving enriched pathways"
Rscript --vanilla enrichment/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways.txt
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps-id_${ORPHANET_IDS_UNDERSCORE}.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways-id_${ORPHANET_IDS_UNDERSCORE}.txt
mv enriched_disease_maps.txt ${enriched_maps_out_path}
mv enriched_pathways.txt ${enriched_paths_out_path}
echo "Enriched pathways obtained"
if [ ${STOP_AFTER_STAGE} = 2 ]; then
echo "Exiting after stage ${STOP_AFTER_STAGE}"
exit 0
fi
# ------------------------------ 3. Assemble pathways and overlays into a map ------------------------------
# MINERVA
# Optionally (re)build the map generator jar with Maven before it is used.
if [ ${BUILD_MAP_GENERATOR} = 1 ]; then
    echo "Bulding the map generator ..."
    # Build in a subshell: the working directory is restored even if cd or mvn fails,
    # and it does not matter how deeply MAP_GENERATOR_DIR is nested.
    if (cd "${MAP_GENERATOR_DIR}" && mvn -DskipTests=true clean install -pl biohackathon -am); then
        echo "Map generator built"
    else
        # Report the failure but keep going, as before, so a previously built jar can still be used.
        echo "Map generator build failed" >&2
    fi
fi
echo "Assembling the map from pathways ..."
map_out_path=${RES_DIR}/06-minerva_map-id_${ORPHANET_IDS_UNDERSCORE}.xml
java -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --output-file ${map_out_path}
echo "Pathways assembled into ${map_out_path}"
echo "Combining map with overlays"
# Stage the map XML and the overlay files in a scratch directory, zip it, and clean up.
tmp_dir=${RES_DIR}/tmp/
tmp_dir_layouts=${tmp_dir}/layouts/
# Start from an empty scratch directory so leftovers of an interrupted run do not end up in the archive.
rm -rf "${tmp_dir}"
mkdir -p "${tmp_dir_layouts}"
cp "${map_out_path}" "${tmp_dir}"
cp "${minerva_genes_out_path}" "${tmp_dir_layouts}"
cp "${minerva_variants_out_path}" "${tmp_dir_layouts}"

map_zip_out_path=${map_out_path/.xml/.zip}
# -f: the archive does not exist on the first run, which is not an error.
rm -f "${map_zip_out_path}"
# Zip from inside the scratch directory so archive paths are relative to it; the subshell
# leaves the caller's working directory untouched and skips zip if cd fails.
(cd "${tmp_dir}" && zip -r tmp.zip .)
mv "${tmp_dir}/tmp.zip" "${map_zip_out_path}"
rm -rf "${tmp_dir}"
echo "Final map with overlays stored in ${map_zip_out_path}"
\ No newline at end of file
......@@ -19,14 +19,14 @@ def get_umls_ids(orphanet_ids:List[str]) -> List[str]:
return umls_ids
def get_genes(umls_ids:List[str]) -> Dict[str, Dict]:
def get_genes(umls_ids:List[str], score:float) -> Dict[str, Dict]:
genes = {}
logging.info("Connecting to disgenet gda endpoint")
res = requests.get(
"http://www.disgenet.org/api/gda/disease/{}".format(",".join(umls_ids)),
params={"min_score": 0}
params={"min_score": score}
)
logging.info("Retrieved data from disgenet gda endpoint")
if res.status_code == requests.codes.ok:
......@@ -62,17 +62,22 @@ if __name__ == '__main__':
parser.add_argument("-oids", "--orphanet_ids",
required=True,
help="Orphanet number of the disease")
help="Input Orphanet numbers")
parser.add_argument("-n", "--top-n",
required=False,
default=50,
type=int,
help="Retrieve the top N genes sorted by association score (0 for all)")
parser.add_argument("-s", "--threshold-score",
required=False,
default=0,
type=float,
help="Only consider genes with association score higher than given threshold")
args = parser.parse_args()
umls_ids = get_umls_ids(args.orphanet_ids.split(","))
genes = get_genes(umls_ids)
genes = get_genes(umls_ids, args.threshold_score)
get_variants(umls_ids, genes)
genes_sorted = {}
cnt = args.top_n
......
......@@ -5,29 +5,31 @@ from typing import List, Dict
import requests
def get_genes_and_evidence(orphanet_id:str, association_score: float) -> Dict:
def get_genes_and_evidence(orphanet_ids:List[str], association_score: float) -> Dict:
genes = {}
res = requests.get(
"https://api.opentargets.io/v3/platform/public/association/filter",
params={'disease': 'Orphanet_{}'.format(orphanet_id),
'scorevalue_min': association_score,
'size': '1000'}
)
if res.status_code == requests.codes.ok:
content = json.loads(res.text)
for data in content['data']:
genes[data['target']['gene_info']['symbol']] = {
'gene_name': data['target']['gene_info']['symbol'],
'evidence_id': data['id'],
'association_score': data['association_score']['overall'],
'association_score_evidence': data['association_score']['datatypes']
}
for oid in orphanet_ids:
res = requests.get(
"https://api.opentargets.io/v3/platform/public/association/filter",
params={'disease': 'Orphanet_{}'.format(oid),
'scorevalue_min': association_score,
'size': '1000'}
)
if res.status_code == requests.codes.ok:
content = json.loads(res.text)
for data in content['data']:
genes[data['target']['gene_info']['symbol']] = {
'gene_name': data['target']['gene_info']['symbol'],
'evidence_id': data['id'],
'association_score': data['association_score']['overall'],
'association_score_evidence': data['association_score']['datatypes'],
'variants': []
}
return genes
def get_variants(genes_evidence: List[Dict]):
def get_variants(genes_evidence: Dict):
for ge in genes_evidence:
s_ge = ge['evidence_id'].split('-')
target = s_ge[0]
......@@ -52,21 +54,34 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-o", "--orphanet",
parser.add_argument("-oids", "--orphanet_ids",
required=True,
help="Orphanet number of the disease")
parser.add_argument("-s", "--score",
help="Inpu Orphanet numbers")
parser.add_argument("-n", "--top-n",
required=False,
default=50,
type=int,
help="Retrieve the top N genes sorted by association score (0 for all)")
parser.add_argument("-s", "--threshold-score",
required=False,
default=0,
help="Threshold for association score")
type=float,
help="Only consider genes with association score higher than given threshold")
args = parser.parse_args()
genes_evidence = get_genes_and_evidence(args.orphanet, args.score)
get_variants(genes_evidence)
print(json.dumps(genes_evidence, indent=2))
genes_evidence = get_genes_and_evidence(args.orphanet_ids.split(","), args.threshold_score)
# get_variants(genes_evidence)
# Rank genes by association score (highest first) and keep the top N.
# A non-positive --top-n keeps every gene, matching the "0 for all" help text.
ranked = sorted(genes_evidence.items(), key=lambda x: x[1]['association_score'], reverse=True)
if args.top_n > 0:
    ranked = ranked[:args.top_n]
# dict() preserves the ranked insertion order, so the JSON output is sorted by score.
genes_sorted = dict(ranked)
# Emit the filtered selection (the unfiltered genes_evidence was printed before).
print(json.dumps({"name": "opentargets", "genes": genes_sorted}, indent=2))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment