Commit ab1381e0 authored by Marek Ostaszewski's avatar Marek Ostaszewski
Browse files

updated code for text mining/omnipath network

parents ff69d574 27022035
#!/usr/bin/env bash
# ------------------------- PARAMETERS TO SET -------------------------
ORPHANET_IDS="791"
#ORPHANET_IDS="130"
#ORPHANET_IDS="33,67046,79327,79321,86309"
DISGENET_CNT_THRESHOLD=50
DISGENET_ASSOCIATION_SCORE_THRESHOLD="0"
OPENTARGETS_CNT_THRESHOLD=50
OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD="0.5"
BUILD_MAP_GENERATOR=0 #0 for building it
STOP_AFTER_STAGE=-1
source parameters.sh
# ------------------------- PARAMETERS TO SET -------------------------
ASSOCIATIONS_DIR=associations/
......@@ -20,6 +11,7 @@ EXTEND_CONFIG=${EXTEND_DIR}/config.txt
RES_DIR=results
ENRICHMENT_DIR=enrichment/
ENRICHMENT_CONFIG=${ENRICHMENT_DIR}/config.txt
ENRICHMENT_CONFIG_TMP=${ENRICHMENT_CONFIG}.tmp
MAP_GENERATOR_DIR=map_generator/
PYTHON_BIN=python3
PIP_BIN=pip3
......@@ -33,57 +25,109 @@ mkdir $RES_DIR
RES_DIR=$RES_DIR/${ORPHANET_IDS_UNDERSCORE}
mkdir $RES_DIR
# Get associations from DisGeNET
disgenet_out_path=${RES_DIR}/01-disgenet-n_${DISGENET_CNT_THRESHOLD}-s_${DISGENET_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} -s ${DISGENET_ASSOCIATION_SCORE_THRESHOLD} > ${disgenet_out_path}
echo "Disgenet gene and variant associations stored in ${disgenet_out_path}"
# First we copy the parameters into the result directory
cp parameters.sh ${RES_DIR}
#######################################
# Abort the pipeline if the command executed immediately before this call
# failed. Must be called directly after the command to be checked, because
# it inspects $? on entry.
# Outputs: an error message (in red when the terminal supports it) to stderr
# Returns: returns normally when the previous command succeeded; otherwise
#          exits the script with that command's non-zero status
#######################################
check_exit_code() {
  # Capture $? first: any other command would overwrite it.
  local status=$?
  if [ "${status}" -ne 0 ]; then
    # Colouring is best-effort; tput may fail when TERM is unset.
    tput setaf 1 >&2 2>/dev/null || true
    echo "The last command failed. Can't continue..." >&2
    tput sgr0 >&2 2>/dev/null || true
    # A bare `exit` would return the status of the last command run here
    # (0), making a failed pipeline look successful to the caller.
    exit "${status}"
  fi
}
#######################################
# Print a message prefixed with the current wall-clock time (HH:MM:SS).
# Arguments: $1 - message text; printed verbatim (no word splitting or
#                 filename expansion is applied to it)
# Outputs:   "<time>: <message>" on stdout
#######################################
log() {
  printf '%s: %s\n' "$(date +"%T")" "${1-}"
}
out_paths=""
if [ ${USE_DISGENET} = 1 ]; then
log "Querying DisGeNET..."
disgenet_out_path=${RES_DIR}/01-disgenet-n_${DISGENET_CNT_THRESHOLD}-s_${DISGENET_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} -s ${DISGENET_ASSOCIATION_SCORE_THRESHOLD} > ${disgenet_out_path}
check_exit_code
log "DisGeNET gene and variant associations stored in ${disgenet_out_path}"
out_paths="${out_paths},${disgenet_out_path}"
fi
if [ ${USE_OPENTARGETS} = 1 ]; then
# Get associations from OpenTargets
log "Querying OpenTargets..."
opentargets_out_path=${RES_DIR}/01-opentargets-n_${DISGENET_CNT_THRESHOLD}-s_${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/opentargets.py -o ${ORPHANET_IDS} -n ${OPENTARGETS_CNT_THRESHOLD} -s ${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD} > ${opentargets_out_path}
check_exit_code
log "Opentargets gene and variant associations stored in ${opentargets_out_path}"
out_paths="${out_paths},${opentargets_out_path}"
fi
# Get associations from OpenTargets
opentargets_out_path=${RES_DIR}/01-opentargets-n_${DISGENET_CNT_THRESHOLD}-s_${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD_STR}.json
$PYTHON_BIN $ASSOCIATIONS_DIR/opentargets.py -o ${ORPHANET_IDS} -n ${OPENTARGETS_CNT_THRESHOLD} -s ${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD} > ${opentargets_out_path}
echo "Opentargets gene and variant associations stored in ${opentargets_out_path}"
# Merge with ClinVar
log "Merging with ClinVar..."
genes_variants_out_path=${RES_DIR}/02-genes_variants.log
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path,$opentargets_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
echo "Integration with ClinVar stored in ${genes_variants_out_path}"
out_paths=${out_paths#,}
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $out_paths -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
check_exit_code
log "Integration with ClinVar stored in ${genes_variants_out_path}"
genes_line=`cat ${genes_variants_out_path} | grep "genes in total"`
genes_out_path=${genes_variants_out_path/02-genes_variants/03-genes}
#echo ${genes_line#*:} | sed 's/\,/\n/g' > ${genes_out_path}
echo ${genes_line#*:} | tr ',' '\n' > ${genes_out_path}
echo "Genes stored in ${genes_out_path}"
log "Genes stored in ${genes_out_path}"
echo "Extending genes list"
log "Extending genes list..."
text_mining_out_path=${genes_out_path/03-genes/03-text-mining}
# Actually run the network-extension script (it was only being echoed, so the
# exit-code check and the copy of output.txt that follow had nothing to act on).
Rscript --vanilla "${EXTEND_DIR}/get_extended_network.R" "${genes_out_path}" "${EXTEND_CONFIG}"
check_exit_code
cp ${EXTEND_DIR}/output.txt ${text_mining_out_path}
echo "Genes list extended"
log "Genes list extended"
minerva_genes_out_path=${RES_DIR}/04-minerva-genes.txt
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_genes.py -f ${genes_out_path} > ${minerva_genes_out_path}
check_exit_code
log "Genes stored in ${minerva_genes_out_path}"
var_line=`cat ${genes_variants_out_path} | grep "variants in total"`
variants_out_path=${genes_variants_out_path/02-genes_variants/03-variants}
#echo ${var_line#*:} | sed 's/\,/\n/g' > ${variants_out_path}
echo ${var_line#*:} | tr ',' '\n' > ${variants_out_path}
echo "Variants stored in ${variants_out_path}"
log "Variants stored in ${variants_out_path}"
log "Getting detailed variants information..."
minerva_variants_out_path=${RES_DIR}/04-minerva-variants.txt
$PYTHON_BIN "${ASSOCIATIONS_DIR}/minerva_variants.py" -f "${variants_out_path}" > "${minerva_variants_out_path}"
# check_exit_code reads $? of the command run just before it, so it has to
# come directly after the Python call; placed after log it would only ever
# see log's (successful) status.
check_exit_code
log "Detailed variants information obtained"
if [ ${STOP_AFTER_STAGE} = 1 ]; then
echo "Exiting after stage ${STOP_AFTER_STAGE}"
exit 0
fi
# ------------------------------ 2. Obtain pathways ------------------------------
echo "Retrieving enriched pathways"
Rscript --vanilla ${ENRICHMENT_DIR}/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}
log "Retrieving enriched pathways..."
tr '\r' '\n' < ${ENRICHMENT_CONFIG} > ${ENRICHMENT_CONFIG_TMP}
# Update the parameters in the temporary config file
sed -E -i "s/(max_areas_per_map)(.*)/\1\t${ENRICH_MAX_AREAS_PER_MAP}/g" ${ENRICHMENT_CONFIG_TMP}
sed -E -i "s/(max_areas_per_pathway_db)(.*)/\1\t${MAX_AREAS_PER_PATHWAY_DB}/g" ${ENRICHMENT_CONFIG_TMP}
Rscript --vanilla ${ENRICHMENT_DIR}/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG_TMP}
check_exit_code
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways.txt
mv enriched_disease_maps.txt ${enriched_maps_out_path}
mv enriched_pathways.txt ${enriched_paths_out_path}
echo "Enriched pathways obtained"
rm ${ENRICHMENT_CONFIG_TMP}
log "Enriched pathways retrieved"
if [ ${STOP_AFTER_STAGE} = 2 ]; then
......@@ -97,26 +141,33 @@ if [ ${BUILD_MAP_GENERATOR} = 1 ]; then
mvn -DskipTests=true clean install -pl biohackathon -am
cd ..
fi
echo "Map generator built"
echo "Assembling the map from pathways ..."
log "Map generator built"
log "Assembling the map from pathways ..."
map_out_path=${RES_DIR}/06-minerva_map.xml
java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --text-mining-file ${text_mining_out_path} --output-file ${map_out_path}
#java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --output-file ${map_out_path}
echo "Pathwaygs assembled into ${map_out_path}"
check_exit_code
log "Pathways assembled into ${map_out_path}"
log "Trimming long strings..."
map_out_path_trimmed=${map_out_path/.xml/_trim.xml}
$PYTHON_BIN "${MAP_GENERATOR_DIR}/utils/trim_long_strings.py" -i "${map_out_path}" > "${map_out_path_trimmed}"
# Stop here if trimming failed, as the other pipeline steps do; otherwise an
# empty or partial trimmed map would be packaged below.
check_exit_code
log "Long string trimmed"
echo "Combining map with overlays"
log "Combining the map with overlays"
tmp_dir=${RES_DIR}/tmp/
tmp_dir_layouts=${tmp_dir}/layouts/
mkdir ${tmp_dir}
cp ${map_out_path} ${tmp_dir}
cp ${map_out_path_trimmed} ${tmp_dir}
mkdir ${tmp_dir_layouts}
cp ${minerva_genes_out_path} ${tmp_dir_layouts}
cp ${minerva_variants_out_path} ${tmp_dir_layouts}
map_zip_out_path=${map_out_path/.xml/.zip}
map_zip_out_path=${map_out_path_trimmed/.xml/.zip}
rm ${map_zip_out_path}
cd ${tmp_dir}
zip -r tmp.zip .
......@@ -124,4 +175,4 @@ cd -
mv ${tmp_dir}/tmp.zip ${map_zip_out_path}
rm -rf ${tmp_dir}
echo "Final map with overlays stored in ${map_zip_out_path}"
\ No newline at end of file
log "Final map with overlays stored in ${map_zip_out_path}"
\ No newline at end of file
......@@ -2,12 +2,16 @@
The purpose of the tools in this directory is to obtain list of genes and
variants in those genes which are associated with given diseases. Currently,
the pipeline uses ClinVar, DisGeNET, OpenTargets to obtain the gene-disease association and
variants and Ensembl which assesses allele frequencies of the identified variants to
the pipeline to retrieve the genes and variants works as follows:
1. Use ClinVar, DisGeNET, OpenTargets to obtain the gene-disease association and
variants.
2. Extend the genes list by looking into additional resources such as OmniPath.
3. Use Ensembl to assess allele frequencies of the identified variants to
filter out possibly non-rare variants.
## Gene-disease associations and variants
## 1. Gene-disease associations and variants
The DisGeNET and OpenTargets tools connect to the resource of interest and return a JSON file
with the following format (this is the minimum specification, the output of the tools
......@@ -146,7 +150,26 @@ genese was limited by setting the association threshold to 0.3.
1246 variants in total: rs1559407312, rs774490908, rs41313667, rs199473220, rs1167279918, rs41313015, rs199473158, rs756765026, rs755579963, rs1553702937, rs397514251, rs199473248, rs1060501140, rs1553705586, rs886054341, rs886049240, rs886049207, rs886048653, rs754130748, rs1559734780, rs372907454, rs140490085, rs199473586, rs1559419984, rs199473100, rs201342036, rs786205268, rs546574502, rs886058444, rs794728865, rs172149856, rs879255356, rs886058437, rs45491996, rs745834030, rs781283574, rs112170830, rs730880049, rs199694744, rs878855295, rs949932623, rs199473565, rs755167125, rs869025517, rs777689378, rs199473070, rs886058451, rs1366120635, rs45627438, rs201706560, rs199473219, rs886049245, rs1553692699, rs75996884, rs746360906, rs757146653, rs374341474, rs142778041, rs886046891, rs3918389, rs370630496, rs1060501717, rs545511851, rs1278221673, rs199473124, rs199473628, rs886049246, rs777110449, rs368719191, rs199473079, rs886047888, rs564407056, rs137854614, rs199473558, rs28937318, rs764965507, rs768830617, rs199473086, rs886058465, rs763891399, rs1481582794, rs886047891, rs762600386, rs373932682, rs186942072, rs199473320, rs794728910, rs142276689, rs72552139, rs199473181, rs886058455, rs775538425, rs1038605800, rs199473160, rs201756421, rs137854608, rs186019566, rs863224532, rs1060501716, rs199473254, rs539215014, rs786204216, rs41315539, rs727504801, rs199473309, rs794728906, rs758082514, rs758959053, rs199473133, rs886039214, rs376222853, rs199473304, rs199473567, rs72552191, rs757966245, rs886058333, rs139934383, rs747044382, rs141810266, rs199473316, rs140825889, rs199473287, rs199473639, rs199473233, rs1553614805, rs1559725687, rs199473214, rs1060501135, rs199473143, rs137854612, rs1334461772, rs886047900, rs777258179, rs531920801, rs760016062, rs547615657, rs199473272, rs150280879, rs1060501134, rs202192818, rs201641342, rs776705132, rs752623537, rs112185839, rs1553623225, rs886046899, rs749753216, rs10428132, rs886054340, rs149127157, rs886049150, 
rs761362832, rs137854607, rs774870551, rs775485359, rs144520035, rs199473195, rs886049148, rs886047885, rs1559727734, rs886046908, rs886049208, rs764717543, rs199473185, rs1064795457, rs199473187, rs1163800062, rs150528041, rs1060501132, rs549819195, rs138774330, rs199473050, rs1064792926, rs1553692410, rs886058454, rs761505485, rs369058100, rs199473164, rs199473636, rs1559414230, rs367906630, rs886047899, rs60016728, rs1060501146, rs199473335, rs555303241, rs368678204, rs140288103, rs199473168, rs886058456, rs370880796, rs561922849, rs199473240, rs1060501133, rs199473211, rs397517951, rs185638763, rs45489199, rs148464224, rs886047895, rs886049224, rs199473280, rs199473156, rs368058564, rs199473178, rs199473054, rs111606207, rs143744796, rs45458203, rs199473066, rs562896922, rs878855292, rs114709124, rs886058443, rs187700411, rs199473144, rs137854615, rs117409744, rs1131691708, rs72552193, rs199473117, rs199473267, rs768691853, rs775331521, rs1473144775, rs756133876, rs72552294, rs199473310, rs569357069, rs886058466, rs199473607, rs794728916, rs748312802, rs886058457, rs113764055, rs528559406, rs199473250, rs199473288, rs12720441, rs199473208, rs759235726, rs151303346, rs1564417691, rs113583236, rs187370816, rs758157003, rs536201762, rs757119370, rs781529391, rs199473246, rs199473044, rs199473243, rs199473638, rs371308670, rs199473058, rs199473318, rs199473574, rs775234338, rs201682618, rs199473102, rs566273812, rs199473251, rs192508010, rs886047892, rs1559371879, rs773767571, rs199473598, rs1553607561, rs55980498, rs199473210, rs751005040, rs199473170, rs757313687, rs878855294, rs202152674, rs199473269, rs886049211, rs137854606, rs1060501128, rs794728931, rs1554333986, rs79299226, rs1559424975, rs543872505, rs878855288, rs558257162, rs551399685, rs886049236, rs199473641, rs1553699747, rs111341342, rs750396182, rs757638347, rs1559721808, rs199473147, rs199473252, rs1421700935, rs535500151, rs1554843746, rs1060501139, rs139638446, rs1276970820, rs121918282, 
rs72546642, rs199473605, rs199473571, rs886058337, rs762818132, rs150264233, rs1060503190, rs72549411, rs754387957, rs199473218, rs1553693632, rs886058334, rs199473556, rs772186966, rs886046901, rs137886839, rs199473207, rs199473547, rs199473629, rs543489443, rs77584352, rs199473063, rs886047887, rs1060501141, rs886047907, rs560476223, rs199473196, rs1060501137, rs886049222, rs886049221, rs45522138, rs199473620, rs759080305, rs1554560384, rs41315493, rs72552292, rs137854611, rs1060501126, rs199473125, rs727504495, rs199473153, rs199473330, rs61733968, rs199473067, rs139608070, rs199473230, rs886047901, rs727503411, rs41313703, rs886049225, rs199473570, rs199473110, rs72554071, rs199473194, rs184556043, rs1016820088, rs886058448, rs199473091, rs149793143, rs778522112, rs886048654, rs199473126, rs886058447, rs72558029, rs886049228, rs780734976, rs148405740, rs1559379802, rs1329499714, rs886049237, rs886049234, rs748926089, rs199473090, rs770390440, rs371313714, rs886049243, rs80017720, rs781354273, rs187815725, rs1559774986, rs774593360, rs373410109, rs747296872, rs369678002, rs199473083, rs199473584, rs755980331, rs886058439, rs199473129, rs1376727184, rs1565662161, rs575883763, rs199473392, rs369704754, rs187400761, rs1554398187, rs878855004, rs755162776, rs139861061, rs199473113, rs199503439, rs758175811, rs369673473, rs199473123, rs199473062, rs137854616, rs747192078, rs199473084, rs746690639, rs1553616744, rs1060499900, rs199473139, rs886058335, rs199473247, rs755194086, rs1060501127, rs369438564, rs730880120, rs150439110, rs199473624, rs141238313, rs568986179, rs199473311, rs886049239, rs886058458, rs555082306, rs757614405, rs145733679, rs199473245, rs886049206, rs1553704898, rs1314879329, rs148067743, rs1554345090, rs886047886, rs199473242, rs72650031, rs776980213, rs116281194, rs756547221, rs111250176, rs199473217, rs886049216, rs72552027, rs199473575, rs199473055, rs778220127, rs762283891, rs72648942, rs1060501149, rs199473234, rs886049244, rs41261344, 
rs886047894, rs886049226, rs1559757317, rs794728911, rs199473179, rs121912776, rs199473626, rs374314562, rs1553660781, rs199473621, rs61737825, rs1559721331, rs45475899, rs373156650, rs199473554, rs199473341, rs199473313, rs149504103, rs765907469, rs370114378, rs78378025, rs886047889, rs1372622252, rs187531872, rs770088052, rs779649600, rs199473602, rs1559757280, rs56394008, rs566251672, rs12828687, rs886058338, rs199473329, rs72553907, rs144509796, rs199473098, rs199473333, rs201258230, rs886047902, rs794728842, rs199473089, rs72552105, rs137854617, rs1559565644, rs748297358, rs149930872, rs886046886, rs1553619005, rs199473244, rs41313033, rs199473047, rs886049218, rs199473103, rs772585696, rs886046898, rs199473258, rs199473151, rs150923753, rs1060501131, rs587781159, rs747251132, rs143326262, rs775608046, rs199473097, rs76257004, rs368357262, rs1199205585, rs199473289, rs587781157, rs143486835, rs149253719, rs199473176, rs1373914068, rs886047884, rs863224816, rs117808169, rs367807565, rs41310769, rs886058460, rs886049212, rs36210420, rs748854017, rs140250795, rs869025522, rs528760350, rs201492706, rs886049217, rs886049227, rs759924541, rs199473145, rs794728879, rs199473559, rs561475141, rs1553702946, rs45546039, rs147205617, rs199473074, rs764843384, rs1559725364, rs886047904, rs201994425, rs763550164, rs142235256, rs1559409413, rs745623708, rs577244366, rs373602669, rs199473599, rs199473071, rs199473592, rs146965005, rs1060501148, rs764993726, rs794728925, rs148361833, rs199473606, rs886049241, rs371834340, rs566007803, rs886048647, rs199473293, rs1555172510, rs199473188, rs199473572, rs886047890, rs886049202, rs886058438, rs745337273, rs886049230, rs886048646, rs886046902, rs202114798, rs199473241, rs199473096, rs750893840, rs137986136, rs886049249, rs200034939, rs201276017, rs202174472, rs730880207, rs794728899, rs200645452, rs397514450, rs1060501142, rs201907325, rs538672341, rs148244106, rs794728898, rs185492581, rs886046887, rs553784643, rs1218449046, 
rs145694222, rs199473625, rs192493052, rs397517953, rs529326016, rs768270021, rs367895193, rs757916036, rs199473122, rs886046892, rs28937316, rs1060501138, rs199473552, rs147073518, rs199538058, rs773016454, rs397517956, rs763622598, rs375940680, rs773930851, rs753149586, rs368048551, rs756253735, rs561365937, rs546242304, rs137854604, rs370588133, rs41313687, rs373289770, rs45437099, rs758282196, rs199473582, rs12720452, rs794728924, rs199473577, rs865884072, rs199473623, rs1455337011, rs763740777, rs41313693, rs370438420, rs41312419, rs199473204, rs199473635, rs1265687114, rs752000790, rs763830252, rs766143752, rs199473315, rs886058462, rs1417036453, rs1553659648, rs199473275, rs199473045, rs141368548, rs199473270, rs1553695847, rs771243374, rs533634115, rs45553235, rs199473573, rs199473082, rs41310753, rs1559415879, rs199473146, rs779953279, rs886058446, rs199473339, rs757532106, rs199715855, rs527480102, rs886049204, rs41276525, rs762981322, rs551293049, rs182634399, rs794728934, rs202108848, rs199473157, rs149641866, rs772484960, rs763880032, rs144182966, rs776449566, rs199473590, rs540669024, rs199473562, rs199473610, rs11708996, rs886058464, rs752736296, rs199473159, rs199473088, rs886048648, rs199473578, rs1559778838, rs751751942, rs886049149, rs199473614, rs878855289, rs757302500, rs886058449, rs1373296470, rs137854601, rs199473564, rs199473174, rs886046885, rs781395450, rs139494055, rs1553695282, rs114517792, rs1554349582, rs567401056, rs1553704015, rs753892795, rs199473228, rs755100226, rs45459402, rs41311135, rs199473282, rs72546651, rs199473180, rs199473322, rs199473053, rs375306544, rs747839312, rs776925980, rs72552164, rs150406522, rs886048650, rs752637408, rs116885934, rs146882581, rs199473111, rs199473336, rs199473225, rs886058332, rs886058331, rs199473305, rs41311127, rs569249327, rs199473056, rs74315445, rs199473163, rs786204839, rs547804393, rs794728852, rs199473603, rs886054339, rs72549410, rs10494366, rs1322825102, rs771243543, rs199473048, 
rs730880211, rs869025369, rs1805124, rs1553626259, rs766017851, rs1553607617, rs765116077, rs199473609, rs886038918, rs570472222, rs200231105, rs199473118, rs753258598, rs199473141, rs886049203, rs199473069, rs199473550, rs199473637, rs142203439, rs148598985, rs749769938, rs1060503151, rs199473104, rs747486328, rs536419495, rs886047903, rs199473613, rs1559720961, rs199473237, rs730880208, rs1559579249, rs762012668, rs1553705529, rs794728875, rs886046907, rs794728843, rs759036311, rs41258394, rs864622270, rs114349261, rs886058440, rs199473660, rs200868190, rs760011764, rs560874115, rs776171341, rs199473236, rs138423466, rs199473073, rs774893568, rs1559720176, rs199473566, rs767190987, rs199473303, rs1559414131, rs1559720870, rs41311087, rs45609733, rs794728855, rs376965389, rs199473295, rs111442547, rs587777023, rs751166285, rs761369505, rs772258197, rs748441157, rs186741807, rs137854619, rs199473137, rs555721726, rs1060501715, rs148663098, rs142884499, rs149548827, rs368327166, rs886049205, rs199473186, rs886049229, rs199473167, rs864622440, rs45465995, rs199473205, rs200714519, rs771018427, rs1469078045, rs199473297, rs375614054, rs41315541, rs34551674, rs727504822, rs199473238, rs773095610, rs199473081, rs72552194, rs145032037, rs137854609, rs190416544, rs779999584, rs1060501114, rs1060501136, rs199473121, rs886046883, rs138832868, rs569261408, rs186708407, rs747321219, rs1471640168, rs199473292, rs762798134, rs878911306, rs371337228, rs886049232, rs72553909, rs201106879, rs772985306, rs199473092, rs752824646, rs377492327, rs199473112, rs769473892, rs201955990, rs41315489, rs869025444, rs794728846, rs199473568, rs779194497, rs199473266, rs148537653, rs794728935, rs1377226524, rs199473331, rs199473199, rs368512946, rs794728940, rs199473298, rs886049223, rs193922726, rs146906267, rs142516364, rs199473166, rs542554745, rs886047897, rs794728849, rs377165829, rs886058459, rs199473640, rs1553694258, rs199473182, rs199473072, rs200569112, rs564275776, rs1484635042, 
rs759136534, rs373118001, rs199473192, rs183988524, rs546849670, rs775466397, rs886046896, rs538707712, rs761117662, rs199473589, rs751083292, rs886047906, rs1060501145, rs199473052, rs886058463, rs1226072923, rs72552158, rs397515985, rs41311121, rs773422233, rs199473328, rs1559720461, rs1420621771, rs199473634, rs867302682, rs886046897, rs1554327285, rs9858585, rs768246863, rs1559738468, rs199473061, rs199473198, rs750073618, rs192379242, rs886058441, rs775896337, rs764323907, rs747174454, rs1064795784, rs370694515, rs199473263, rs886049233, rs886049248, rs1011460663, rs199473555, rs199473119, rs199473042, rs759258778, rs199473065, rs199473172, rs199473171, rs778295363, rs199473206, rs755306391, rs776522592, rs199473294, rs45502793, rs371909817, rs192741898, rs483353016, rs370346797, rs186015395, rs375323548, rs149155352, rs1060501147, rs778236407, rs886049210, rs1553612991, rs199473189, rs1553699663, rs886046893, rs794728869, rs875989812, rs569552176, rs886049247, rs1553700699, rs1490175548, rs72552185, rs901233405, rs375818733, rs886049250, rs553042856, rs869025363, rs199473265, rs35310524, rs746291609, rs749697698, rs199473175, rs199473302, rs886049231, rs199473284, rs1553696719, rs199473323, rs1216770303, rs199473561, rs199931920, rs546669133, rs751050999, rs199473601, rs199473281, rs878855029, rs748097809, rs886048645, rs199473600, rs199473224, rs371803816, rs199535863, rs199473093, rs191106518, rs1553693063, rs1553692734, rs1397301108, rs1057518456, rs137854602, rs886058442, rs918933961, rs375008004, rs756474485, rs199473608, rs727505158, rs1477267428, rs780405533, rs199473235, rs1060501129, rs760585484, rs45620037, rs199473177, rs1430691171, rs199473221, rs886058453, rs199473560, rs76956014, rs199473612, rs759584454, rs779009338, rs182050752, rs199473085, rs199473296, rs765294617, rs142476147, rs199473232, rs45503498, rs369565476, rs869025520, rs199473260, rs886047898, rs886049219, rs786205271, rs886049209, rs879035421, rs745501384, rs182826539, 
rs878855296, rs199473116, rs199473149, rs1318798411, rs886049238, rs72552291, rs794728914, rs147844607, rs886047905, rs1553704925, rs372368062, rs199473161, rs375926577, rs781103369, rs199473587, rs1553626242, rs199473325, rs376697724, rs199473286, rs137854600, rs758461435, rs756159737, rs1450059546, rs370145265, rs1334180362, rs200588235, rs1553625946, rs199473595, rs121912775, rs199473630, rs1559771615, rs753335762, rs886058461, rs72558074, rs199473229, rs41311117, rs779316495, rs199473128, rs199473391, rs764493888, rs544984224, rs780761880, rs767116750, rs878855287, rs72552195, rs369267978, rs561547165, rs910678839, rs199473154, rs199473597, rs760837591, rs201843136, rs121908441, rs758101066, rs774432823, rs199473593, rs1438675647, rs886048652, rs199473278, rs199473332, rs191840835, rs780901801, rs886046884, rs199473231, rs199473588, rs199473095, rs886048651, rs199473173, rs537616228, rs137854610, rs137854618, rs147795595, rs753304260, rs547303769, rs146848219, rs374447261, rs199473197, rs199473617, rs182163363, rs568890834, rs144925784, rs45514691, rs1464663200, rs199473314, rs1060501130, rs758274878, rs760888275, rs199473273, rs145712124, rs199473299, rs191009474, rs1559370502, rs137854603, rs794728912, rs1553692406, rs886049251, rs199473239, rs886046894, rs886058336, rs112122950, rs779669888, rs794728927, rs1559725648, rs886058450, rs886046906, rs41310765, rs45471994, rs138404783, rs886049213, rs199473209, rs1553699796, rs886058452, rs764488677, rs774165526, rs886037904, rs199473274, rs199473340, rs762376206, rs540984871, rs182905404, rs886049214, rs746146243, rs751252167, rs756364065, rs886058445, rs747848559, rs794728853, rs374857905, rs563090568, rs794728883, rs1173154320, rs781104838, rs1184094156, rs191005723, rs1060501143, rs1553695248, rs199473094, rs376987352, rs747643709, rs199473101, rs863224533, rs1553613078, rs753835985, rs769349991, rs115291555, rs199473261, rs72553906, rs200713724, rs199473051, rs199473076, rs199473557, rs1553692660, 
rs370826688, rs886049235, rs199473226, rs558085329, rs202040659, rs777061524, rs151199943, rs367928065, rs12720064, rs199473169, rs75960619, rs199473134, rs863225273, rs764252430, rs199473249, rs199473611, rs199473271, rs886047893, rs759842238, rs201799095, rs867623173, rs1060501144, rs199473213, rs45563942, rs72646873, rs200654601
```
## Allele frequency
## 2. Enrich gene list
The list of genes associated with the given disease is then enriched by the script
stored in the `extend_network` directory.
#### **enrich_maps.R** Script for map and pathway enrichment
Should be run as `Rscript --vanilla enrich_maps.R <input_filename> <config_file>`
- `input_filename` should contain list of genes as HGNC symbols, one entry per line. Default is "input.txt" in the script folder.
- `config_file` contains the list of map instances and pathways to enrich. Default is "config.txt" in the script folder.
The script uses "config.txt" file, containing the list of minerva instances and
pathways to query for enriched areas.
It provides output with the list of enriched map areas and pathways.
Statistical tests used for maps and pathways are different.
## 3. Filter variants by allele frequency
#### Ensembl
The script [vepAllInOne.py](bh19-rare-diseases/associations/vepmining) retrieves information about [allele frequencies](https://en.wikipedia.org/wiki/Allele\_frequency) in several populations available in the [Ensembl](http://www.ensembl.org/index.html) database. This is done through [this endpoint](https://rest.ensembl.org/documentation/info/vep\_id\_post) of the Ensembl API.
......
......@@ -58,11 +58,6 @@ retrieve_stringdb_net <- function(finput, fn, fscore) {
entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
entities <- paste(entities, collapse = "%0A" )
resp <- httr::POST(url = "api.jensenlab.org/network",
httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
body = paste0("entities=", entities, "&additional=", n),
httr::verbose())
json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
### May be important for later, when node-level info can be used for filtering
......@@ -122,7 +117,6 @@ if (dbsource == "stringdb" | dbsource == "text_mining") {
mart = ensembl)
ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
if (dbsource == "stringdb"){
ppi_network <- retrieve_stringdb_net(ensprotein_idest$ensembl_peptide_id, n, score)
} else {
......@@ -165,7 +159,7 @@ if(length(ppi_network) == 0) {
message("after OmniPathDB filtering, the network contains ", dim(ppi_network)[1], " interactions ")
}
if(nrow(result) == 0) {
if(nrow(ppi_network) == 0) {
warning("None of the pairs of interacting proteins have pased the filters")
} else {
### Add entrez at the end, if any results remain
......
......@@ -126,7 +126,7 @@ def get_dbsnp(ids: List[str]) -> List[Dict]:
"allele_frequency": allele_frequency,
"variant_identifier": variant_identifier,
"amino_acid_change": amino_acid_change,
"uniprot_acc": uniprot_acc
"identifier_uniprot": uniprot_acc
})
logging.info("Skipped {} variants out of {}".format(cnt - len(snps), cnt))
......@@ -139,7 +139,7 @@ def remove_snps_with_multiple_uniprot_ids(db_snps: List[Dict]) -> List[Dict]:
id = snp["variant_identifier"]
if id not in snp_cnt:
snp_cnt[id] = set()
snp_cnt[id].add(snp["uniprot_acc"])
snp_cnt[id].add(snp["identifier_uniprot"])
id_to_keep = [id for id in snp_cnt if len(snp_cnt[id]) == 1]
new_db_snps: List[Dict] = []
......@@ -158,10 +158,6 @@ def get_minerva_format(db_snps: List[Dict]) -> str:
if len(db_snps) == 0:
return out
#remove uniprot accesion numbers since these don't currently go to Minerva
for snp in db_snps:
del snp['uniprot_acc']
out += "#NAME=DISEASE_ASSOCIATED_VARIANTS\n"
out += "#TYPE=GENETIC_VARIANT\n"
out += "#GENOME_TYPE=UCSC\n"
......
# **enrich_maps.R** Script for map and pathway enrichment
Should be run as `Rscript --vanilla enrich_maps.R <input_filename> <config_file>`
`input_filename` should contain list of genes as HGNC symbols, one entry per line. Default is "input.txt" in the script folder.
`config_file` contains the list of map instances and pathways to enrich. Default is "config.txt" in the script folder.
The script uses "config.txt" file, containing the list of minerva instances and pathways to query for enriched areas.
It provides output with the list of enriched map areas and pathways.
Statistical tests used for maps and pathways are different.
# **get_extended_network.R** Script for extending the original list of disease genes using PPI data
Should be run as `Rscript --vanilla get_extended_network.R <input_filename> <config_file>`
......
type resource value parameter max_areas_per_map 5 parameter max_areas_per_pathway_db 5 map https://pdmap.uni.lu/minerva/api/ map https://progeria.uni.lu/minerva/api/ pathway_db WikiPathways_2019_Human pathway_db Panther_2016
\ No newline at end of file
import logging
import argparse
import xml.etree.ElementTree as ET
def trim(path: str, max_len: int = 250) -> str:
    """Parse an XML file and shorten attribute values that are too long.

    Every attribute of every element is inspected. A value longer than
    ``max_len`` characters is replaced as follows:

    * if the attribute name contains ``"resource"``, only the part before
      the first ``;`` is kept, so the value remains a usable RDF resource
      (MINERVA would otherwise fail during import);
    * any other attribute is set to the empty string.

    A warning is logged for each trimmed attribute.

    :param path: path of the XML file to read.
    :param max_len: maximum attribute length that is left untouched
        (defaults to 250, the limit originally hard-coded here).
    :return: the whole modified document serialised as a string.
    """
    tree = ET.parse(path)
    root = tree.getroot()
    for elem in root.iter():
        atts = elem.attrib
        # Snapshot the items so values can be reassigned while looping.
        for k, v in list(atts.items()):
            if len(v) > max_len:
                logging.warning("Trimming {} (key {} too long)".format(elem, k))
                if "resource" in k:
                    # We want to keep the RDF resource still a valid resource otherwise MINERVA will fail during import
                    atts[k] = v.split(";")[0]
                else:
                    atts[k] = ""
    return ET.tostring(root, encoding='utf8').decode('utf8')
if __name__ == '__main__':
    # Command-line entry point: trim the map given with -i/--input_xml and
    # write the resulting XML to standard output.
    logging.basicConfig(
        format='%(asctime)s [%(levelname)s] %(module)s - %(message)s',
        datefmt='%H:%M:%S',
        level=logging.INFO)

    cli = argparse.ArgumentParser()
    cli.add_argument("-i", "--input_xml",
                     help="XML file with a map",
                     required=True)
    options = cli.parse_args()

    trimmed_xml = trim(options.input_xml)
    print(trimmed_xml)
\ No newline at end of file
#!/usr/bin/env bash
# User-editable settings for the pipeline; the main script sources this file
# and copies it into the result directory.

# --- Disease selection -------------------------------------------------------
# Orphanet identifier(s) handed to the association scripts (-o); several IDs
# are given as a comma-separated list, as in the commented examples.
ORPHANET_IDS='130'
#ORPHANET_IDS="851,404493,46348,85295"
#ORPHANET_IDS="33,67046,79327,79321,86309"

# --- DisGeNET ----------------------------------------------------------------
# 1 = query DisGeNET; the count and score thresholds are passed to
# disgenet.py as -n and -s.
USE_DISGENET='1'
DISGENET_CNT_THRESHOLD='50'
DISGENET_ASSOCIATION_SCORE_THRESHOLD='0'

# --- OpenTargets -------------------------------------------------------------
# 1 = query OpenTargets; the count and score thresholds are passed to
# opentargets.py as -n and -s.
USE_OPENTARGETS='1'
OPENTARGETS_CNT_THRESHOLD='50'
OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD='0.3'

# --- Enrichment --------------------------------------------------------------
# Written into the enrichment config as max_areas_per_map and
# max_areas_per_pathway_db respectively.
ENRICH_MAX_AREAS_PER_MAP='1'
MAX_AREAS_PER_PATHWAY_DB='1'

# --- Pipeline control --------------------------------------------------------
BUILD_MAP_GENERATOR='0' #1 for building it
# The main script exits after stage 1 or 2 when this equals that stage number.
STOP_AFTER_STAGE='-1'
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment