Commit 1150cb40 authored by David Hoksza's avatar David Hoksza
Browse files

clinvar merging accepting arbitrary number of inputs

parent 7a177575
......@@ -23,7 +23,7 @@ $PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_T
echo "Disgenet gene and variant associations stored in ${disgenet_out_path}"
genes_variants_out_path=${RES_DIR}/02-genes_variants-id_${ORPHANET_IDS_UNDERSCORE}.log
$PYTHON_BIN $ASSOCIATIONS_DIR/compare_variants_with_clinvar.py -v $disgenet_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
$PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
echo "Integration with ClinVar stored in ${genes_variants_out_path}"
genes_line=`cat ${genes_variants_out_path} | grep "genes in total"`
......@@ -43,10 +43,13 @@ minerva_variants_out_path=${RES_DIR}/04-minerva-variants-id_${ORPHANET_IDS_UNDER
$PYTHON_BIN $ASSOCIATIONS_DIR/minerva_variants.py -f ${variants_out_path} > ${minerva_variants_out_path}
# ------------------------------ 2. Obtain pathways ------------------------------
echo "Rscript --vanilla enrichment/enrich_maps.R enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}"
echo "Retrieving enriched pathways"
Rscript --vanilla enrichment/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG}
enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps.txt
enriched_paths_out_path=$RES_DIR/05-enriched_pathways.txt
cp enriched_disease_maps.txt ${enriched_maps_out_path}
cp enriched_pathways.txt ${enriched_paths_out_path}
mv enriched_disease_maps.txt ${enriched_maps_out_path}
mv enriched_pathways.txt ${enriched_paths_out_path}
echo "Enriched pathways obtained"
# ------------------------------ 2. Assemble pathways and overlays into a map ------------------------------
# MINERVA
......@@ -83,7 +83,7 @@ if __name__ == '__main__':
i += 1
genes_sorted[k] = v
print(json.dumps(genes_sorted, indent=2))
print(json.dumps({"name": "disgenet", "genes": genes_sorted}, indent=2))
......
......@@ -2,14 +2,17 @@ import common
import logging
import json
import argparse
from typing import List, Dict
from typing import List, Dict, Set, Tuple
import requests
import pandas as pd
def parse_gene_variants(fname) -> str:
with open(fname) as f:
return json.loads(f.read())
def parse_gene_variants(fnames: List[str]) -> List[Dict]:
gvs:List[Dict] = []
for fname in fnames:
with open(fname) as f:
gvs.append(json.loads(f.read()))
return gvs
def df_contains_list(df, column_id, ids):
......@@ -67,43 +70,63 @@ def get_clinvar_variants(clinvar_fname:str, orpha_ids:List[str]):
return filter_out_conflicting(gene_variants)
def contrast_clinvar(gene_variants:Dict[str, Dict], gene_variants_clinvar:Dict[str, Dict]):
genes = set(gene_variants.keys())
genes_clinvar = set(gene_variants_clinvar.keys())
def contrast_clinvar(gene_variants:List[Dict], gene_variants_clinvar:Dict):
variants = set()
variants_clinvar = set()
for gene in gene_variants:
for variant in gene_variants[gene]['variants']:
variants.add((gene, variant))
for gene in gene_variants_clinvar:
for variant in gene_variants_clinvar[gene]['variants']:
variants_clinvar.add((gene, variant))
all_g_v = gene_variants + [gene_variants_clinvar]
genes: List[Set[str]] = []
print("-------------Gene-level summary-------------")
print('{} input genes: {}'.format(len(genes), sorted(genes)) )
print('{} ClinVar genes: {}'.format(len(genes_clinvar), sorted(genes_clinvar)))
for i in range(len(all_g_v)):
genes.append(set(all_g_v[i]["genes"].keys()))
print('{} {} genes: {}'.format(len(genes[i]), all_g_v[i]["name"], sorted(genes[i])))
g_c = sorted(genes.difference(genes_clinvar))
c_g = sorted(genes_clinvar.difference(genes))
print('{} genes in input and not in ClinVar: {}'.format(len(g_c), str(g_c)))
print('{} genes in ClinVar and not in input: {}'.format(len(c_g), str(c_g)))
print()
genes_all = genes.union(genes_clinvar)
print('{} genes in total: {}'.format(len(genes_all), ",".join(genes_all)))
for i in range(len(all_g_v)):
name1 = all_g_v[i]["name"]
genes1 = genes[i]
for j in range(i+1, len(all_g_v)):
name2 = all_g_v[j]["name"]
genes2 = genes[j]
print()
print("------------Variant-level summary-----------")
g1_g2 = sorted(genes1.difference(genes2))
g2_g1 = sorted(genes2.difference(genes1))
v_c = sorted(variants.difference(variants_clinvar))
c_v = sorted(variants_clinvar.difference(variants))
print('{} variants in input and not in ClinVar: {}'.format(len(v_c), str(v_c)))
print('{} variants in ClinVar and not in input: {}'.format(len(c_v), str(c_v)))
print('{} genes in {} and not in {}: {}'.format(len(g1_g2), name1, name2, str(g1_g2)))
print('{} genes in {} and not in {}: {}'.format(len(g2_g1), name2, name1, str(g2_g1)))
db_snp_ids = set([v[1] for v in variants])
db_snp_ids_clinvar = set([v[1] for v in variants_clinvar])
all_db_snp_ids = db_snp_ids.union(db_snp_ids_clinvar)
print('{} variants in total: {}'.format(len(all_db_snp_ids), ", ".join(all_db_snp_ids)))
print()
print("-------------Variant-level summary-------------")
variants:List[Set[Tuple[str]]] = [] #for each dataset a set of gene-dbsnpid pairs
for i in range(len(all_g_v)):
variants.append(set())
for gene in all_g_v[i]["genes"]:
for variant in all_g_v[i]["genes"][gene]['variants']:
variants[i].add((gene, variant))
print('{} {} variants: {}'.format(len(variants[i]), all_g_v[i]["name"], sorted(variants[i])))
for i in range(len(all_g_v)):
name1 = all_g_v[i]["name"]
vars1 = variants[i]
for j in range(i + 1, len(all_g_v)):
name2 = all_g_v[j]["name"]
vars2 = variants[j]
v1_v2 = sorted(vars1.difference(vars2))
v2_v1 = sorted(vars2.difference(vars1))
print('{} variants in {} and not in {}: {}'.format(len(v1_v2), name1, name2, str(v1_v2)))
print('{} variants in {} and not in {}: {}'.format(len(v2_v1), name2, name1, str(v2_v1)))
print()
print("-------------Merged data-------------")
genes_all = set.union(*genes)
print('{} genes in total: {}'.format(len(genes_all), ",".join(genes_all)))
dbsnp_all = set()
for dataset_variants in variants:
dbsnp_all = dbsnp_all.union([v[1] for v in dataset_variants])
print('{} variants in total: {}'.format(len(dbsnp_all), ", ".join(dbsnp_all)))
if __name__ == '__main__':
......@@ -112,9 +135,9 @@ if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-v", "--variants",
parser.add_argument("-v", "--variant-files",
required=True,
help="JSON input from association")
help="Comma-separated list of JSON input files with gene-variant mapping")
parser.add_argument("-c", "--clinvar",
required=True,
help="ClinVar input file")
......@@ -124,6 +147,6 @@ if __name__ == '__main__':
args = parser.parse_args()
gene_variants = parse_gene_variants(args.variants)
gene_variants: List[Dict] = parse_gene_variants(args.variant_files.split(","))
gene_variants_clinvar = get_clinvar_variants(args.clinvar, args.orphanet_ids.split(","))
contrast_clinvar(gene_variants, gene_variants_clinvar)
\ No newline at end of file
contrast_clinvar(gene_variants, {"name": "clinvar", "genes": gene_variants_clinvar})
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment