David Hoksza / bh19-rare-diseases / Commits

Commit 6edb2f58, authored Dec 02, 2019 by David Hoksza
stop pipeline after unsuccessful command
parent ba7ed61d

Changes: 3
assemble.sh
@@ -28,19 +28,30 @@ mkdir $RES_DIR
 # First we copy the parameters into the result directory
 cp parameters.sh ${RES_DIR}

+check_exit_code(){
+  if [ $? -ne 0 ]; then
+    tput setaf 1;
+    echo "The last command failed. Can't continue..."
+    exit
+  fi
+}

 # Get associations from DisGeNET
 disgenet_out_path=${RES_DIR}/01-disgenet-n_${DISGENET_CNT_THRESHOLD}-s_${DISGENET_ASSOCIATION_SCORE_THRESHOLD_STR}.json
 $PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py -o ${ORPHANET_IDS} -n ${DISGENET_CNT_THRESHOLD} -s ${DISGENET_ASSOCIATION_SCORE_THRESHOLD} > ${disgenet_out_path}
+check_exit_code
 echo "Disgenet gene and variant associations stored in ${disgenet_out_path}"

 # Get associations from OpenTargets
 opentargets_out_path=${RES_DIR}/01-opentargets-n_${DISGENET_CNT_THRESHOLD}-s_${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD_STR}.json
 $PYTHON_BIN $ASSOCIATIONS_DIR/opentargets.py -o ${ORPHANET_IDS} -n ${OPENTARGETS_CNT_THRESHOLD} -s ${OPENTARGETS_ASSOCIATION_SCORE_THRESHOLD} > ${opentargets_out_path}
+check_exit_code
 echo "Opentargets gene and variant associations stored in ${opentargets_out_path}"

 # Merge with ClinVar
 genes_variants_out_path=${RES_DIR}/02-genes_variants.log
 $PYTHON_BIN $ASSOCIATIONS_DIR/merge_with_clinvar.py -v $disgenet_out_path,$opentargets_out_path -c ${ASSOCIATIONS_DATA_DIR}/OrphaHPO_clinvar_variants_summary.tsv -oid ${ORPHANET_IDS} > ${genes_variants_out_path}
+check_exit_code
 echo "Integration with ClinVar stored in ${genes_variants_out_path}"

 genes_line=`cat ${genes_variants_out_path} | grep "genes in total"`
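The new check_exit_code helper inspects $? after each pipeline step and aborts when a step failed. For comparison only, a minimal sketch of the same fail-fast idea using bash's built-in options and an ERR trap; the option names and the on_error handler below are illustrative and not part of this repository:

#!/bin/bash
# Sketch: fail-fast via shell options instead of explicit check_exit_code calls.
set -o errexit    # abort the script as soon as any command exits non-zero
set -o pipefail   # a failure inside a pipe (e.g. cat ... | grep ...) also counts

on_error() {
    tput setaf 1                                   # switch terminal colour to red
    echo "The last command failed. Can't continue..."
}
trap on_error ERR                                  # run the handler before the shell exits

# With errexit active, a failing step such as
#   $PYTHON_BIN $ASSOCIATIONS_DIR/disgenet.py ... > ${disgenet_out_path}
# would stop the pipeline without a separate check_exit_code call after it.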
@@ -52,11 +63,13 @@ echo "Genes stored in ${genes_out_path}"
 echo "Extending genes list"
 text_mining_out_path=${genes_out_path/03-genes/03-text-mining}
 echo Rscript --vanilla ${EXTEND_DIR}/get_extended_network.R ${genes_out_path} ${EXTEND_CONFIG}
+check_exit_code
 cp ${EXTEND_DIR}/output.txt ${text_mining_out_path}
 echo "Genes list extended"

 minerva_genes_out_path=${RES_DIR}/04-minerva-genes.txt
 $PYTHON_BIN $ASSOCIATIONS_DIR/minerva_genes.py -f ${genes_out_path} > ${minerva_genes_out_path}
+check_exit_code

 var_line=`cat ${genes_variants_out_path} | grep "variants in total"`
 variants_out_path=${genes_variants_out_path/02-genes_variants/03-variants}
@@ -66,6 +79,7 @@ echo "Variants stored in ${variants_out_path}"
 minerva_variants_out_path=${RES_DIR}/04-minerva-variants.txt
 $PYTHON_BIN $ASSOCIATIONS_DIR/minerva_variants.py -f ${variants_out_path} > ${minerva_variants_out_path}
+check_exit_code

 if [ ${STOP_AFTER_STAGE} = 1 ]; then
   echo "Exiting after stage ${STOP_AFTER_STAGE}"
@@ -81,6 +95,8 @@ sed -E -i "s/(max_areas_per_map)(.*)/\1\t${ENRICH_MAX_AREAS_PER_MAP}/g" ${ENRICH
 sed -E -i "s/(max_areas_per_pathway_db)(.*)/\1\t${MAX_AREAS_PER_PATHWAY_DB}/g" ${ENRICHMENT_CONFIG_TMP}

 Rscript --vanilla ${ENRICHMENT_DIR}/enrich_maps.R ${genes_out_path} ${ENRICHMENT_CONFIG_TMP}
+check_exit_code

 enriched_maps_out_path=$RES_DIR/05-enriched_disease_maps.txt
 enriched_paths_out_path=$RES_DIR/05-enriched_pathways.txt
 mv enriched_disease_maps.txt ${enriched_maps_out_path}
@@ -107,6 +123,8 @@ echo "Assembling the map from pathways ..."
 map_out_path=${RES_DIR}/06-minerva_map.xml
 java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --text-mining-file ${text_mining_out_path} --output-file ${map_out_path}
 #java -Xmx4g -jar ${MAP_GENERATOR_DIR}/biohackathon/target/biohackathon-1.0-jar-with-dependencies.jar --enrichr-file ${enriched_paths_out_path} --minerva-file ${enriched_maps_out_path} --output-file ${map_out_path}
+check_exit_code
 echo "Pathways assembled into ${map_out_path}"

 echo "Combining map with overlays"
associations/extend_network/get_extended_network.R
@@ -7,7 +7,7 @@
 # n: the number of new proteins in the extended network. By default 50
 # score: is the string score (between 0,1). By default 0. it filters a posteriori because
 # I was not able to find documentation on how to add the score in the API url request
-# dbsource: the source of the protein-protein interaction data. Currently, stringdb or omnipathdb.
+# dbsource: the source of the protein-protein interaction data. Currently, text_mining, stringdb or omnipathdb.
 # By default, stringdb, and then overlaps omnipathdb data to provide information about directionality.
 # When source = "omnipathdb" it will retrieve all information for the genes from omnipath
 # the output file name. It will contain the pairs gene A, gene B with their corresponding HGNC identifiers,
@@ -33,7 +33,7 @@ n <-config[config$param == "n", "value"]
 score <- config[config$param == "score", "value"]
 dbsource <- config[config$param == "source", "value"]
-if (!dbsource %in% c("stringdb", "omnipathdb")) {
+if (!dbsource %in% c("stringdb", "omnipathdb", "text_mining")) {
   stop("Invalid argument 'dbsource'. It should be omnipathdb or stringdb ")
 }
 if (!(score >= 0 & score <= 1)) {
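The script reads n, score and source from a small param/value table (the ${EXTEND_CONFIG} file passed in by assemble.sh). A hedged sketch of how the new text_mining source could be selected follows; only the parameter names ("n", "score", "source"), the defaults documented above (n = 50, score = 0) and the accepted value "text_mining" come from the code, while the file name and tab-separated layout are assumptions:

# Sketch: write a hypothetical extend-network config that selects the text_mining source.
printf 'param\tvalue\n'         >  extend_config.tsv
printf 'n\t50\n'                >> extend_config.tsv
printf 'score\t0\n'             >> extend_config.tsv
printf 'source\ttext_mining\n'  >> extend_config.tsv
# Rscript --vanilla ${EXTEND_DIR}/get_extended_network.R ${genes_out_path} extend_config.tsv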
@@ -53,7 +53,7 @@ library(biomaRt)
 message(paste0("there are ", length(unique(input))), " genes in the file")
-if (dbsource == "stringdb") {
+if (dbsource == "stringdb" | dbsource == "text_mining") {
   message("mapping the hgnc identifiers to ensemble proteins used by stringdb with biomart")
   ensembl <- useMart("ensembl")
   ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")
@@ -61,42 +61,81 @@ if (dbsource == "stringdb") {
                               filters = 'hgnc_symbol',
                               values = input,
                               mart = ensembl)
   biomartCacheClear()
   ensprotein_idest <- subset(ensprotein_idest, ensembl_peptide_id != "")
   message(paste0("there are ", length(unique(ensprotein_idest$hgnc_symbol))), " genes in the input file with ensembl protein ids")
-  message(paste0("doing the query to stringdb and retrieving n ", n, " neighbors"))
-  entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
-  entities <- paste(entities, collapse = "%0A")
+  if (dbsource == "stringdb"){
+    message(paste0("doing the query to stringdb and retrieving n ", n, " neighbors"))
+    entities <- paste0("9606.", ensprotein_idest$ensembl_peptide_id)
+    entities <- paste(entities, collapse = "%0A")
+    resp <- httr::POST(url = "api.jensenlab.org/network",
+                       httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
+                       body = paste0("entities=", entities, "&additional=", n),
+                       httr::verbose())
+    json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
+    protein_info <- json[[2]]
+    protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
+    protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
+    protein_info$protein <- gsub('9606.', "", protein_info$protein)
+    protein_info <- protein_info$protein
+    ppi_network <- json[[1]]
+    ppi_network$source <- gsub('9606.', "", ppi_network$source)
+    ppi_network$target <- gsub('9606.', "", ppi_network$target)
+    ppi_network$score <- ppi_network$scores$`stringdb::score`
+    message("the network contains ", dim(ppi_network)[1], " interactions ")
+    ppi_network <- ppi_network[ppi_network$score > score, ]
+    ppi_network <- unique(ppi_network[c("source", "target", "score")])
+    message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
+  } else {
+    ppi_network <- data.frame(source = character(), target = character(), score = double())
+    for (entity in ensprotein_idest$ensembl_peptide_id){
+      # print(paste0("http://7fccb2cc.ngrok.io/getCooccurrence/", entity, "?type=9606"))
+      # http://172.16.4.230:5000/
+      # resp <- httr::GET(url = paste0("http://7fccb2cc.ngrok.io/getCooccurrence/", entity, "?type=9606"),
+      #                   httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
+      #                   #, httr::verbose()
+      #                   )
+      resp <- httr::GET(url = paste0("http://172.16.4.230:5000/getCooccurrence/", entity, "?type=9606&limit=", n),
+                        httr::add_headers('Content-Type' = "application/x-www-form-urlencoded")
+                        #, httr::verbose()
+                        )
+      json <- jsonlite::fromJSON(httr::content(resp, as = "text", encoding = "UTF-8"), flatten = FALSE)
+      if (length(json) != 0) {
+        json$target <- json$info$entity
+        json <- unique(subset(json, grepl("ENSP", target))[c("target", "score")])
+        json$source <- entity
+        ppi_network <- rbind(ppi_network, json)
+      }
+    }
+    message("the network contains ", dim(ppi_network)[1], " interactions ")
+    ppi_network <- ppi_network[ppi_network$score > 0, ]
+    ppi_network <- unique(ppi_network[c("source", "target", "score")])
+    message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
+    if (dim(ppi_network)[1] > 0) {
+      protein_info <- unique(c(ppi_network$source, ppi_network$target))
+    } else {
+      stop("there is no info in text mining for the query genes")
+    }
+  }
-  resp <- httr::POST(url = "api.jensenlab.org/network",
-                     httr::add_headers('Content-Type' = "application/x-www-form-urlencoded"),
-                     body = paste0("entities=", entities, "&additional=", n),
-                     httr::verbose())
-  json <- jsonlite::fromJSON(httr::content(resp, as = "text"), flatten = FALSE)
-  protein_info <- json[[2]]
-  protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
-  protein_info$protein <- gsub('stringdb:', "", protein_info$`@id`)
-  protein_info$protein <- gsub('9606.', "", protein_info$protein)
   protein_info_2ensp <- getBM(attributes = c('ensembl_peptide_id', "hgnc_symbol", 'entrezgene_id'),
                               filters = 'ensembl_peptide_id',
-                              values = protein_info$protein,
+                              values = protein_info,
                               mart = ensembl)
   biomartCacheClear()
   protein_info_2ensp <- subset(protein_info_2ensp, ensembl_peptide_id != "")
   protein_info_2ensp$ensembl_peptide_id <- paste0("9606.", protein_info_2ensp$ensembl_peptide_id)
-  ppi_network <- json[[1]]
-  ppi_network$string_score <- ppi_network$scores$`stringdb::score`
-  message("the network contains ", dim(ppi_network)[1], " interactions ")
-  ppi_network <- subset(ppi_network, string_score > score)
-  message("after pruning by score, the network contains ", dim(ppi_network)[1], " interactions ")
   protein_info_2ensp <- subset(protein_info_2ensp, hgnc_symbol != "")
   ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "source", by.y = "ensembl_peptide_id")
   colnames(ppi_network)[which(names(ppi_network) == "hgnc_symbol")] <- "source_hgnc_symbol"
   colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "source_entrezgene_id"
   ppi_network <- merge(ppi_network, protein_info_2ensp, by.x = "target", by.y = "ensembl_peptide_id")
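For reference, the two HTTP calls made at this branch point, rewritten as curl sketches. The endpoints, header and parameters are taken from the R code above; the example Ensembl protein identifiers and the value 50 for n are placeholders, and the 172.16.4.230 host appears to be a temporary hackathon service that may no longer be reachable:

# stringdb branch: POST the seed proteins to the Jensen lab network API,
# asking for n additional interactors (entities are joined with %0A, i.e. newlines).
# The script passes the bare host, so curl defaults to http here.
curl -s -X POST 'api.jensenlab.org/network' \
     -H 'Content-Type: application/x-www-form-urlencoded' \
     --data 'entities=9606.ENSP00000269305%0A9606.ENSP00000275493&additional=50'

# text_mining branch: one co-occurrence lookup per seed protein.
curl -s 'http://172.16.4.230:5000/getCooccurrence/ENSP00000269305?type=9606&limit=50' \
     -H 'Content-Type: application/x-www-form-urlencoded'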
@@ -104,7 +143,7 @@ if (dbsource == "stringdb") {
   colnames(ppi_network)[which(names(ppi_network) == "entrezgene_id")] <- "target_entrezgene_id"
-  ppi_network <- unique(ppi_network[c("source_hgnc_symbol", "source_entrezgene_id", "target_hgnc_symbol", "target_entrezgene_id", "string_score")])
+  ppi_network <- unique(ppi_network[c("source_hgnc_symbol", "source_entrezgene_id", "target_hgnc_symbol", "target_entrezgene_id", "score")])
   message("after converting to gene symbols, the network contains ", dim(ppi_network)[1], " interactions ")
@@ -118,12 +157,11 @@ if (dbsource == "stringdb") {
   colnames(result)[which(names(result) == "source_genesymbol")] <- "source_hgnc_symbol"
   colnames(result)[which(names(result) == "target_genesymbol")] <- "target_hgnc_symbol"
   gene_info_2entrez <- getBM(attributes = c('hgnc_symbol', 'entrezgene_id'),
                              filters = 'hgnc_symbol',
                              values = c(result$source_hgnc_symbol, result$target_hgnc_symbol),
                              mart = ensembl)
   biomartCacheClear()
   gene_info_2entrez <- subset(gene_info_2entrez, entrezgene_id != "")
   result <- merge(result, gene_info_2entrez, by.x = "source_hgnc_symbol", by.y = "hgnc_symbol")
@@ -141,7 +179,7 @@ if (dbsource == "stringdb") {
"target_hgnc_symbol"
,
"target_entrezgene_id"
,
"is_directed"
,
"consensus_direction"
,
"references"
,
"pair"
)])
ppi_network
<-
subset
(
ppi_network
,
!
pair
%in%
final_results
$
pair
)
ppi_network
<-
within
(
ppi_network
,
rm
(
string_
score
,
pair
))
ppi_network
<-
within
(
ppi_network
,
rm
(
score
,
pair
))
final_results
<-
merge
(
final_results
[,
-
grep
(
"pair"
,
colnames
(
final_results
))],
ppi_network
,
all
=
T
)
associations/minerva_variants.py
@@ -126,7 +126,7 @@ def get_dbsnp(ids: List[str]) -> List[Dict]:
"allele_frequency"
:
allele_frequency
,
"variant_identifier"
:
variant_identifier
,
"amino_acid_change"
:
amino_acid_change
,
"uniprot
_acc
"
:
uniprot_acc
"
identifier_
uniprot"
:
uniprot_acc
})
logging
.
info
(
"Skipped {} variants out of {}"
.
format
(
cnt
-
len
(
snps
),
cnt
))
@@ -139,7 +139,7 @@ def remove_snps_with_multiple_uniprot_ids(db_snps: List[Dict]) -> List[Dict]:
         id = snp["variant_identifier"]
         if id not in snp_cnt:
             snp_cnt[id] = set()
-        snp_cnt[id].add(snp["uniprot_acc"])
+        snp_cnt[id].add(snp["identifier_uniprot"])
     id_to_keep = [id for id in snp_cnt if len(snp_cnt[id]) == 1]
     new_db_snps: List[Dict] = []
@@ -158,10 +158,6 @@ def get_minerva_format(db_snps: List[Dict]) -> str:
     if len(db_snps) == 0:
         return out
-    #remove uniprot accesion numbers since these don't currently go to Minerva
-    for snp in db_snps:
-        del snp['uniprot_acc']
     out += "#NAME=DISEASE_ASSOCIATED_VARIANTS\n"
     out += "#TYPE=GENETIC_VARIANT\n"
     out += "#GENOME_TYPE=UCSC\n"