Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Laura Denies
PathoFact
Commits
850f0c34
Commit
850f0c34
authored
Jun 22, 2020
by
Laura Denies
Browse files
update Toxin_report with SignalP v5.0 results
parent
4e54bc02
Changes
5
Hide whitespace changes
Inline
Side-by-side
Snakefile
View file @
850f0c34
...
...
@@ -22,7 +22,7 @@ elif config["pathofact"]["workflow"] == "Tox":
input:
expand(
[
"{datadir}/{project}/Toxin_prediction_{sample}_report.
c
sv",
"{datadir}/{project}/Toxin_prediction_{sample}_report.
t
sv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
...
...
rules/Toxin/Combine_Toxin_SignalP.smk
View file @
850f0c34
...
...
@@ -15,12 +15,12 @@ rule R_script:
signalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
library=config["pathofact"]["tox_lib"]
output:
gene_library=
temp(
"{datadir}/{project}/T
OXIN/R_output/{sample}_gene_table_library.c
sv"
)
,
gene_toxic=
temp(
"{datadir}/{project}/T
OXIN/R_output/{sample}_gene_table_Toxic.c
sv"
)
gene_library="{datadir}/{project}/T
oxin_gene_library_{sample}_report.t
sv",
gene_toxic="{datadir}/{project}/T
oxin_prediction_{sample}_report.t
sv"
log:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.log"
message:
"Run external R script to join SignalP and ToxinHMM"
"Run external R script to join SignalP and ToxinHMM
and create Toxin report (incl. confidence levels)
"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
...
...
@@ -30,125 +30,3 @@ rule R_script:
script:
"../../scripts/ownHMM_library.R"
#Put the different files in the correct configuration
###Gene table library
rule config_library:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.csv"
output:
temp("{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library_1.csv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input}| sed 's/,/#/g' > {output}
"""
rule config_library_2:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library_1.csv"
output:
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
cut -f3,4,5,6,7,8,9 -d "#" {input} | sed 's/#/\\t/g' > {output}
"""
#Gene table Toxic
rule config_toxic:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_Toxic.csv"
output:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' | cut -f2,3,4,5 > {output}
"""
###########################################################
### Insert a confidence level for the different predictions #
#############################################################
# Prediction: Non-Pathogenic
rule non_pathogenic:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Non-pathogenic.txt"
message:
"The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$3 =="non-pathogenic"' {input} | awk '$5 ="-"' | sed 's/ /\\t/g' > {output}
"""
# Prediction: confidence 1
rule confidence_1:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_1.txt"
message:
"The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\\t/g' > {output}
"""
# Prediction: confidence 2
rule confidence_2:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_2.txt"
message:
"The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="N" && $3 =="pathogenic"' {input} | awk '$5 ="2"' | sed 's/ /\\t/g' > {output}
"""
# Combine different prediction files
rule combine_confidence:
input:
Non_pathogenic="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Non-pathogenic.txt",
Confidence_1="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_1.txt",
Confidence_2="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_2.txt"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Confidence_HMM_SignalP_ensembled.csv"
message:
"The different confidence files ({input}) are combined in a single output file: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
cat {input} | sort > {output}
"""
rules/Toxin/Toxin_report.smk
deleted
100755 → 0
View file @
4e54bc02
#Toxin
import glob
import os
####################################
# Final Report #
####################################
#create final report by combining all files
rule merge_final:
input:
translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
prediction="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Confidence_HMM_SignalP_ensembled.csv",
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_ToxinHMM_SignalP_translation_ensembled.csv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
join -t $'\\t' <(sort {input.translation}) <(sort {input.prediction}) > {output}
"""
rule toxin_report:
input:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_ToxinHMM_SignalP_translation_ensembled.csv"
output:
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"sed -i $'1 i\\\ Sequence no.\\tSequence Query\\tNumber of Hits\\tHMM prediction\\tSignalP\\tConfidence level' {input};"
"cp {input} {output}"
scripts/ownHMM_library.R
View file @
850f0c34
...
...
@@ -3,53 +3,48 @@
# logging
sink
(
file
=
file
(
snakemake
@
log
[[
1
]],
open
=
"wt"
),
type
=
"message"
)
#setwd("~/Desktop/own_HMM_library")
####################################
### Input of the different files ###
####################################
library
(
tidyverse
)
# Input
## Toxin prediction
gene_table_HMM
<-
read.delim
(
file
=
snakemake
@
input
[[
"input_HMM"
]],
comment.char
=
"#"
)
translation
<-
read.delim
(
file
=
snakemake
@
input
[[
"translation"
]],
header
=
FALSE
)
gene_table_HMM
<-
gene_table_HMM
[,
c
(
1
,
2
,
4
,
3
)]
colnames
(
gene_table_HMM
)
<-
c
(
"ORF_ID"
,
"HMM_Name"
,
"Score"
,
"Significance_evalue"
)
gene_table_HMM
$
ORF_ID
<-
sprintf
(
"%010d"
,
as.numeric
(
gene_table_HMM
$
ORF_ID
))
## ORF translation
ORF_translation
<-
read.delim
(
file
=
snakemake
@
input
[[
"translation"
]],
header
=
FALSE
)
colnames
(
ORF_translation
)
<-
c
(
"ORF_ID"
,
"ORF"
)
ORF_translation
$
ORF
<-
sub
(
'>'
,
''
,
ORF_translation
$
ORF
)
ORF_translation
<-
ORF_translation
[,
c
(
2
,
1
)]
ORF_translation
$
ORF_ID
<-
sprintf
(
"%010d"
,
as.numeric
(
ORF_translation
$
ORF_ID
))
## SignalP prediction
signalP
<-
read.table
(
file
=
snakemake
@
input
[[
"signalP"
]],
sep
=
"\t"
,
header
=
TRUE
)
colnames
(
signalP
)
<-
c
(
"ORF_ID"
,
"Signal_peptide"
,
"Signal_prediction"
,
"Organism"
)
signalP
$
ORF_ID
<-
sprintf
(
"%010d"
,
as.numeric
(
signalP
$
ORF_ID
))
gene_table_HMM
<-
gene_table_HMM
[,
c
(
1
,
2
,
4
,
3
)]
colnames
(
translation
)
<-
c
(
"ID"
,
"Query_sequence"
)
translation
<-
translation
[,
c
(
2
,
1
)]
#Description file of the HMM
## Gene library
library_HMM
<-
read.table
(
file
=
snakemake
@
input
[[
"library"
]],
sep
=
";"
,
header
=
T
,
quote
=
'#'
)
colnames
(
library_HMM
)
<-
c
(
"HMM_ID"
,
"NAME"
,
"Alternative_name"
,
"Database"
,
"Description"
)
gene_table_library
<-
merge.data.frame
(
gene_table_HMM
,
library_HMM
,
by.x
=
"HMM_Name"
,
by.y
=
"HMM_ID"
)
gene_table_library
<-
gene_table_library
[,
c
(
2
,
1
,
3
,
4
,
5
,
6
,
7
,
8
)]
gene_table_library
<-
merge.data.frame
(
translation
,
gene_table_library
,
by.x
=
"ID"
,
by.y
=
"Query_sequence"
)
gene_table_library
$
ID
<-
sprintf
(
"%010d"
,
as.numeric
(
gene_table_library
$
ID
))
write.csv
(
gene_table_library
,
file
=
snakemake
@
output
[[
"gene_library"
]])
#Toxin/Non-Toxin file
frequency
<-
data.frame
(
table
(
gene_table_HMM
[
1
]))
prediction
<-
data.frame
(
rep
(
1
,
length
(
table
(
gene_table_HMM
[
1
]))))
table_pathos_freq
<-
cbind.data.frame
(
frequency
,
prediction
)
colnames
(
table_pathos_freq
)
<-
c
(
"Query_sequence"
,
"Number_of_Hits"
,
"Prediction"
)
protein_number
<-
nrow
(
translation
)
number
<-
cbind
(
c
(
seq
(
1
,
protein_number
,
1
)),
c
(
seq
(
1
,
protein_number
,
1
)))
gene_table_Toxic
<-
merge.data.frame
(
table_pathos_freq
,
number
,
by.x
=
"Query_sequence"
,
by.y
=
"V1"
,
all.y
=
T
)
gene_table_Toxic
$
Query_sequence
=
NULL
gene_table_Toxic
<-
gene_table_Toxic
[,
c
(
3
,
1
,
2
)]
colnames
(
gene_table_Toxic
)
<-
c
(
"Query_sequence"
,
"Number_of_Hits"
,
"Prediction"
)
gene_table_Toxic
[
c
(
"Number_of_Hits"
,
"Prediction"
)][
is.na
(
gene_table_Toxic
[
c
(
"Number_of_Hits"
,
"Prediction"
)])]
<
-0
gene_table_Toxic
$
Prediction
<-
gsub
(
0
,
"non-pathogenic"
,
gene_table_Toxic
$
Prediction
)
gene_table_Toxic
$
Prediction
<-
gsub
(
1
,
"pathogenic"
,
gene_table_Toxic
$
Prediction
)
gene_table_Toxic
<-
as.data.frame
(
gene_table_Toxic
)
#Merge SignalP and Toxic HMM
gene_table_Toxic
<-
merge.data.frame
(
gene_table_Toxic
,
signalP
,
by.x
=
"Query_sequence"
,
by.y
=
"ID"
)
gene_table_Toxic
$
Query_sequence
<-
sprintf
(
"%010d"
,
as.numeric
(
gene_table_Toxic
$
Query_sequence
))
write.csv
(
gene_table_Toxic
,
file
=
snakemake
@
output
[[
"gene_toxic"
]])
#Description file of the HMM
gene_table_library
<-
merge.data.frame
(
gene_table_HMM
,
library_HMM
,
by.x
=
"HMM_Name"
,
by.y
=
"HMM_ID"
,
all.x
=
TRUE
)
gene_table_library
<-
gene_table_library
[,
c
(
2
,
1
,
3
,
4
,
5
,
6
,
7
,
8
)]
gene_table_library
<-
merge.data.frame
(
ORF_translation
,
gene_table_library
,
by
=
"ORF_ID"
)
write.table
(
gene_table_library
,
file
=
snakemake
@
output
[[
"gene_library"
]],
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
# Toxin prediction
gene_table_Toxic
<-
data.frame
(
table
(
gene_table_HMM
[
1
]))
colnames
(
gene_table_Toxic
)
<-
c
(
"ORF_ID"
,
"Number_of_hits"
)
gene_table_Toxic
$
Toxin_prediction
<-
"pathogenic"
gene_table_Toxic
<-
left_join
(
ORF_translation
,
gene_table_Toxic
,
by
=
"ORF_ID"
)
gene_table_Toxic
$
Toxin_prediction
<-
fct_explicit_na
(
gene_table_Toxic
$
Toxin_prediction
,
na_level
=
"non_pathogenic"
)
gene_table_Toxic
[
is.na
(
gene_table_Toxic
)]
<
-0
gene_table_Toxic
<-
full_join
(
gene_table_Toxic
,
signalP
,
by
=
"ORF_ID"
)
gene_table_Toxic
$
Toxin_confidence_level
<-
ifelse
(
gene_table_Toxic
$
Toxin_prediction
==
"pathogenic"
&
gene_table_Toxic
$
Signal_peptide
==
"Y"
,
"1: Secreted Toxin"
,
ifelse
(
gene_table_Toxic
$
Toxin_prediction
==
"pathogenic"
&
gene_table_Toxic
$
Signal_peptide
==
"N"
,
"2: Non-secreted Toxin"
,
"-"
))
gene_table_Toxic
<-
gene_table_Toxic
%>%
select
(
1
,
2
,
3
,
4
,
5
,
8
)
write.table
(
gene_table_Toxic
,
file
=
snakemake
@
output
[[
"gene_toxic"
]],
sep
=
"\t"
,
row.names
=
FALSE
,
quote
=
FALSE
)
workflows/Toxin_workflow.smk
View file @
850f0c34
...
...
@@ -7,8 +7,6 @@ include:
'../rules/Toxin/Toxin.smk'
include:
'../rules/Toxin/Combine_Toxin_SignalP.smk'
include:
'../rules/Toxin/Toxin_report.smk'
# master command
rule Analysis:
...
...
@@ -17,7 +15,7 @@ rule Analysis:
[
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/Toxin_prediction_{sample}_report.
c
sv"
"{datadir}/{project}/Toxin_prediction_{sample}_report.
t
sv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment