Gitlab is now using https://gitlab.lcsb.uni.lu as it's primary address. Please update your bookmarks. FAQ.

Commit a09f2eaf authored by Laura Denies's avatar Laura Denies
Browse files

adjusted virulence report to include SignalP v5.0

parent 03b526bc
......@@ -33,7 +33,7 @@ elif config["pathofact"]["workflow"] == "Vir":
rule all:
input:
expand(
"{datadir}/{project}/Virulence_prediction_{sample}_report.csv",
"{datadir}/{project}/Virulence_prediction_{sample}_report.tsv",
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["pathofact"]["workflow"] == "AMR":
......
......@@ -9,104 +9,17 @@ import os
rule merge_SignalPVir:
input:
Vir="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_virulence_final_prediction.tsv",
SignalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv"
hmm="{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_prediction.tsv",
classifier="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}_classifier_results_formatted.tsv",
SignalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
translation="{datadir}/{project}/renamed/{sample}_translation.tsv"
output:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
Virulence_report="{datadir}/{project}/Virulence_prediction_{sample}_report.tsv"
log:
"{datadir}/{project}/VIRULENCE//{sample}.combine_virulence_results.log"
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
join -t $'\\t' <(sort {input.Vir}) <(sort {input.SignalP}) > {output}
"""
conda: "../../envs/R.yaml"
script: "../../scripts/Virulence.R"
###########################################################
# Insert a confidence level for the different predictions #
###########################################################
# Prediction: Non-Pathogenic
rule SignalVir_non_pathogenic:
input:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_non_pathogenic.txt")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message:
"The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as Virulence prediction"
shell:
"""
awk '$4 =="Non-Pathogenic"' {input} | awk '$6 ="-"' | sed 's/ /\\t/g' > {output}
"""
# Prediction: confidence 1
rule SignalVir_confidence_1:
input:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_1.txt")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message:
"The confidence level of 1 is given to query sequences which are positives for both SignalP as Virulence"
shell:
"""
awk '$4 =="Pathogenic" && $5 =="Y"' {input} | awk '$6 ="1"' | sed 's/ /\\t/g' > {output}
"""
rule SignalVir_confidence_2:
input:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_2.txt")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="Pathogenic" && $5 =="N"' {input} | awk '$6 ="2"' | sed 's/ /\\t/g' > {output}
"""
rule SignalVir_confidence_3:
input:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_3.txt")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="-" && $5 =="Y"' {input} | awk '$6 ="3"' | sed 's/ /\\t/g' > {output}
"""
rule SignalVir_confidence_4:
input:
"{datadir}/{project}/VIRULENCE/virulence_merged/{sample}_virulence_SignalP_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_4.txt")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="-" && $5 =="N"' {input} | awk '$6 ="4"' | sed 's/ /\\t/g' > {output}
"""
rule SignalVir_virulence_prediction:
input:
confidence_1="{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_1.txt",
confidence_2="{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_2.txt",
confidence_3="{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_3.txt",
confidence_4="{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_confidence_4.txt",
non_pathogenic="{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_virulence_non_pathogenic.txt"
output:
"{datadir}/{project}/VIRULENCE/Virulence_prediction/{sample}_Confidence_Virulence_ensembled.csv"
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} > {output}"
......@@ -100,9 +100,9 @@ rule HMM_VIR_classification:
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "True" && $3 == "False" && $4 == "False"' {input} | awk '$5 = "Non-Pathogenic"' | sed 's/ /\\t/g' > {output.non_path}
awk '$3 == "True"' {input} | awk '$5 = "Pathogenic"' | sed 's/ /\\t/g' > {output.pathogenic}
awk '$3 == "False" && $4 == "True"' {input} | awk '$5 = "Unclassified"' | sed 's/ /\\t/g' > {output.unclassified}
awk '$2 == "True" && $3 == "False" && $4 == "False"' {input} | awk '$5 = "negative"' | sed 's/ /\\t/g' > {output.non_path}
awk '$3 == "True"' {input} | awk '$5 = "pathogenic"' | sed 's/ /\\t/g' > {output.pathogenic}
awk '$3 == "False" && $4 == "True"' {input} | awk '$5 = "unclassified"' | sed 's/ /\\t/g' > {output.unclassified}
"""
......@@ -294,113 +294,3 @@ rule format_classifier_2:
"""
awk '{{$1=sprintf("%010d", $1)}}1' {input} | sed 's/ /\\t/g' > {output}
"""
##############################
# Merge Virulence Prediction #
##############################
rule merge_virulence:
input:
hmm="{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}_hmm_prediction.tsv",
classifier="{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}_classifier_results_formatted.tsv"
output:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"join -t $'\\t' <(sort {input.hmm}) <(sort {input.classifier}) > {output};"
"sed -i $'1 i\\\ ID\\tHMM_prediction\\tmodel_prediction' {output}"
rule virulence_nonpath:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_non_path_1.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Non-Pathogenic" && $3 == "negative"' {input} | awk '$4 = "Non-Pathogenic"' | sed 's/ /\\t/g' > {output}
"""
rule virulence_nonpath2:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_non_path_2.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Unclassified" && $3 == "negative"' {input} | awk '$4 = "Non-Pathogenic"' | sed 's/ /\\t/g' > {output}
"""
rule virulence_path:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_pathogenic_1.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Pathogenic" && $3 == "pathogenic"' {input} | awk '$4 = "Pathogenic"' | sed 's/ /\\t/g' > {output}
"""
rule virulence_path2:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_pathogenic_2.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Unclassified" && $3 == "pathogenic"' {input} | awk '$4 = "Pathogenic"' | sed 's/ /\\t/g' > {output}
"""
rule virulence_unclassified1:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_unclassified_1.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Pathogenic" && $3 == "negative"' {input} | awk '$4 = "-"' | sed 's/ /\\t/g' > {output}
"""
rule virulence_unclassified2:
input:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_Virulence_prediction.tsv"
output:
temp("{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_unclassified_2.tsv")
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$2 == "Non-Pathogenic" && $3 == "pathogenic"' {input} | awk '$4 = "-"' | sed 's/ /\\t/g' > {output}
"""
rule merge_combined:
input:
Nonpath_1="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_non_path_1.tsv",
Nonpath_2="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_non_path_2.tsv",
Pathogenic_1="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_pathogenic_1.tsv",
Pathogenic_2="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_pathogenic_2.tsv",
Unclassified_1="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_unclassified_1.tsv",
Unclassified_2="{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_prediction_unclassified_2.tsv"
output:
"{datadir}/{project}/VIRULENCE/HMM_classifier_virulence/{sample}_virulence_final_prediction.tsv"
params:
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} > {output}"
#!/usr/bin/env R
# logging
sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
# HMM
HMM <- read.delim(file= snakemake@input[["hmm"]], header=FALSE)
colnames(HMM) <- c("ORF_ID","Virulence_HMM_prediction")
# classifier
classifier <- read.delim(file = snakemake@input[["classifier"]], header=FALSE)
colnames(classifier) <- c("ORF_ID","Virulence_classifier_prediction")
# ORF translation
ORF_translation<-read.delim(file = snakemake@input[["translation"]], header=FALSE)
colnames(ORF_translation)<-c("ORF_ID","ORF")
ORF_translation$ORF<- sub('>', '', ORF_translation$ORF)
ORF_translation<-ORF_translation[,c(2,1)]
# Complete Virulence prediction
Virulence <- full_join(HMM, classifier, by="ORF_ID")
Virulence <- full_join(ORF_translation, Virulence, by="ORF_ID")
Virulence$Virulence_prediction <- ifelse(Virulence$Virulence_HMM_prediction == "pathogenic" & Virulence$Virulence_classifier_prediction == "pathogenic", "pathogenic",
ifelse(Virulence$Virulence_HMM_prediction == "unclassified" & Virulence$Virulence_classifier_prediction == "pathogenic", "pathogenic",
ifelse(Virulence$Virulence_HMM_prediction == "negative" & Virulence$Virulence_classifier_prediction == "pathogenic", "unclassified",
ifelse(Virulence$Virulence_HMM_prediction == "pathogenic" & Virulence$Virulence_classifier_prediction == "negative", "unclassified","non_pathogenic"))))
# Signal P
SignalP <- read.delim(file = snakemake@input[["SignalP"]])
SignalP <- SignalP %>% select(1,2)
colnames(SignalP)<- c("ORF_ID","Signal_peptide")
# Combine Virulence prediction with SignalP prediction
Virulence <- full_join(Virulence, SignalP, by="ORF_ID")
Virulence$Virulence_confidence_level <- ifelse(Virulence$Virulence_prediction == "pathogenic" & Virulence$Signal_peptide == "Y", "1: Secreted Virulence factor",
ifelse(Virulence$Virulence_prediction == "pathogenic"& Virulence$Signal_peptide == "N", "2: Non-secreted Virulence factor",
ifelse(Virulence$Virulence_prediction == "unclassified" & Virulence$Signal_peptide == "Y", "3: Potential Secreted Virulence factor",
ifelse(Virulence$Virulence_prediction == "unclassified" & Virulence$Signal_peptide == "N", "4: Potential Non-secreted Virulence factor","-"))))
Virulence$ORF_ID <- sprintf("%010d",as.numeric(Virulence$ORF_ID))
write.table(Virulence, file=snakemake@output[["Virulence_report"]], sep="\t", row.names=FALSE, quote=FALSE)
......@@ -7,8 +7,7 @@ include:
'../rules/Virulence/Virulence.smk'
include:
'../rules/Virulence/Combine_Virulence_SignalP.smk'
include:
'../rules/Virulence/Virulence_report.smk'
# master command
rule Analysis_Virulence:
......@@ -16,7 +15,7 @@ rule Analysis_Virulence:
expand(
[
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
"{datadir}/{project}/Virulence_prediction_{sample}_report.csv"
"{datadir}/{project}/Virulence_prediction_{sample}_report.tsv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment