Gitlab is now using https://gitlab.lcsb.uni.lu as it's primary address. Please update your bookmarks. FAQ.

Commit 850f0c34 authored by Laura Denies's avatar Laura Denies
Browse files

update Toxin_report with SignalP v5.0 results

parent 4e54bc02
......@@ -22,7 +22,7 @@ elif config["pathofact"]["workflow"] == "Tox":
input:
expand(
[
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv",
"{datadir}/{project}/Toxin_prediction_{sample}_report.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
......
......@@ -15,12 +15,12 @@ rule R_script:
signalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
library=config["pathofact"]["tox_lib"]
output:
gene_library=temp("{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.csv"),
gene_toxic=temp("{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_Toxic.csv")
gene_library="{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
gene_toxic="{datadir}/{project}/Toxin_prediction_{sample}_report.tsv"
log:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.log"
message:
"Run external R script to join SignalP and ToxinHMM"
"Run external R script to join SignalP and ToxinHMM and create Toxin report (incl. confidence levels)"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
......@@ -30,125 +30,3 @@ rule R_script:
script:
"../../scripts/ownHMM_library.R"
#Put the different files in the correct configuration
###Gene table library
rule config_library:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.csv"
output:
temp("{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library_1.csv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input}| sed 's/,/#/g' > {output}
"""
rule config_library_2:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library_1.csv"
output:
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
cut -f3,4,5,6,7,8,9 -d "#" {input} | sed 's/#/\\t/g' > {output}
"""
#Gene table Toxic
rule config_toxic:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_Toxic.csv"
output:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' | cut -f2,3,4,5 > {output}
"""
###########################################################
### Insert a confidence level for the different predictions #
#############################################################
# Prediction: Non-Pathogenic
rule non_pathogenic:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Non-pathogenic.txt"
message:
"The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$3 =="non-pathogenic"' {input} | awk '$5 ="-"' | sed 's/ /\\t/g' > {output}
"""
# Prediction: confidence 1
rule confidence_1:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_1.txt"
message:
"The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\\t/g' > {output}
"""
# Prediction: confidence 2
rule confidence_2:
input:
"{datadir}/{project}/TOXIN/R_output/{sample}_Toxin.tsv"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_2.txt"
message:
"The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '$4 =="N" && $3 =="pathogenic"' {input} | awk '$5 ="2"' | sed 's/ /\\t/g' > {output}
"""
# Combine different prediction files
rule combine_confidence:
input:
Non_pathogenic="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Non-pathogenic.txt",
Confidence_1="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_1.txt",
Confidence_2="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_confidence_2.txt"
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Confidence_HMM_SignalP_ensembled.csv"
message:
"The different confidence files ({input}) are combined in a single output file: {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
cat {input} | sort > {output}
"""
#Toxin
import glob
import os
####################################
# Final Report #
####################################
#create final report by combining all files
rule merge_final:
input:
translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
prediction="{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_Confidence_HMM_SignalP_ensembled.csv",
output:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_ToxinHMM_SignalP_translation_ensembled.csv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
join -t $'\\t' <(sort {input.translation}) <(sort {input.prediction}) > {output}
"""
rule toxin_report:
input:
"{datadir}/{project}/TOXIN/Toxin_prediction/{sample}_ToxinHMM_SignalP_translation_ensembled.csv"
output:
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"sed -i $'1 i\\\ Sequence no.\\tSequence Query\\tNumber of Hits\\tHMM prediction\\tSignalP\\tConfidence level' {input};"
"cp {input} {output}"
......@@ -3,53 +3,48 @@
# logging
sink(file=file(snakemake@log[[1]], open="wt"), type="message")
#setwd("~/Desktop/own_HMM_library")
####################################
### Input of the different files ###
####################################
library(tidyverse)
# Input
## Toxin prediction
gene_table_HMM<-read.delim(file = snakemake@input[["input_HMM"]], comment.char="#")
translation<-read.delim(file = snakemake@input[["translation"]], header=FALSE)
gene_table_HMM <- gene_table_HMM[,c(1,2,4,3)]
colnames(gene_table_HMM) <- c("ORF_ID","HMM_Name","Score","Significance_evalue")
gene_table_HMM$ORF_ID<-sprintf("%010d",as.numeric(gene_table_HMM$ORF_ID))
## ORF translation
ORF_translation<-read.delim(file = snakemake@input[["translation"]], header=FALSE)
colnames(ORF_translation)<-c("ORF_ID","ORF")
ORF_translation$ORF<- sub('>', '', ORF_translation$ORF)
ORF_translation<-ORF_translation[,c(2,1)]
ORF_translation$ORF_ID<-sprintf("%010d",as.numeric(ORF_translation$ORF_ID))
## SignalP prediction
signalP<-read.table(file = snakemake@input[["signalP"]], sep= "\t", header=TRUE)
colnames(signalP)<- c("ORF_ID","Signal_peptide","Signal_prediction","Organism")
signalP$ORF_ID<-sprintf("%010d",as.numeric(signalP$ORF_ID))
gene_table_HMM<-gene_table_HMM[,c(1,2,4,3)]
colnames(translation)<-c("ID","Query_sequence")
translation<-translation[,c(2,1)]
#Description file of the HMM
## Gene library
library_HMM<-read.table(file = snakemake@input[["library"]], sep = ";", header = T, quote = '#')
colnames(library_HMM)<-c("HMM_ID","NAME","Alternative_name", "Database", "Description")
gene_table_library<-merge.data.frame(gene_table_HMM, library_HMM, by.x= "HMM_Name", by.y= "HMM_ID")
gene_table_library<-gene_table_library[,c(2,1,3,4,5,6,7,8)]
gene_table_library<-merge.data.frame(translation, gene_table_library, by.x = "ID", by.y = "Query_sequence")
gene_table_library$ID<-sprintf("%010d",as.numeric(gene_table_library$ID))
write.csv(gene_table_library,file = snakemake@output[["gene_library"]])
#Toxin/Non-Toxin file
frequency<-data.frame(table(gene_table_HMM[1]))
prediction<-data.frame(rep(1,length(table(gene_table_HMM[1]))))
table_pathos_freq<-cbind.data.frame(frequency,prediction)
colnames(table_pathos_freq)<-c("Query_sequence","Number_of_Hits","Prediction")
protein_number<-nrow(translation)
number<-cbind(c(seq(1,protein_number,1)),c(seq(1,protein_number,1)))
gene_table_Toxic<-merge.data.frame(table_pathos_freq,number,by.x = "Query_sequence",by.y = "V1", all.y = T)
gene_table_Toxic$Query_sequence=NULL
gene_table_Toxic<-gene_table_Toxic[,c(3,1,2)]
colnames(gene_table_Toxic)<-c("Query_sequence","Number_of_Hits","Prediction")
gene_table_Toxic[c("Number_of_Hits","Prediction")][is.na(gene_table_Toxic[c("Number_of_Hits","Prediction")])]<-0
gene_table_Toxic$Prediction<-gsub(0,"non-pathogenic",gene_table_Toxic$Prediction)
gene_table_Toxic$Prediction<-gsub(1,"pathogenic",gene_table_Toxic$Prediction)
gene_table_Toxic<-as.data.frame(gene_table_Toxic)
#Merge SignalP and Toxic HMM
gene_table_Toxic<-merge.data.frame(gene_table_Toxic,signalP,by.x="Query_sequence", by.y="ID")
gene_table_Toxic$Query_sequence<-sprintf("%010d",as.numeric(gene_table_Toxic$Query_sequence))
write.csv(gene_table_Toxic,file = snakemake@output[["gene_toxic"]])
#Description file of the HMM
gene_table_library<-merge.data.frame(gene_table_HMM, library_HMM, by.x= "HMM_Name", by.y= "HMM_ID", all.x=TRUE)
gene_table_library<-gene_table_library[,c(2,1,3,4,5,6,7,8)]
gene_table_library<-merge.data.frame(ORF_translation, gene_table_library, by = "ORF_ID")
write.table(gene_table_library,file = snakemake@output[["gene_library"]], sep="\t", row.names=FALSE, quote=FALSE)
# Toxin prediction
gene_table_Toxic <- data.frame(table(gene_table_HMM[1]))
colnames(gene_table_Toxic) <- c("ORF_ID","Number_of_hits")
gene_table_Toxic$Toxin_prediction <- "pathogenic"
gene_table_Toxic <- left_join(ORF_translation, gene_table_Toxic, by="ORF_ID")
gene_table_Toxic$Toxin_prediction <- fct_explicit_na(gene_table_Toxic$Toxin_prediction, na_level = "non_pathogenic")
gene_table_Toxic[is.na(gene_table_Toxic)] <-0
gene_table_Toxic <- full_join(gene_table_Toxic, signalP, by="ORF_ID")
gene_table_Toxic$Toxin_confidence_level <- ifelse(gene_table_Toxic$Toxin_prediction == "pathogenic" & gene_table_Toxic$Signal_peptide == "Y", "1: Secreted Toxin",
ifelse(gene_table_Toxic$Toxin_prediction == "pathogenic" & gene_table_Toxic$Signal_peptide == "N", "2: Non-secreted Toxin", "-"))
gene_table_Toxic <- gene_table_Toxic %>% select(1,2,3,4,5,8)
write.table(gene_table_Toxic, file=snakemake@output[["gene_toxic"]], sep="\t", row.names=FALSE, quote=FALSE)
......@@ -7,8 +7,6 @@ include:
'../rules/Toxin/Toxin.smk'
include:
'../rules/Toxin/Combine_Toxin_SignalP.smk'
include:
'../rules/Toxin/Toxin_report.smk'
# master command
rule Analysis:
......@@ -17,7 +15,7 @@ rule Analysis:
[
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv"
"{datadir}/{project}/Toxin_prediction_{sample}_report.tsv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment