Commit 4e54bc02 authored by Laura Denies's avatar Laura Denies
Browse files

updated signalp to v5.0

parent 21195b19
......@@ -2,11 +2,10 @@ pathofact:
sample: ["test_sample"] # requires user input
project: PathoFact_update_trial # requires user input
datadir: ../test_dataset # requires user input
workflow: "AMR"
workflow: "Tox"
size_fasta: 1000
scripts: "scripts"
signalp: "/path/to/signalp-4.1/signalp" # requires user input
deeparg: "submodules/deeparg-ss/deepARG.py"
signalp: "/work/projects/ecosystem_biology/local_tools/SignalP/signalp-5.0b/bin" # requires user input
deepvirfinder: "submodules/DeepVirFinder/dvf.py"
tox_hmm: "databases/toxins/combined_Toxin.hmm"
tox_lib: "databases/library_HMM_Toxins.csv"
......
......@@ -12,7 +12,7 @@ rule R_script:
input:
input_HMM="{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv",
translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
signalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.txt",
signalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
library=config["pathofact"]["tox_lib"]
output:
gene_library=temp("{datadir}/{project}/TOXIN/R_output/{sample}_gene_table_library.csv"),
......
......@@ -6,72 +6,38 @@ import os
#Run SignalP on split sequence files
rule signalp:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
"{datadir}/{project}/renamed/{sample}_ID.faa"
output:
"{datadir}/{project}/SignalP/{sample}/{file_i}.txt"
SignalP_gramP="{datadir}/{project}/SignalP/{sample}/gramp_summary.signalp5",
SignalP_gramN="{datadir}/{project}/SignalP/{sample}/gramn_summary.signalp5"
log:
"{datadir}/{project}/SignalP/{sample}/{file_i}.log"
"{datadir}/{project}/SignalP/{sample}.log"
message:
"Running SignalP analysis on {input} resulting in {output}."
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"{config[pathofact][signalp]} -t gram+ {input} > {output} 2> {log}"
#adjust format of signalP files
rule SignalP_format:
input:
"{datadir}/{project}/SignalP/{sample}/{file_i}.txt"
output:
"{datadir}/{project}/SignalP/format/{sample}/{file_i}.txt"
message:
"Apply correct format to {input} to create {output}"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed '1,2d' {input} > {output}
"""
rule signalP_modified:
input:
"{datadir}/{project}/SignalP/format/{sample}/{file_i}.txt"
output:
"{datadir}/{project}/SignalP/modified/{sample}/{file_i}.txt"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
awk '{{print $1"\\t"$10}}' {input} > {output}
export PATH={config[pathofact][signalp]}:$PATH
signalp -fasta {input} -org gram+ -prefix {wildcards.datadir}/{wildcards.project}/SignalP/{wildcards.sample}/gramp -batch {config[pathofact][size_fasta]}
signalp -fasta {input} -org gram- -prefix {wildcards.datadir}/{wildcards.project}/SignalP/{wildcards.sample}/gramn -batch {config[pathofact][size_fasta]}
"""
def aggregate_input(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{datadir}/{project}/SignalP/modified/{sample}/{file_i}.txt",
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
# Join multiple SignalP files in a single file
rule aggregate:
rule aggregate_signalP:
input:
aggregate_input
SignalP_gramP="{datadir}/{project}/SignalP/{sample}/gramp_summary.signalp5",
SignalP_gramN="{datadir}/{project}/SignalP/{sample}/gramn_summary.signalp5"
output:
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.txt"
SignalP_report="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv"
message:
"concatenate multiple split signalP files in a single joined file: {output}"
log:
"{datadir}/{project}/SignalP/{sample}_SignalP_temp.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} > {output}"
conda: "../../envs/R.yaml"
script: "../../scripts/SignalP.R"
#!/usr/bin/env R
# logging
sink(file=file(snakemake@log[[1]], open="wt"), type="message")
library(tidyverse)
V5_p <- read.delim(file = snakemake@input[["SignalP_gramP"]], header=FALSE, comment.char="#")
V5_p$V5_p_SignalP <- ifelse(V5_p$V2 == "OTHER", "N","Y")
V5_p <- V5_p %>% select(1,2,8)
colnames(V5_p)<- c("ID","V5_p_Prediction","V5_p_SignalP")
V5_n <- read.delim(file = snakemake@input[["SignalP_gramN"]], header=FALSE, comment.char="#")
V5_n$V5_n_SignalP <- ifelse(V5_n$V2 == "OTHER", "N","Y")
V5_n <- V5_n %>% select(1,2,8)
colnames(V5_n)<- c("ID","V5_n_Prediction","V5_n_SignalP")
SignalP <- full_join(V5_p, V5_n)
SignalP$SignalPeptide <- ifelse(SignalP$V5_p_SignalP == "Y" | SignalP$V5_n_SignalP =="Y", "Y","N")
SignalP$Organism <- ifelse(SignalP$V5_p_SignalP == "Y" & SignalP$V5_n_SignalP == "Y", "gram+ or gram-",
ifelse(SignalP$V5_p_SignalP == "Y" & SignalP$V5_n_SignalP == "N", "gram+",
ifelse(SignalP$V5_p_SignalP == "N" & SignalP$V5_n_SignalP == "Y", "gram-","-")))
SignalP$compare <- as.character(SignalP$V5_p_Prediction) == as.character(SignalP$V5_n_Prediction)
SignalP$SignalP_prediction <- ifelse(SignalP$Organism == "gram+ or gram-" & SignalP$compare == "FALSE",paste(SignalP$V5_p_Prediction, SignalP$V5_n_Prediction, sep = " / "),
ifelse(SignalP$SignalPeptide == "Y" & SignalP$compare == "TRUE", as.character(SignalP$V5_p_Prediction), "-"))
SignalP <- SignalP %>% select(1,6,9,7)
write.table(SignalP, file = snakemake@output[["SignalP_report"]], sep="\t", row.names=FALSE, quote=FALSE)
......@@ -11,12 +11,9 @@ sink(file=file(snakemake@log[[1]], open="wt"), type="message")
gene_table_HMM<-read.delim(file = snakemake@input[["input_HMM"]], comment.char="#")
translation<-read.delim(file = snakemake@input[["translation"]], header=FALSE)
signalP<-read.table(file = snakemake@input[["signalP"]], sep= "\t")
signalP<-read.table(file = snakemake@input[["signalP"]], sep= "\t", header=TRUE)
gene_table_HMM<-gene_table_HMM[,c(1,2,4,3)]
colnames(signalP)<-c("Query_sequence","SignalP")
colnames(translation)<-c("ID","Query_sequence")
translation<-translation[,c(2,1)]
......@@ -52,7 +49,7 @@ gene_table_Toxic$Prediction<-gsub(1,"pathogenic",gene_table_Toxic$Prediction)
gene_table_Toxic<-as.data.frame(gene_table_Toxic)
#Merge SignalP and Toxic HMM
gene_table_Toxic<-merge.data.frame(gene_table_Toxic,signalP,by="Query_sequence")
gene_table_Toxic<-merge.data.frame(gene_table_Toxic,signalP,by.x="Query_sequence", by.y="ID")
gene_table_Toxic$Query_sequence<-sprintf("%010d",as.numeric(gene_table_Toxic$Query_sequence))
write.csv(gene_table_Toxic,file = snakemake@output[["gene_toxic"]])
......@@ -15,7 +15,7 @@ rule Analysis:
input:
expand(
[
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.txt",
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv"
],
......
......@@ -15,7 +15,7 @@ rule Analysis_Virulence:
input:
expand(
[
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.txt",
"{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
"{datadir}/{project}/Virulence_prediction_{sample}_report.csv"
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment