Commit 02d34030 authored by Valentina Galata's avatar Valentina Galata
Browse files

format fix in rules/Toxin (issue #26)

parent d0581e48
......@@ -4,12 +4,12 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
#################################
## Combine Toxin HMM and SignalP #
##################################
# Combine Toxin HMM and SignalP #
#################################
# Put Toxin HMM results in the correct format & join SignalP and Toxin HMM files
rule R_script:
......@@ -21,37 +21,55 @@ rule R_script:
output:
gene_library=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"),
gene_toxic=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv")
message: "Run external R script to join SignalP and ToxinHMM"
params: outdir="{OUTDIR}"
conda: "../../envs/R.yaml"
script: "../../scripts/ownHMM_library.R"
message:
"Run external R script to join SignalP and ToxinHMM"
params:
outdir="{OUTDIR}"
conda:
"../../envs/R.yaml"
script:
"../../scripts/ownHMM_library.R"
#Put the different files in the correct configuration
###Gene table library
rule config_library:
input:"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"
output: temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv")
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"
output:
temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv")
params:
outdir="{OUTDIR}"
shell:
"""
sed 's/"//g' {input}| sed 's/,/#/g' > {output}
"""
rule config_library_2:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv"
output: "{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv"
message: "Put {input} in the correct configuration: {output}"
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv"
output:
"{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
cut -f3,4,5,6,7,8,9 -d "#" {input}| sed 's/#/\t/g' >{output}
"""
#Gene table Toxic
rule config_toxic:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv"
output: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
message: "Put {input} in the correct configuration: {output}"
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv"
output:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
sed 's/"//g' {input} | sed 's/,/\t/g' | cut -f2,3,4,5 > {output}
"""
......@@ -61,32 +79,46 @@ rule config_toxic:
# Prediction: Non-Pathogenic
rule non_pathogenic:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt"
message: "The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin"
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt"
message:
"The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin"
params:
outdir="{OUTDIR}"
shell:
"""
awk '$3 =="non-pathogenic"' {input} | awk '$5 ="-"' | sed 's/ /\t/g' > {output}
"""
# Prediction: confidence 1
rule confidence_1:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt"
message: "The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin"
params: outdir="{OUTDIR}"
shell: """
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\t/g' > {output}
"""
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt"
message:
"The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin"
params:
outdir="{OUTDIR}"
shell:
"""
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\t/g' > {output}
"""
# Prediction: confidence 2
rule confidence_2:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt"
message: "The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP"
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt"
message:
"The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP"
params:
outdir="{OUTDIR}"
shell:
"""
awk '$4 =="N" && $3 =="pathogenic"' {input}| awk '$5 ="2"' | sed 's/ /\t/g' > {output}
"""
......@@ -96,16 +128,13 @@ rule combine_confidence:
Non_pathogenic="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt",
Confidence_1="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt",
Confidence_2="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt"
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv"
message: "The different confidence files ({input[0]}, {input[1]}, {input[2]}) are combined in a single output file: {output}"
params: outdir="{OUTDIR}"
shell: """
cat {input[0]} {input[1]} {input[2]} | sort > {output}
"""
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv"
message:
"The different confidence files ({input[0]}, {input[1]}, {input[2]}) are combined in a single output file: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
cat {input[0]} {input[1]} {input[2]} | sort > {output}
"""
......@@ -4,56 +4,75 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
#HMM scan
# HMM scan
rule run_HMM:
input:
hmm=config["hmm_file"],
renamed="{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
message: "Run HMM scan on {input[1]} to generate {output}"
params: outdir="{OUTDIR}"
threads: 1
shell: """
output:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
message:
"Run HMM scan on {input[1]} to generate {output}"
params:
outdir="{OUTDIR}"
threads:
1
shell:
"""
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input[0]} {input[1]}
"""
# Adjust HMM results to correct format
rule HMM_correct_format:
input: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv"
message: "Adjust {input} to correct format: {output}"
params: outdir="{OUTDIR}"
shell: """
input:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
output:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv"
message:
"Adjust {input} to correct format: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
sed '/^#/ d' {input} | sed 's/ \+/\t/g' > {output}
"""
def aggregate_hmm(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv",
OUTDIR=wildcards.OUTDIR,
project=wildcards.project,
input_file=wildcards.input_file,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i)
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv",
OUTDIR=wildcards.OUTDIR,
project=wildcards.project,
input_file=wildcards.input_file,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
)
rule HMM_correct_format_2:
input: aggregate_hmm
output: temp("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv")
params: outdir="{OUTDIR}"
shell: """
input:
aggregate_hmm
output:
temp("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv")
params:
outdir="{OUTDIR}"
shell:
"""
cut -f 1,3,5,6 {input} |uniq >{output}
"""
"""
rule HMM_correct_format_3:
input: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv"
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R.csv"
params: outdir="{OUTDIR}"
shell: """
echo "#Toxin" > {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
cat {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header {input} > {output}
rm -rf {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
sed -i $'1 i\\\ Query_sequence\tHMM_Name\tSignificance_Evalue\tScore' {output}
"""
input:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv"
output:
"{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R.csv"
params:
outdir="{OUTDIR}"
shell:
"""
echo "#Toxin" > {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
cat {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header {input} > {output}
rm -rf {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
sed -i $'1 i\\\ Query_sequence\tHMM_Name\tSignificance_Evalue\tScore' {output}
"""
......@@ -4,30 +4,34 @@ import glob
import os
configfile: "config.yaml"
PROJECT=config["project"]
INPUT=config["input_file"]
PROJECT = config["project"]
INPUT = config["input_file"]
####################################
## Final Report #
#####################################
# Final Report #
####################################
#create final report by combining all files
rule merge_final:
input:
translation="{OUTDIR}/{project}/renamed/{input_file}_translation.tsv",
prediction="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv",
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
params: outdir="{OUTDIR}"
shell: """
output:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
params:
outdir="{OUTDIR}"
shell:
"""
join -t $'\t' <(sort {input[0]}) <(sort {input[1]}) >{output}
"""
rule toxin_report:
input: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
output: "{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv"
params: outdir="{OUTDIR}"
input:
"{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
output:
"{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv"
params:
outdir="{OUTDIR}"
shell:
"sed -i $'1 i\\\ Sequence no.\tSequence Query\tNumber of Hits\tHMM prediction\tSignalP\tConfidence level' {input};"
"cp {input} {output}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment