Commit 02d34030 authored by Valentina Galata
Browse files

format fix in rules/Toxin (issue #26)

parent d0581e48
...@@ -4,12 +4,12 @@ import glob ...@@ -4,12 +4,12 @@ import glob
import os import os
configfile: "config.yaml" configfile: "config.yaml"
PROJECT=config["project"] PROJECT = config["project"]
INPUT=config["input_file"] INPUT = config["input_file"]
################################# #################################
## Combine Toxin HMM and SignalP # # Combine Toxin HMM and SignalP #
################################## #################################
# Put Toxin HMM results in the correct format & join SignalP and Toxin HMM files # Put Toxin HMM results in the correct format & join SignalP and Toxin HMM files
rule R_script: rule R_script:
...@@ -21,37 +21,55 @@ rule R_script: ...@@ -21,37 +21,55 @@ rule R_script:
output: output:
gene_library=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"), gene_library=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"),
gene_toxic=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv") gene_toxic=temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv")
message: "Run external R script to join SignalP and ToxinHMM" message:
params: outdir="{OUTDIR}" "Run external R script to join SignalP and ToxinHMM"
conda: "../../envs/R.yaml" params:
script: "../../scripts/ownHMM_library.R" outdir="{OUTDIR}"
conda:
"../../envs/R.yaml"
script:
"../../scripts/ownHMM_library.R"
#Put the different files in the correct configuration #Put the different files in the correct configuration
###Gene table library ###Gene table library
rule config_library: rule config_library:
input:"{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv" input:
output: temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv") "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library.csv"
params: outdir="{OUTDIR}" output:
shell: """ temp("{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv")
params:
outdir="{OUTDIR}"
shell:
"""
sed 's/"//g' {input}| sed 's/,/#/g' > {output} sed 's/"//g' {input}| sed 's/,/#/g' > {output}
""" """
rule config_library_2: rule config_library_2:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv" input:
output: "{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_library_1.csv"
message: "Put {input} in the correct configuration: {output}" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv"
shell: """ message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
cut -f3,4,5,6,7,8,9 -d "#" {input}| sed 's/#/\t/g' >{output} cut -f3,4,5,6,7,8,9 -d "#" {input}| sed 's/#/\t/g' >{output}
""" """
#Gene table Toxic #Gene table Toxic
rule config_toxic: rule config_toxic:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv" input:
output: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_gene_table_Toxic.csv"
message: "Put {input} in the correct configuration: {output}" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
shell: """ message:
"Put {input} in the correct configuration: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
sed 's/"//g' {input} | sed 's/,/\t/g' | cut -f2,3,4,5 > {output} sed 's/"//g' {input} | sed 's/,/\t/g' | cut -f2,3,4,5 > {output}
""" """
...@@ -61,32 +79,46 @@ rule config_toxic: ...@@ -61,32 +79,46 @@ rule config_toxic:
# Prediction: Non-Pathogenic # Prediction: Non-Pathogenic
rule non_pathogenic: rule non_pathogenic:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv" input:
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
message: "The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt"
shell: """ message:
"The confidence level of Non-Pathogenic is given to query sequences which are negative for both SignalP as HMM toxin"
params:
outdir="{OUTDIR}"
shell:
"""
awk '$3 =="non-pathogenic"' {input} | awk '$5 ="-"' | sed 's/ /\t/g' > {output} awk '$3 =="non-pathogenic"' {input} | awk '$5 ="-"' | sed 's/ /\t/g' > {output}
""" """
# Prediction: confidence 1 # Prediction: confidence 1
rule confidence_1: rule confidence_1:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv" input:
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
message: "The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt"
shell: """ message:
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\t/g' > {output} "The confidence level of 1 is given to query sequences which are positives for both SignalP as HMM toxin"
""" params:
outdir="{OUTDIR}"
shell:
"""
awk '$4 =="Y" && $3 =="pathogenic"' {input}| awk '$5 ="1"' | sed 's/ /\t/g' > {output}
"""
# Prediction: confidence 2 # Prediction: confidence 2
rule confidence_2: rule confidence_2:
input: "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv" input:
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt" "{OUTDIR}/{project}/TOXIN/R_output/{input_file}_Toxin.tsv"
message: "The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt"
shell: """ message:
"The confidence level of 2 is given to query sequences which are determined pathogenic with HMM toxin yet negative for SignalP"
params:
outdir="{OUTDIR}"
shell:
"""
awk '$4 =="N" && $3 =="pathogenic"' {input}| awk '$5 ="2"' | sed 's/ /\t/g' > {output} awk '$4 =="N" && $3 =="pathogenic"' {input}| awk '$5 ="2"' | sed 's/ /\t/g' > {output}
""" """
...@@ -96,16 +128,13 @@ rule combine_confidence: ...@@ -96,16 +128,13 @@ rule combine_confidence:
Non_pathogenic="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt", Non_pathogenic="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Non-pathogenic.txt",
Confidence_1="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt", Confidence_1="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_1.txt",
Confidence_2="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt" Confidence_2="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_confidence_2.txt"
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv" output:
message: "The different confidence files ({input[0]}, {input[1]}, {input[2]}) are combined in a single output file: {output}" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv"
params: outdir="{OUTDIR}" message:
shell: """ "The different confidence files ({input[0]}, {input[1]}, {input[2]}) are combined in a single output file: {output}"
cat {input[0]} {input[1]} {input[2]} | sort > {output} params:
""" outdir="{OUTDIR}"
shell:
"""
cat {input[0]} {input[1]} {input[2]} | sort > {output}
"""
...@@ -4,56 +4,75 @@ import glob ...@@ -4,56 +4,75 @@ import glob
import os import os
configfile: "config.yaml" configfile: "config.yaml"
PROJECT=config["project"] PROJECT = config["project"]
INPUT=config["input_file"] INPUT = config["input_file"]
#HMM scan # HMM scan
rule run_HMM: rule run_HMM:
input: input:
hmm=config["hmm_file"], hmm=config["hmm_file"],
renamed="{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa" renamed="{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan" output:
message: "Run HMM scan on {input[1]} to generate {output}" "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
params: outdir="{OUTDIR}" message:
threads: 1 "Run HMM scan on {input[1]} to generate {output}"
shell: """ params:
outdir="{OUTDIR}"
threads:
1
shell:
"""
{config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input[0]} {input[1]} {config[hmmscan_tool]} --cpu {threads} --noali --notextw --tblout {output} {input[0]} {input[1]}
""" """
# Adjust HMM results to correct format # Adjust HMM results to correct format
rule HMM_correct_format: rule HMM_correct_format:
input: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan" input:
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv" "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmmscan"
message: "Adjust {input} to correct format: {output}" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv"
shell: """ message:
"Adjust {input} to correct format: {output}"
params:
outdir="{OUTDIR}"
shell:
"""
sed '/^#/ d' {input} | sed 's/ \+/\t/g' > {output} sed '/^#/ d' {input} | sed 's/ \+/\t/g' > {output}
""" """
def aggregate_hmm(wildcards): def aggregate_hmm(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv", return expand(
OUTDIR=wildcards.OUTDIR, "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}/{file_i}.hmm.csv",
project=wildcards.project, OUTDIR=wildcards.OUTDIR,
input_file=wildcards.input_file, project=wildcards.project,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i) input_file=wildcards.input_file,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
)
rule HMM_correct_format_2: rule HMM_correct_format_2:
input: aggregate_hmm input:
output: temp("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv") aggregate_hmm
params: outdir="{OUTDIR}" output:
shell: """ temp("{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv")
params:
outdir="{OUTDIR}"
shell:
"""
cut -f 1,3,5,6 {input} |uniq >{output} cut -f 1,3,5,6 {input} |uniq >{output}
""" """
rule HMM_correct_format_3: rule HMM_correct_format_3:
input: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv" input:
output: "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R.csv" "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R_temp.csv"
params: outdir="{OUTDIR}" output:
shell: """ "{OUTDIR}/{project}/TOXIN/HMM_toxin/{input_file}.Input_HMM_R.csv"
echo "#Toxin" > {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header params:
cat {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header {input} > {output} outdir="{OUTDIR}"
rm -rf {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header shell:
sed -i $'1 i\\\ Query_sequence\tHMM_Name\tSignificance_Evalue\tScore' {output} """
""" echo "#Toxin" > {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
cat {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header {input} > {output}
rm -rf {wildcards.OUTDIR}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.input_file}_header
sed -i $'1 i\\\ Query_sequence\tHMM_Name\tSignificance_Evalue\tScore' {output}
"""
...@@ -4,30 +4,34 @@ import glob ...@@ -4,30 +4,34 @@ import glob
import os import os
configfile: "config.yaml" configfile: "config.yaml"
PROJECT=config["project"] PROJECT = config["project"]
INPUT=config["input_file"] INPUT = config["input_file"]
#################################### ####################################
## Final Report # # Final Report #
##################################### ####################################
#create final report by combining all files #create final report by combining all files
rule merge_final: rule merge_final:
input: input:
translation="{OUTDIR}/{project}/renamed/{input_file}_translation.tsv", translation="{OUTDIR}/{project}/renamed/{input_file}_translation.tsv",
prediction="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv", prediction="{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_Confidence_HMM_SignalP_ensembled.csv",
output: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv" output:
params: outdir="{OUTDIR}" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
shell: """ params:
outdir="{OUTDIR}"
shell:
"""
join -t $'\t' <(sort {input[0]}) <(sort {input[1]}) >{output} join -t $'\t' <(sort {input[0]}) <(sort {input[1]}) >{output}
""" """
rule toxin_report: rule toxin_report:
input: "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv" input:
output: "{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv" "{OUTDIR}/{project}/TOXIN/Toxin_prediction/{input_file}_ToxinHMM_SignalP_translation_ensembled.csv"
params: outdir="{OUTDIR}" output:
"{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv"
params:
outdir="{OUTDIR}"
shell: shell:
"sed -i $'1 i\\\ Sequence no.\tSequence Query\tNumber of Hits\tHMM prediction\tSignalP\tConfidence level' {input};" "sed -i $'1 i\\\ Sequence no.\tSequence Query\tNumber of Hits\tHMM prediction\tSignalP\tConfidence level' {input};"
"cp {input} {output}" "cp {input} {output}"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment