
Commit 660212ac authored by Laura Denies

streamline pipeline

parent 8d6b6ddd
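
The central change in this commit: the data directory is no longer threaded through every rule as a {datadir} wildcard. Instead the Snakefile reads it once into a module-level DATA_DIR constant and builds paths with os.path.join, and shell commands reference {config[pathofact][datadir]} rather than {wildcards.datadir}. A minimal sketch of the before/after pattern (plain Python, illustrative values only, not the repo's full config):

import os

# Stand-in for the values loaded via `configfile: "config.yaml"`.
config = {"pathofact": {"datadir": "../test_dataset"}}

# Old pattern: the data directory is itself a wildcard in the path template,
# so every rule and shell command has to carry {datadir} along.
old_template = "{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv"

# New pattern: the data directory is resolved once, up front; only {project}
# and {sample} remain as Snakemake wildcards in the joined template.
DATA_DIR = config["pathofact"]["datadir"]
new_template = os.path.join(DATA_DIR, "{project}/PathoFact_intermediate/AMR/{sample}_AMR_prediction.tsv")

print(new_template)
# ../test_dataset/{project}/PathoFact_intermediate/AMR/{sample}_AMR_prediction.tsv
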
#Snakefile
configfile: "config.yaml"
DATA_DIR=config["pathofact"]["datadir"]
if config["pathofact"]["workflow"] == "complete":
include:
@@ -9,12 +10,12 @@ if config["pathofact"]["workflow"] == "complete":
input:
expand(
[
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/PathoFact_{sample}_predictions.tsv",
"{datadir}/{project}/logs/{sample}_compressed.zip"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_gene_library_{sample}_report.tsv"),
os.path.join(DATA_DIR,"{project}/PathoFact_report/PathoFact_{sample}_predictions.tsv"),
os.path.join(DATA_DIR,"{project}/logs/{sample}_compressed.zip")
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["pathofact"]["workflow"] == "Tox":
include:
@@ -23,11 +24,11 @@ elif config["pathofact"]["workflow"] == "Tox":
input:
expand(
[
"{datadir}/{project}/Toxin_prediction_{sample}_report.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/logs/Tox_{sample}_compressed.zip"
os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_prediction_{sample}_report.tsv"),
os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_gene_library_{sample}_report.tsv"),
os.path.join(DATA_DIR,"{project}/logs/Tox_{sample}_compressed.zip")
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["pathofact"]["workflow"] == "Vir":
include:
@@ -36,10 +37,10 @@ elif config["pathofact"]["workflow"] == "Vir":
input:
expand(
[
"{datadir}/{project}/Virulence_prediction_{sample}_report.tsv",
"{datadir}/{project}/logs/VF_{sample}_compressed.zip"
os.path.join(DATA_DIR,"{project}/PathoFact_report/Virulence_prediction_{sample}_report.tsv"),
os.path.join(DATA_DIR,"{project}/logs/VF_{sample}_compressed.zip")
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["pathofact"]["workflow"] == "AMR":
include:
@@ -48,11 +49,11 @@ elif config["pathofact"]["workflow"] == "AMR":
input:
expand(
[
"{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv",
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv",
"{datadir}/{project}/logs/AMR_{sample}_compressed.zip"
os.path.join(DATA_DIR,"{project}/PathoFact_report/AMR_MGE_prediction_{sample}_report.tsv"),
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
os.path.join(DATA_DIR,"{project}/logs/AMR_{sample}_compressed.zip")
],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
else:
raise Exception("Unknown workflow option: %s" % config["pathofact"]["workflow"])
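
Because DATA_DIR is already substituted into the templates, the expand() calls above now only pass project= and sample=; the former datadir= argument is gone. A rough illustration of what one target list evaluates to, using plain Python to mimic Snakemake's expand() and the pre-change test values (Lim_PathoFact / test_2) purely as examples:

import os
from itertools import product

DATA_DIR = "../test_dataset"  # stand-in for config["pathofact"]["datadir"]

templates = [
    os.path.join(DATA_DIR, "{project}/PathoFact_report/Toxin_prediction_{sample}_report.tsv"),
    os.path.join(DATA_DIR, "{project}/logs/Tox_{sample}_compressed.zip"),
]

def expand_like(templates, **wildcards):
    # Simplified stand-in for snakemake's expand(): fill every combination
    # of the remaining wildcards (all values must be lists here).
    keys, values = zip(*wildcards.items())
    return [
        template.format(**dict(zip(keys, combo)))
        for template in templates
        for combo in product(*values)
    ]

print(expand_like(templates, project=["Lim_PathoFact"], sample=["test_2"]))
# ['../test_dataset/Lim_PathoFact/PathoFact_report/Toxin_prediction_test_2_report.tsv',
#  '../test_dataset/Lim_PathoFact/logs/Tox_test_2_compressed.zip']
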
pathofact:
sample: ["test_2"] # requires user input
project: Lim_PathoFact # requires user input
datadir: ../test_dataset # requires user input
sample: ["test_sample"] # requires user input
project: Final_test # requires user input
datadir: /work/projects/ecosystem_biology/local_tools/PathoFact_update/test_dataset # requires user input
workflow: "complete"
size_fasta: 100
size_fasta: 10000
scripts: "scripts"
signalp: "/work/projects/ecosystem_biology/local_tools/SignalP/signalp-5.0b/bin" # requires user input
deepvirfinder: "submodules/DeepVirFinder/dvf.py"
@@ -10,13 +10,12 @@ import os
## deepARG:
rule run_deepARG:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
os.path.join(DATA_DIR,"{project}/splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}.out.mapping.ARG.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.out.mapping.ARG.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
@@ -24,14 +23,13 @@ rule run_deepARG:
message: "executing deep-arg on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"""
deeparg predict --model LS --model-version v2 --type prot -d {config[pathofact][scripts]}/deeparg_data/deepARG --input {input} --out {wildcards.datadir}/{wildcards.project}/AMR/deepARG_results/{wildcards.sample}/{wildcards.file_i}.out &> {log}
deeparg predict --model LS --model-version v2 --type prot -d {config[pathofact][scripts]}/deeparg_data/deepARG --input {input} --out {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/deepARG_results/{wildcards.sample}/{wildcards.file_i}.out &> {log}
"""
def aggregate_AMR(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -41,41 +39,38 @@ rule aggregate_deepARG:
input:
aggregate_AMR
output:
"{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}.out.mapping.ARG")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "aggregating deep-arg results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"""
cat {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/AMR/deepARG_results/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/deepARG_results/{wildcards.sample}
"""
# RGI
rule run_RGI:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
os.path.join(DATA_DIR,"{project}/splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/AMR/RGI_results/{sample}/{file_i}.RGI.txt")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}/{file_i}.RGI.txt"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}.RGI.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.RGI.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
"../../envs/rgi.yaml"
message: "executing RGI on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"rgi main --input_sequence {input} --output_file {wildcards.datadir}/{wildcards.project}/AMR/RGI_results/{wildcards.sample}/{wildcards.file_i}.RGI --input_type protein --local --clean &> {log}"
"rgi main --input_sequence {input} --output_file {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/RGI_results/{wildcards.sample}/{wildcards.file_i}.RGI --input_type protein --local --clean &> {log}"
def aggregate_RGI(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{datadir}/{project}/AMR/RGI_results/{sample}/{file_i}.RGI.txt",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}/{file_i}.RGI.txt"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -85,31 +80,29 @@ rule aggregate_RGI:
input:
aggregate_RGI
output:
"{datadir}/{project}/AMR/RGI_results/{sample}.RGI.txt"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}.RGI.txt")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate RGI results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"""
cat {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/AMR/RGI_results/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/RGI_results/{wildcards.sample}
"""
# Combine DeepARG and RGI results
rule combine_AMR:
input:
RGI="{datadir}/{project}/AMR/RGI_results/{sample}.RGI.txt",
DeepARG="{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG",
RGI=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}.RGI.txt"),
DeepARG=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}.out.mapping.ARG"),
AMR_index= "scripts/PathoFact_AMR_index.tsv"
output:
AMR_combined="{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv"
AMR_combined=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_prediction.tsv")
log:
"{datadir}/{project}/logs/{sample}/combine_AMR_temp.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/combine_AMR_temp.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda: "../../envs/R.yaml"
@@ -9,21 +9,20 @@ import os
rule combine_AMR_plasmid:
input:
ORF_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Contig_ORFs="{datadir}/{project}/Prodigal/{sample}.contig",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
AMR="{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv",
PlasFlow="{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv",
MOB_suite="{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv",
DeepVirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
ORF_translation=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_translation.tsv"),
Contig_ORFs=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/Prodigal/{sample}.contig"),
Contig_translation=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_translation.tsv"),
AMR=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_prediction.tsv"),
PlasFlow=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv"),
MOB_suite=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv"),
DeepVirFinder=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VirFinder_aggregated.csv"),
VirSorter=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VIRSorter_aggregated.csv")
output:
Report_1="{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv",
Report_2="{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv"
Report_1=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
Report_2=os.path.join(DATA_DIR,"{project}/PathoFact_report/AMR_MGE_prediction_{sample}_report.tsv")
log:
"{datadir}/{project}/logs/{sample}/MGE_AMR_prediction_detail_temp.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/MGE_AMR_prediction_detail_temp.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
@@ -11,13 +11,12 @@ import os
rule run_VirSorter:
input:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_ID.fna")
output:
temp("{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv"))
log:
"{datadir}/{project}/logs/{sample}/VIRSorter_global-phage-signal.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/VIRSorter_global-phage-signal.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda:
@@ -27,17 +26,15 @@ rule run_VirSorter:
message: "Executing VirSorter with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"""
wrapper_phage_contigs_sorter_iPlant.pl -f {input} --ncpu {threads} --wdir {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virsorter --data-dir {config[pathofact][scripts]}/virsorter-data &> {log}
wrapper_phage_contigs_sorter_iPlant.pl -f {input} --ncpu {threads} --wdir {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/phage/{wildcards.sample}/virsorter --data-dir {config[pathofact][scripts]}/virsorter-data &> {log}
"""
localrules: aggregate_VirSorter
rule aggregate_VirSorter:
input:
"{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv")
output:
"{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
params:
outdir="{datadir}"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VIRSorter_aggregated.csv")
message: "VirSorter failsave for empty files: {wildcards.project} - {wildcards.sample}"
shell:
"""
@@ -47,34 +44,31 @@ rule aggregate_VirSorter:
else
mv {input} {output}
fi
rm -rf {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virsorter
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/phage/{wildcards.sample}/virsorter
"""
# VIRFINDER Prediction
rule run_VirFinder:
input:
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/contig_splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}.fasta_gt1bp_dvfpred.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.fasta_gt1bp_dvfpred.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda:
"../../envs/DeepVirFinder.yaml"
threads:
config["pathofact"]["mem"]["big_mem_cores"]
threads: config["pathofact"]["mem"]["big_mem_cores"]
message: "Executing Deep-VirFinder with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"python {config[pathofact][deepvirfinder]} -i {input} -o {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virfinder -c {threads} &> {log}"
"python {config[pathofact][deepvirfinder]} -i {input} -o {config[pathofact][datadir]}/{wildcards.project}/MGE/PathoFact_intermediate/phage/{wildcards.sample}/virfinder -c {threads} &> {log}"
def aggregate_VirFinder(wildcards):
checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split
return expand(
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -84,9 +78,8 @@ rule aggregate_VirFinder:
input:
aggregate_VirFinder
output:
"{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VirFinder_aggregated.csv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate VirFinder predictions on the following sample(s): {wildcards.project} - {wildcards.sample}"
@@ -10,15 +10,14 @@ import os
# PlasFlow Preprocessing
rule filter_seq:
input:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_ID.fna")
output:
temp("{datadir}/{project}/plasmid/{sample}_filtered.fna")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/plasmid/{sample}_filtered.fna"))
log:
"{datadir}/{project}/logs/{sample}/plasmid_filtered.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/plasmid_filtered.log")
conda:
"../../envs/Biopython.yaml"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
minlen=config["pathofact"]["plasflow_minlen"]
@@ -28,11 +27,10 @@ rule filter_seq:
checkpoint splitplasmid:
input:
"{datadir}/{project}/plasmid/{sample}_filtered.fna"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/plasmid/{sample}_filtered.fna")
output:
split=temp(directory("{datadir}/{project}/MGE/plasmid_splitted/{sample}/"))
split=temp(directory(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid_splitted/{sample}/")))
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
@@ -40,21 +38,20 @@ checkpoint splitplasmid:
"../../envs/Biopython.yaml"
shell:
"""
python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample}
python {config[pathofact][scripts]}/split.py {input} {params.split} {output}
"""
# PlasFlow Plasmid prediction
rule run_PLASMID:
input:
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fasta"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid_splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}_plasflow_prediction.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}_plasflow_prediction.log")
conda:
"../../envs/PlasFlow.yaml"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"],
threshold=config["pathofact"]["plasflow_threshold"]
@@ -67,8 +64,7 @@ rule run_PLASMID:
def aggregate_plasmid_input(wildcards):
checkpoint_output= checkpoints.splitplasmid.get(**wildcards).output.split
return expand(
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -78,25 +74,23 @@ rule Plasmid_aggregate:
input:
aggregate_plasmid_input
output:
temp("{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv"))
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate PlasFlow results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell:
"""
cat {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/MGE/plasmid/PlasFlow/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{wildcards.sample}
"""
rule select:
input:
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv")
output:
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
@@ -105,26 +99,22 @@ rule select:
"""
rule run_MOBsuite:
input: "{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta"
input: os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/contig_splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}_MOB_suite_prediction.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"]
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}_MOB_suite_prediction.log")
threads:
config["pathofact"]["mem"]["big_mem_cores"]
conda:
"../../envs/MOB_suite.yaml"
message: "Executing MOB_suite with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: "mob_typer --multi --infile {input} --out_file {wildcards.datadir}/{wildcards.project}/MGE/plasmid/MOB_suite/{wildcards.sample}/{wildcards.file_i}_MOB_suite_prediction.txt -n {threads} &> {log}"
shell: "mob_typer --multi --infile {input} --out_file {output} -n {threads} &> {log}"
def aggregate_MOBsuite(wildcards):
checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split
return expand(
"{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -133,9 +123,8 @@ def aggregate_MOBsuite(wildcards):
rule aggregate_MOBsuite:
input: aggregate_MOBsuite
output:
"{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate MOB_suite results on the following sample(s): {wildcards.project} - {wildcards.sample}"
@@ -10,17 +10,16 @@ import os
# Put Toxin HMM results in the correct format & join SignalP and Toxin HMM files
rule R_script:
input:
input_HMM="{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv",
translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
signalP="{datadir}/{project}/SignalP/aggregated/{sample}_SignalP_results.tsv",
input_HMM=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv"),
translation=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_translation.tsv"),
signalP=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/SignalP/aggregated/{sample}_SignalP_results.tsv"),
library=config["pathofact"]["tox_lib"]
output:
gene_library="{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
gene_toxic="{datadir}/{project}/Toxin_prediction_{sample}_report.tsv"
gene_library=os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_gene_library_{sample}_report.tsv"),
gene_toxic=os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_prediction_{sample}_report.tsv")
log:
"{datadir}/{project}/logs/{sample}/gene_table_library.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/gene_table_library.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Generate Toxin report on the following sample(s): {wildcards.project} - {wildcards.sample}"
@@ -7,13 +7,12 @@ import os
rule run_HMM_tox:
input:
hmm=config["pathofact"]["tox_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
renamed=os.path.join(DATA_DIR,"{project}/splitted/{sample}/{file_i}.fasta")
output:
temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"))
log:
"{datadir}/{project}/logs/{sample}/{file_i}.log"
os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
@@ -29,11 +28,10 @@ rule run_HMM_tox:
# Adjust HMM results to correct format
rule HMM_correct_format:
input:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan")
output:
temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv"))
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Adjust output format of toxin predictions: {wildcards.project} - {wildcards.sample}"
@@ -45,8 +43,7 @@ rule HMM_correct_format:
def aggregate_hmm(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv",
datadir=wildcards.datadir,
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}/{file_i}.hmm.csv"),
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
@@ -56,9 +53,8 @@ rule HMM_correct_format_2:
input:
aggregate_hmm
output:
temp("{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv")
temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv"))
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate toxin prediction of the following sample(s): {wildcards.project} - {wildcards.sample}"
@@ -69,17 +65,16 @@ rule HMM_correct_format_2:
rule HMM_correct_format_3:
input:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}.Input_HMM_R_temp.csv")
output:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv"
os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/TOXIN/HMM_toxin/{sample}.Input_HMM_R.csv")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
echo "#Toxin" > {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
cat {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/TOXIN/HMM_toxin/{wildcards.sample}_header
echo "#Toxin" > {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/TOXIN/HMM_toxin/{wildcards.sample}_header
cat {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/TOXIN/HMM_toxin/{wildcards.sample}_header {input} > {output}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/TOXIN/HMM_toxin/{wildcards.sample}_header
sed -i $'1 i\\\ Query_sequence\\tHMM_Name\\tSignificance_Evalue\\tScore' {output}
"""
@@ -3,14 +3,14 @@ import glob
import os
rule clean_all:
input: "{datadir}/{project}/PathoFact_{sample}_predictions.tsv"
output: "{datadir}/{project}/logs/{sample}_compressed.zip"
input: os.path.join(DATA_DIR,"{project}/PathoFact_report/PathoFact_{sample}_predictions.tsv")
output: os.path.join(DATA_DIR,"{project}/logs/{sample}_compressed.zip")
shell: """
zip -rm {output} {wildcards.datadir}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/SignalP/splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample}
find {wildcards.datadir}/{wildcards.project} -type d -empty -delete
zip -rm {output} {config[pathofact][datadir]}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/contig_splitted/{wildcards.sample}_dir
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/SignalP/splitted/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/plasmid_splitted/{wildcards.sample}
find {config[pathofact][datadir]}/{wildcards.project} -type d -empty -delete
"""
@@ -3,32 +3,32 @@ import glob
import os
rule clean_Toxin_workflow:
input: "{datadir}/{project}/Toxin_prediction_{sample}_report.tsv"
output: "{datadir}/{project}/logs/Tox_{sample}_compressed.zip"
input: os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_prediction_{sample}_report.tsv")
output: os.path.join(DATA_DIR,"{project}/logs/Tox_{sample}_compressed.zip")
shell: """
zip -rm {output} {wildcards.datadir}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/SignalP/splitted/{wildcards.sample}
find {wildcards.datadir}/{wildcards.project} -type d -empty -delete
zip -rm {output} {config[pathofact][datadir]}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/SignalP/splitted/{wildcards.sample}
find {config[pathofact][datadir]}/{wildcards.project} -type d -empty -delete
"""
rule clean_VF_workflow:
input: "{datadir}/{project}/Virulence_prediction_{sample}_report.tsv"
output: "{datadir}/{project}/logs/VF_{sample}_compressed.zip"
input: os.path.join(DATA_DIR,"{project}/PathoFact_report/Virulence_prediction_{sample}_report.tsv")
output: os.path.join(DATA_DIR,"{project}/logs/VF_{sample}_compressed.zip")
shell: """
zip -rm {output} {wildcards.datadir}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/SignalP/splitted/{wildcards.sample}
find {wildcards.datadir}/{wildcards.project} -type d -empty -delete
zip -rm {output} {config[pathofact][datadir]}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/SignalP/splitted/{wildcards.sample}
find {config[pathofact][datadir]}/{wildcards.project} -type d -empty -delete
"""
rule clean_AMR_workflow:
input: "{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv"
output: "{datadir}/{project}/logs/AMR_{sample}_compressed.zip"
input: os.path.join(DATA_DIR,"{project}/PathoFact_report/AMR_MGE_prediction_{sample}_report.tsv")
output: os.path.join(DATA_DIR,"{project}/logs/AMR_{sample}_compressed.zip")
shell: """
zip -rm {output} {wildcards.datadir}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample}
rm -rf {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample}
find {wildcards.datadir}/{wildcards.project} -type d -empty -delete
zip -rm {output} {config[pathofact][datadir]}/{wildcards.project}/logs/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/splitted/{wildcards.sample}
rm -rf {config[pathofact][datadir]}/{wildcards.project}/contig_splitted/{wildcards.sample}_dir
rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/plasmid_splitted/{wildcards.sample}
find {config[pathofact][datadir]}/{wildcards.project} -type d -empty -delete
"""