GitLab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit 660212ac authored by Laura Denies's avatar Laura Denies
Browse files

streamline pipeline

parent 8d6b6ddd
#Snakefile #Snakefile
configfile: "config.yaml" configfile: "config.yaml"
DATA_DIR=config["pathofact"]["datadir"]
if config["pathofact"]["workflow"] == "complete": if config["pathofact"]["workflow"] == "complete":
include: include:
...@@ -9,12 +10,12 @@ if config["pathofact"]["workflow"] == "complete": ...@@ -9,12 +10,12 @@ if config["pathofact"]["workflow"] == "complete":
input: input:
expand( expand(
[ [
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_gene_library_{sample}_report.tsv"),
"{datadir}/{project}/PathoFact_{sample}_predictions.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/PathoFact_{sample}_predictions.tsv"),
"{datadir}/{project}/logs/{sample}_compressed.zip" os.path.join(DATA_DIR,"{project}/logs/{sample}_compressed.zip")
], ],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"] project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
) )
elif config["pathofact"]["workflow"] == "Tox": elif config["pathofact"]["workflow"] == "Tox":
include: include:
...@@ -23,11 +24,11 @@ elif config["pathofact"]["workflow"] == "Tox": ...@@ -23,11 +24,11 @@ elif config["pathofact"]["workflow"] == "Tox":
input: input:
expand( expand(
[ [
"{datadir}/{project}/Toxin_prediction_{sample}_report.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_prediction_{sample}_report.tsv"),
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/Toxin_gene_library_{sample}_report.tsv"),
"{datadir}/{project}/logs/Tox_{sample}_compressed.zip" os.path.join(DATA_DIR,"{project}/logs/Tox_{sample}_compressed.zip")
], ],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"] project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
) )
elif config["pathofact"]["workflow"] == "Vir": elif config["pathofact"]["workflow"] == "Vir":
include: include:
...@@ -36,10 +37,10 @@ elif config["pathofact"]["workflow"] == "Vir": ...@@ -36,10 +37,10 @@ elif config["pathofact"]["workflow"] == "Vir":
input: input:
expand( expand(
[ [
"{datadir}/{project}/Virulence_prediction_{sample}_report.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/Virulence_prediction_{sample}_report.tsv"),
"{datadir}/{project}/logs/VF_{sample}_compressed.zip" os.path.join(DATA_DIR,"{project}/logs/VF_{sample}_compressed.zip")
], ],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"] project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
) )
elif config["pathofact"]["workflow"] == "AMR": elif config["pathofact"]["workflow"] == "AMR":
include: include:
...@@ -48,11 +49,11 @@ elif config["pathofact"]["workflow"] == "AMR": ...@@ -48,11 +49,11 @@ elif config["pathofact"]["workflow"] == "AMR":
input: input:
expand( expand(
[ [
"{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_report/AMR_MGE_prediction_{sample}_report.tsv"),
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
"{datadir}/{project}/logs/AMR_{sample}_compressed.zip" os.path.join(DATA_DIR,"{project}/logs/AMR_{sample}_compressed.zip")
], ],
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"] project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
) )
else: else:
raise Exception("Unknown workflow option: %s" % config["pathofact"]["workflow"]) raise Exception("Unknown workflow option: %s" % config["pathofact"]["workflow"])
pathofact: pathofact:
sample: ["test_2"] # requires user input sample: ["test_sample"] # requires user input
project: Lim_PathoFact # requires user input project: Final_test # requires user input
datadir: ../test_dataset # requires user input datadir: /work/projects/ecosystem_biology/local_tools/PathoFact_update/test_dataset # requires user input
workflow: "complete" workflow: "complete"
size_fasta: 100 size_fasta: 10000
scripts: "scripts" scripts: "scripts"
signalp: "/work/projects/ecosystem_biology/local_tools/SignalP/signalp-5.0b/bin" # requires user input signalp: "/work/projects/ecosystem_biology/local_tools/SignalP/signalp-5.0b/bin" # requires user input
deepvirfinder: "submodules/DeepVirFinder/dvf.py" deepvirfinder: "submodules/DeepVirFinder/dvf.py"
......
...@@ -10,13 +10,12 @@ import os ...@@ -10,13 +10,12 @@ import os
## deepARG: ## deepARG:
rule run_deepARG: rule run_deepARG:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta" os.path.join(DATA_DIR,"{project}/splitted/{sample}/{file_i}.fasta")
output: output:
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG"))
log: log:
"{datadir}/{project}/logs/{sample}/{file_i}.out.mapping.ARG.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.out.mapping.ARG.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"], runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda: conda:
...@@ -24,14 +23,13 @@ rule run_deepARG: ...@@ -24,14 +23,13 @@ rule run_deepARG:
message: "executing deep-arg on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "executing deep-arg on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
deeparg predict --model LS --model-version v2 --type prot -d {config[pathofact][scripts]}/deeparg_data/deepARG --input {input} --out {wildcards.datadir}/{wildcards.project}/AMR/deepARG_results/{wildcards.sample}/{wildcards.file_i}.out &> {log} deeparg predict --model LS --model-version v2 --type prot -d {config[pathofact][scripts]}/deeparg_data/deepARG --input {input} --out {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/deepARG_results/{wildcards.sample}/{wildcards.file_i}.out &> {log}
""" """
def aggregate_AMR(wildcards): def aggregate_AMR(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand( return expand(
"{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG"),
datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
...@@ -41,41 +39,38 @@ rule aggregate_deepARG: ...@@ -41,41 +39,38 @@ rule aggregate_deepARG:
input: input:
aggregate_AMR aggregate_AMR
output: output:
"{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}.out.mapping.ARG")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "aggregating deep-arg results on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "aggregating deep-arg results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
cat {input} > {output} cat {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/AMR/deepARG_results/{wildcards.sample} rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/deepARG_results/{wildcards.sample}
""" """
# RGI # RGI
rule run_RGI: rule run_RGI:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta" os.path.join(DATA_DIR,"{project}/splitted/{sample}/{file_i}.fasta")
output: output:
temp("{datadir}/{project}/AMR/RGI_results/{sample}/{file_i}.RGI.txt") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}/{file_i}.RGI.txt"))
log: log:
"{datadir}/{project}/logs/{sample}/{file_i}.RGI.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.RGI.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"], runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda: conda:
"../../envs/rgi.yaml" "../../envs/rgi.yaml"
message: "executing RGI on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "executing RGI on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
"rgi main --input_sequence {input} --output_file {wildcards.datadir}/{wildcards.project}/AMR/RGI_results/{wildcards.sample}/{wildcards.file_i}.RGI --input_type protein --local --clean &> {log}" "rgi main --input_sequence {input} --output_file {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/RGI_results/{wildcards.sample}/{wildcards.file_i}.RGI --input_type protein --local --clean &> {log}"
def aggregate_RGI(wildcards): def aggregate_RGI(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand( return expand(
"{datadir}/{project}/AMR/RGI_results/{sample}/{file_i}.RGI.txt", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}/{file_i}.RGI.txt"),
datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
...@@ -85,31 +80,29 @@ rule aggregate_RGI: ...@@ -85,31 +80,29 @@ rule aggregate_RGI:
input: input:
aggregate_RGI aggregate_RGI
output: output:
"{datadir}/{project}/AMR/RGI_results/{sample}.RGI.txt" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}.RGI.txt")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate RGI results on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Aggregate RGI results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
cat {input} > {output} cat {input} > {output}
rm -rf {wildcards.datadir}/{wildcards.project}/AMR/RGI_results/{wildcards.sample} rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/AMR/RGI_results/{wildcards.sample}
""" """
# Combine DeepARG and RGI results # Combine DeepARG and RGI results
rule combine_AMR: rule combine_AMR:
input: input:
RGI="{datadir}/{project}/AMR/RGI_results/{sample}.RGI.txt", RGI=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/RGI_results/{sample}.RGI.txt"),
DeepARG="{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG", DeepARG=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/deepARG_results/{sample}.out.mapping.ARG"),
AMR_index= "scripts/PathoFact_AMR_index.tsv" AMR_index= "scripts/PathoFact_AMR_index.tsv"
output: output:
AMR_combined="{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv" AMR_combined=os.path.join(DATA_DIR,"{project}/AMR/{sample}_AMR_prediction.tsv")
log: log:
"{datadir}/{project}/logs/{sample}/combine_AMR_temp.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/combine_AMR_temp.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda: "../../envs/R.yaml" conda: "../../envs/R.yaml"
......
...@@ -9,21 +9,20 @@ import os ...@@ -9,21 +9,20 @@ import os
rule combine_AMR_plasmid: rule combine_AMR_plasmid:
input: input:
ORF_translation="{datadir}/{project}/renamed/{sample}_translation.tsv", ORF_translation=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_translation.tsv"),
Contig_ORFs="{datadir}/{project}/Prodigal/{sample}.contig", Contig_ORFs=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/Prodigal/{sample}.contig"),
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv", Contig_translation=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_translation.tsv"),
AMR="{datadir}/{project}/AMR/{sample}_AMR_prediction.tsv", AMR=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_prediction.tsv"),
PlasFlow="{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv", PlasFlow=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv"),
MOB_suite="{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv", MOB_suite=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}_MOB_suite_aggregated.tsv"),
DeepVirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv", DeepVirFinder=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VirFinder_aggregated.csv"),
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv" VirSorter=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VIRSorter_aggregated.csv")
output: output:
Report_1="{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv", Report_1=os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"),
Report_2="{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv" Report_2=os.path.join(DATA_DIR,"{project}/PathoFact_report/AMR_MGE_prediction_{sample}_report.tsv")
log: log:
"{datadir}/{project}/logs/{sample}/MGE_AMR_prediction_detail_temp.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/MGE_AMR_prediction_detail_temp.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"], runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda: conda:
......
...@@ -11,13 +11,12 @@ import os ...@@ -11,13 +11,12 @@ import os
rule run_VirSorter: rule run_VirSorter:
input: input:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_ID.fna")
output: output:
temp("{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv"))
log: log:
"{datadir}/{project}/logs/{sample}/VIRSorter_global-phage-signal.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/VIRSorter_global-phage-signal.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"], runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"] mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda: conda:
...@@ -27,17 +26,15 @@ rule run_VirSorter: ...@@ -27,17 +26,15 @@ rule run_VirSorter:
message: "Executing VirSorter with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Executing VirSorter with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
wrapper_phage_contigs_sorter_iPlant.pl -f {input} --ncpu {threads} --wdir {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virsorter --data-dir {config[pathofact][scripts]}/virsorter-data &> {log} wrapper_phage_contigs_sorter_iPlant.pl -f {input} --ncpu {threads} --wdir {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/phage/{wildcards.sample}/virsorter --data-dir {config[pathofact][scripts]}/virsorter-data &> {log}
""" """
localrules: aggregate_VirSorter localrules: aggregate_VirSorter
rule aggregate_VirSorter: rule aggregate_VirSorter:
input: input:
"{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv")
output: output:
"{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VIRSorter_aggregated.csv")
params:
outdir="{datadir}"
message: "VirSorter failsave for empty files: {wildcards.project} - {wildcards.sample}" message: "VirSorter failsave for empty files: {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
...@@ -47,34 +44,31 @@ rule aggregate_VirSorter: ...@@ -47,34 +44,31 @@ rule aggregate_VirSorter:
else else
mv {input} {output} mv {input} {output}
fi fi
rm -rf {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virsorter rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/phage/{wildcards.sample}/virsorter
""" """
# VIRFINDER Prediction # VIRFINDER Prediction
rule run_VirFinder: rule run_VirFinder:
input: input:
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/contig_splitted/{sample}/{file_i}.fasta")
output: output:
temp("{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt"))
log: log:
"{datadir}/{project}/logs/{sample}/{file_i}.fasta_gt1bp_dvfpred.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}.fasta_gt1bp_dvfpred.log")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"], runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"] mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda: conda:
"../../envs/DeepVirFinder.yaml" "../../envs/DeepVirFinder.yaml"
threads: threads: config["pathofact"]["mem"]["big_mem_cores"]
config["pathofact"]["mem"]["big_mem_cores"]
message: "Executing Deep-VirFinder with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Executing Deep-VirFinder with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
"python {config[pathofact][deepvirfinder]} -i {input} -o {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virfinder -c {threads} &> {log}" "python {config[pathofact][deepvirfinder]} -i {input} -o {config[pathofact][datadir]}/{wildcards.project}/MGE/PathoFact_intermediate/phage/{wildcards.sample}/virfinder -c {threads} &> {log}"
def aggregate_VirFinder(wildcards): def aggregate_VirFinder(wildcards):
checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split
return expand( return expand(
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}/virfinder/{file_i}.fasta_gt1bp_dvfpred.txt"),
datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
...@@ -84,9 +78,8 @@ rule aggregate_VirFinder: ...@@ -84,9 +78,8 @@ rule aggregate_VirFinder:
input: input:
aggregate_VirFinder aggregate_VirFinder
output: output:
"{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/phage/{sample}_VirFinder_aggregated.csv")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate VirFinder predictions on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Aggregate VirFinder predictions on the following sample(s): {wildcards.project} - {wildcards.sample}"
......
...@@ -10,15 +10,14 @@ import os ...@@ -10,15 +10,14 @@ import os
# PlasFlow Preprocessing # PlasFlow Preprocessing
rule filter_seq: rule filter_seq:
input: input:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/renamed/{sample}_Contig_ID.fna")
output: output:
temp("{datadir}/{project}/plasmid/{sample}_filtered.fna") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/plasmid/{sample}_filtered.fna"))
log: log:
"{datadir}/{project}/logs/{sample}/plasmid_filtered.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/plasmid_filtered.log")
conda: conda:
"../../envs/Biopython.yaml" "../../envs/Biopython.yaml"
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"], runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"], mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
minlen=config["pathofact"]["plasflow_minlen"] minlen=config["pathofact"]["plasflow_minlen"]
...@@ -28,11 +27,10 @@ rule filter_seq: ...@@ -28,11 +27,10 @@ rule filter_seq:
checkpoint splitplasmid: checkpoint splitplasmid:
input: input:
"{datadir}/{project}/plasmid/{sample}_filtered.fna" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/plasmid/{sample}_filtered.fna")
output: output:
split=temp(directory("{datadir}/{project}/MGE/plasmid_splitted/{sample}/")) split=temp(directory(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid_splitted/{sample}/")))
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"], mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"] split=config["pathofact"]["size_fasta"]
...@@ -40,21 +38,20 @@ checkpoint splitplasmid: ...@@ -40,21 +38,20 @@ checkpoint splitplasmid:
"../../envs/Biopython.yaml" "../../envs/Biopython.yaml"
shell: shell:
""" """
python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample} python {config[pathofact][scripts]}/split.py {input} {params.split} {output}
""" """
# PlasFlow Plasmid prediction # PlasFlow Plasmid prediction
rule run_PLASMID: rule run_PLASMID:
input: input:
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fasta" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid_splitted/{sample}/{file_i}.fasta")
output: output:
temp("{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv"))
log: log:
"{datadir}/{project}/logs/{sample}/{file_i}_plasflow_prediction.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}_plasflow_prediction.log")
conda: conda:
"../../envs/PlasFlow.yaml" "../../envs/PlasFlow.yaml"
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"], runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"], mem=config["pathofact"]["mem"]["big_mem_per_core_gb"],
threshold=config["pathofact"]["plasflow_threshold"] threshold=config["pathofact"]["plasflow_threshold"]
...@@ -67,8 +64,7 @@ rule run_PLASMID: ...@@ -67,8 +64,7 @@ rule run_PLASMID:
def aggregate_plasmid_input(wildcards): def aggregate_plasmid_input(wildcards):
checkpoint_output= checkpoints.splitplasmid.get(**wildcards).output.split checkpoint_output= checkpoints.splitplasmid.get(**wildcards).output.split
return expand( return expand(
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}/{file_i}_plasflow_prediction.tsv"),
datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
...@@ -78,25 +74,23 @@ rule Plasmid_aggregate: ...@@ -78,25 +74,23 @@ rule Plasmid_aggregate:
input: input:
aggregate_plasmid_input aggregate_plasmid_input
output: output:
temp("{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv"))
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
message: "Aggregate PlasFlow results on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Aggregate PlasFlow results on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: shell:
""" """
cat {input} > {output} cat {input} > {output}
{wildcards.datadir}/{wildcards.project}/MGE/plasmid/PlasFlow/{wildcards.sample} rm -rf {config[pathofact][datadir]}/{wildcards.project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{wildcards.sample}
""" """
rule select: rule select:
input: input:
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_aggregated.tsv")
output: output:
"{datadir}/{project}/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv" os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/PlasFlow/{sample}_plasflow_prediction_final.tsv")
params: params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"] mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell: shell:
...@@ -105,26 +99,22 @@ rule select: ...@@ -105,26 +99,22 @@ rule select:
""" """
rule run_MOBsuite: rule run_MOBsuite:
input: "{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta" input: os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/contig_splitted/{sample}/{file_i}.fasta")
output: output:
temp("{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt") temp(os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt"))
log: log:
"{datadir}/{project}/logs/{sample}/{file_i}_MOB_suite_prediction.log" os.path.join(DATA_DIR,"{project}/logs/{sample}/{file_i}_MOB_suite_prediction.log")
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"]
threads: threads:
config["pathofact"]["mem"]["big_mem_cores"] config["pathofact"]["mem"]["big_mem_cores"]
conda: conda:
"../../envs/MOB_suite.yaml" "../../envs/MOB_suite.yaml"
message: "Executing MOB_suite with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}" message: "Executing MOB_suite with {threads} threads on the following sample(s): {wildcards.project} - {wildcards.sample}"
shell: "mob_typer --multi --infile {input} --out_file {wildcards.datadir}/{wildcards.project}/MGE/plasmid/MOB_suite/{wildcards.sample}/{wildcards.file_i}_MOB_suite_prediction.txt -n {threads} &> {log}" shell: "mob_typer --multi --infile {input} --out_file {output} -n {threads} &> {log}"
def aggregate_MOBsuite(wildcards): def aggregate_MOBsuite(wildcards):
checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split checkpoint_output= checkpoints.splitcontig.get(**wildcards).output.split
return expand( return expand(
"{datadir}/{project}/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt", os.path.join(DATA_DIR,"{project}/PathoFact_intermediate/MGE/plasmid/MOB_suite/{sample}/{file_i}_MOB_suite_prediction.txt"),
datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
...@@ -133,9 +123,8 @@ def aggregate_MOBsuite(wildcards): ...@@ -133,9 +123,8 @@ def aggregate_MOBsuite(wildcards):
rule aggregate_MOBsuite: rule aggregate_MOBsuite:
input: aggregate_MOBsuite input: aggregate_MOBsuite