Gitlab is now using https://gitlab.lcsb.uni.lu as its primary address. Please update your bookmarks. FAQ.

Commit d80a6ad1 authored by Laura Denies's avatar Laura Denies
Browse files

Merge branch 'IMPirisv2' into 'master'

Merge master with IMPirisv2

See merge request !2
parents fef0c395 6589efab
......@@ -82,7 +82,7 @@ Each sample should have three input files:
- contig and gene IDs should be the same as in the FASTA files
The files should be located in the same directory.
For each sample, the corresponding input files should have the same basename, e.g. `SAMPLE_A.fna`, `SAMPLE_A.fna` and `SAMPLE_A.contig` for sample `SAMPLE_A`.
For each sample, the corresponding input files should have the same basename, e.g. `SAMPLE_A.fna`, `SAMPLE_A.faa` and `SAMPLE_A.contig` for sample `SAMPLE_A`.
**NOTE**: For preprocessing and assembly of metagenomic reads we would suggest using IMP (https://imp.pages.uni.lu/web/)
......@@ -92,9 +92,9 @@ For each sample, the corresponding input files should have the same basename, e.
To run PathoFact you need to adjust some parameters in `config.yaml`.
- `input_file`: This is a list of sample names, e.g. `input_file: ["SAMPLE_A","SAMPLE_B"]`
- `project`: A unique project name which will be used as the name of the output directory in `OUTDIR` path (see below).
- `OUTDIR`: Path to directory containing the sample data; the output directory will be created there.
- `sample`: This is a list of sample names, e.g. `sample: ["SAMPLE_A","SAMPLE_B"]`
- `project`: A unique project name which will be used as the name of the output directory in `datapath` path (see below).
- `datapath`: Path to directory containing the sample data; the output directory will be created there.
- `workflow`: Pathofact can run the complete pipeline (default) or a specific step:
- "complete": complete pipeline = toxin + virulence + AMR + MGE prediction
- "Tox": toxin prediction
......@@ -120,3 +120,12 @@ snakemake -s Snakefile --use-conda --reason --cores <cores> -p
**NOTE**: It is advised to run the pipeline using multiple CPUs or CPUs with "higher" memory.
For more options, see the [snakemake documentation](https://snakemake.readthedocs.io/en/stable/index.html).
### Execution on a cluster
The pipeline can be run on a cluster using `slurm`.
The command can be found in the script `cluster.sh` which can also be used to submit the jobs to the cluster.
```bash
sbatch cluster.sh
```
......@@ -2,51 +2,51 @@
configfile: "config.yaml"
if config["workflow"] == "complete":
if config["pathofact"]["workflow"] == "complete":
include:
"workflows/Combine_PathoFact_workflow.smk"
rule all:
input:
expand(
[
"{OUTDIR}/{project}/AMR/{input_file}_AMR_MGE_prediction_detailed.tsv",
"{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv",
"{OUTDIR}/{project}/PathoFact_{input_file}_predictions.csv"
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv",
"{datadir}/{project}/PathoFact_{sample}_predictions.csv"
],
OUTDIR=config["OUTDIR"], project=config["project"], input_file=config["input_file"]
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["workflow"] == "Tox":
elif config["pathofact"]["workflow"] == "Tox":
include:
"workflows/Toxin_workflow.smk"
rule all:
input:
expand(
[
"{OUTDIR}/{project}/Toxin_prediction_{input_file}_report.csv",
"{OUTDIR}/{project}/Toxin_gene_library_{input_file}_report.tsv"
"{datadir}/{project}/Toxin_prediction_{sample}_report.csv",
"{datadir}/{project}/Toxin_gene_library_{sample}_report.tsv"
],
OUTDIR=config["OUTDIR"], project=config["project"], input_file=config["input_file"]
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["workflow"] == "Vir":
elif config["pathofact"]["workflow"] == "Vir":
include:
"workflows/Virulence_workflow.smk"
rule all:
input:
expand(
"{OUTDIR}/{project}/Virulence_prediction_{input_file}_report.csv",
OUTDIR=config["OUTDIR"], project=config["project"], input_file=config["input_file"]
"{datadir}/{project}/Virulence_prediction_{sample}_report.csv",
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
elif config["workflow"] == "AMR":
elif config["pathofact"]["workflow"] == "AMR":
include:
"workflows/AMR_workflow.smk"
rule all:
input:
expand(
[
"{OUTDIR}/{project}/AMR_MGE_prediction_{input_file}_report.tsv",
"{OUTDIR}/{project}/AMR/{input_file}_AMR_MGE_prediction_detailed.tsv"
"{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv",
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"
],
OUTDIR=config["OUTDIR"], project=config["project"], input_file=config["input_file"]
datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"]
)
else:
raise Exception("Unknown workflow option: %s" % config["workflow"])
raise Exception("Unknown workflow option: %s" % config["pathofact"]["workflow"])
#!/bin/bash -l
# Slurm submission script for the PathoFact Snakemake pipeline.
# Submit with: sbatch cluster.sh
# The directives below size the *master* snakemake job only; per-rule
# jobs are submitted separately via --cluster using cluster.yaml.
#SBATCH -J PathoFact
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=0-6:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
# activate env
conda activate PathoFact
# run pipeline
# Each rule's sbatch line is assembled from cluster.yaml ({cluster.*})
# plus per-rule params ({params.runtime}, {params.mem}, {threads}).
# NOTE(review): "{cluster.job-name}" relies on snakemake resolving a
# hyphenated cluster-config key — confirm with the installed snakemake
# version before relying on it.
snakemake -s Snakefile --configfile config.yaml \
--use-conda --cores 10 --reason -p \
--cluster-config cluster.yaml --cluster \
"{cluster.call} {cluster.partition} {cluster.quality} {cluster.nodes} \
{cluster.runtime}{params.runtime} {cluster.mem_per_cpu}{params.mem} \
{cluster.threads}{threads} {cluster.job-name} {cluster.output}"
# Per-rule Slurm resource settings, consumed by cluster.sh via
# --cluster-config; each key is substituted into the sbatch command
# line as {cluster.<key>}.
# (Nesting restored: the keys below belong under their section headers.)
__default__:
  call: "sbatch"
  nodes: ""
  mem_per_cpu: "--mem-per-cpu " # in rule params
  partition: "-p batch"
  quality: "-q qos-batch"
  runtime: "-t" # in rule params
  threads: "-n"
  job-name: "-J PathoFact.{rule}"
  output: "-o slurm-%j.%N-%x.out"
# Memory-hungry rules are sent to the bigmem partition instead of batch.
run_PLASMID:
  nodes: "-N 1"
  partition: "-p bigmem"
  quality: "-q qos-bigmem"
run_VirSorter:
  nodes: "-N 1"
  partition: "-p bigmem"
  quality: "-q qos-bigmem"
run_VirFinder:
  nodes: "-N 1"
  partition: "-p bigmem"
  quality: "-q qos-bigmem"
# PathoFact configuration (flat layout).
# Insert name of input files to run analysis
# All three input files used by the pipeline for one sample need to be given the same name, followed by the suffix .faa (amino acid, gene fasta file), .fna (nucleotide contig fasta file), .contig (table with contig and gene names).
# Only give name of files and not the suffix (.faa, .fna, .contig)
input_file: ["SAMPLE_A","SAMPLE_B"]
# Define unique name for your project
project: Project_A_PathoFact
# Define dir to files (will also be the output directory)
OUTDIR: /path/to/samples
# Define size of split fasta files (default 10 000 sequences/file)
# NOTE(review): the value below (100000) does not match the stated
# default of 10 000 sequences/file — confirm which is intended.
size_fasta: 100000
# Workflow (default: "complete")
# complete: complete pipeline: Tox + Vir + AMR prediction
# Tox: toxin prediction
# Vir: virulence prediction
# AMR: antimicrobial resistance (AMR) & mobile genetic element (MGE) prediction
workflow: "complete"
###########
# SignalP #
###########
# Define path to signalP
signalp: "/path/to/signalp-4.1/signalp"
############
# Toxin #
############
# Define path to HMM
hmmscan_tool: "hmmsearch"
hmm_file: "databases/toxins/combined_Toxin.hmm"
#################
# Virulence #
#################
# HMM profiles used for virulence-factor prediction
vir_hmm_file: "databases/virulence/Virulence_factor.hmm"
#############
# AMR #
#############
# Define path to deepARG
deep_ARG: "submodules/deeparg-ss/deepARG.py"
# Define path to Plasflow
Plasflow: "PlasFlow.py"
# Define path to phage prediction tools
# Virsorter
virsorter: "wrapper_phage_contigs_sorter_iPlant.pl"
virsorter_data: "scripts/virsorter-data"
# VirFinder
DeepVirFinder: "submodules/DeepVirFinder/dvf.py"
# PathoFact pipeline configuration. All settings live under the single
# top-level "pathofact" key — the Snakefile and workflow rules read
# config["pathofact"][...] (e.g. config["pathofact"]["runtime"]["medium"]).
# (Nesting restored: every key below belongs under "pathofact", and the
# runtime/mem entries under their respective sub-keys.)
pathofact:
  sample: ["SAMPLE_A","SAMPLE_B"] # requires user input
  project: Project_A_PathoFact # requires user input
  datadir: /path/to/samples # requires user input
  workflow: "complete"
  size_fasta: 10000
  scripts: "scripts"
  signalp: "/path/to/signalp-4.1/signalp" # requires user input
  deeparg: "submodules/deeparg-ss/deepARG.py"
  deepvirfinder: "submodules/DeepVirFinder/dvf.py"
  tox_hmm: "databases/toxins/combined_Toxin.hmm"
  tox_lib: "databases/library_HMM_Toxins.csv"
  vir_hmm: "databases/virulence/Virulence_factor.hmm"
  vir_domains: "databases/models_and_domains"
  plasflow_threshold: 0.7
  plasflow_minlen: 1000
  # Walltimes substituted into sbatch via rule params (see cluster.sh).
  runtime:
    short: "00:10:00"
    medium: "01:00:00"
    long: "02:00:00"
  # Memory settings substituted into sbatch via rule params.
  mem:
    normal_mem_per_core_gb: "4G"
    big_mem_cores: 4
    big_mem_per_core_gb: "30G"
......@@ -3,34 +3,33 @@
import glob
import os
PROJECT = config["project"]
INPUT = config["input_file"]
##########################
# AMR Prediction #
##########################
rule run_deepARG:
input:
"{OUTDIR}/{project}/splitted/{input_file}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
output:
temp("{OUTDIR}/{project}/AMR/deepARG_results/{input_file}/{file_i}.out.mapping.ARG")
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG")
log:
"{OUTDIR}/{project}/AMR/deepARG_results/{input_file}/{file_i}.out.mapping.ARG.log"
"{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
"../../envs/DeepARG.yaml"
shell:
"python {config[deep_ARG]} --align --type prot --genes --input {input} --output {wildcards.OUTDIR}/{wildcards.project}/AMR/deepARG_results/{wildcards.input_file}/{wildcards.file_i}.out &> {log}"
"python {config[pathofact][deeparg]} --align --type prot --genes --input {input} --output {wildcards.datadir}/{wildcards.project}/AMR/deepARG_results/{wildcards.sample}/{wildcards.file_i}.out &> {log}"
def aggregate_AMR(wildcards):
checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
return expand(
"{OUTDIR}/{project}/AMR/deepARG_results/{input_file}/{file_i}.out.mapping.ARG",
OUTDIR=wildcards.OUTDIR,
"{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG",
datadir=wildcards.datadir,
project=wildcards.project,
input_file=wildcards.input_file,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
)
......@@ -38,8 +37,10 @@ rule aggregate_deepARG:
input:
aggregate_AMR
output:
"{OUTDIR}/{project}/AMR/deepARG_results/{input_file}.out.mapping.ARG"
"{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} > {output}"
......@@ -3,29 +3,28 @@
import glob
import os
PROJECT = config["project"]
INPUT = config["input_file"]
##########################
# AMR Prediction #
##########################
rule combine_AMR_plasmid:
input:
AMR="{OUTDIR}/{project}/AMR/deepARG_results/{input_file}.out.mapping.ARG",
AMR_translation="{OUTDIR}/{project}/renamed/{input_file}_translation.tsv",
Plasmid="{OUTDIR}/{project}/MGE/plasmid/{input_file}_plasflow_prediction_final.tsv",
Contig_translation="{OUTDIR}/{project}/renamed/{input_file}_Contig_translation.tsv",
Contig_gene_list="{OUTDIR}/{input_file}.contig",
VirFinder="{OUTDIR}/{project}/MGE/phage/{input_file}_VirFinder_aggregated.csv",
VirSorter="{OUTDIR}/{project}/MGE/phage/{input_file}_VIRSorter_aggregated.csv"
AMR="{datadir}/{project}/AMR/deepARG_results/{sample}.out.mapping.ARG",
AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Plasmid="{datadir}/{project}/MGE/plasmid/{sample}_plasflow_prediction_final.tsv",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
Contig_gene_list="{datadir}/{sample}.contig",
VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
output:
Report_1=temp("{OUTDIR}/{project}/AMR/{input_file}_MGE_AMR_prediction_detail_temp.csv"),
Report_2=temp("{OUTDIR}/{project}/AMR/{input_file}_MGE_AMR_prediction_report_temp.csv")
Report_1=temp("{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.csv"),
Report_2=temp("{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_report_temp.csv")
log:
"{OUTDIR}/{project}/AMR/{input_file}_MGE_AMR_prediction_detail_temp.log"
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
conda:
"../../envs/R.yaml"
script:
......@@ -33,11 +32,13 @@ rule combine_AMR_plasmid:
rule modify_details:
input:
"{OUTDIR}/{project}/AMR/{input_file}_MGE_AMR_prediction_detail_temp.csv"
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_detail_temp.csv"
output:
"{OUTDIR}/{project}/AMR/{input_file}_AMR_MGE_prediction_detailed.tsv"
"{datadir}/{project}/AMR/{sample}_AMR_MGE_prediction_detailed.tsv"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' > {output}
......@@ -45,11 +46,13 @@ rule modify_details:
rule modify_report:
input:
"{OUTDIR}/{project}/AMR/{input_file}_MGE_AMR_prediction_report_temp.csv"
"{datadir}/{project}/AMR/{sample}_MGE_AMR_prediction_report_temp.csv"
output:
"{OUTDIR}/{project}/AMR_MGE_prediction_{input_file}_report.tsv"
"{datadir}/{project}/AMR_MGE_prediction_{sample}_report.tsv"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"""
sed 's/"//g' {input} | sed 's/,/\\t/g' > {output}
......
......@@ -3,9 +3,6 @@
import glob
import os
PROJECT = config["project"]
INPUT = config["input_file"]
##########################
# Phage Prediction #
##########################
......@@ -14,81 +11,91 @@ INPUT = config["input_file"]
checkpoint splitphage:
input:
"{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna"
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output:
split=directory("{OUTDIR}/{project}/contig_splitted/{input_file}/")
split=directory("{datadir}/{project}/contig_splitted/{sample}/")
log:
"{OUTDIR}/{project}/contig_splitted/{input_file}.log"
"{datadir}/{project}/contig_splitted/{sample}.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
shell:
"""
seqkit split2 -s 10000 {input} -O {wildcards.OUTDIR}/{wildcards.project}/contig_splitted/{wildcards.input_file} &> {log}
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log}
"""
rule run_VirSorter:
input:
"{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna"
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output:
"{OUTDIR}/{project}/MGE/phage/{input_file}/virsorter/VIRSorter_global-phage-signal.csv"
"{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv"
log:
"{OUTDIR}/{project}/MGE/phage/{input_file}/virsorter/VIRSorter_global-phage-signal.log"
"{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda:
"../../envs/VirSorter.yaml"
threads:
12
config["pathofact"]["mem"]["big_mem_cores"]
shell:
"""
{config[virsorter]} -f {input} --ncpu {threads} --wdir {wildcards.OUTDIR}/{wildcards.project}/MGE/phage/{wildcards.input_file}/virsorter --data-dir {config[virsorter_data]} &> {log}
wrapper_phage_contigs_sorter_iPlant.pl -f {input} --ncpu {threads} --wdir {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virsorter --data-dir {config[pathofact][scripts]}/virsorter-data &> {log}
"""
rule aggregate_Virsorter:
localrules: aggregate_VirSorter
rule aggregate_VirSorter:
input:
"{OUTDIR}/{project}/MGE/phage/{input_file}/virsorter/VIRSorter_global-phage-signal.csv"
"{datadir}/{project}/MGE/phage/{sample}/virsorter/VIRSorter_global-phage-signal.csv"
output:
"{OUTDIR}/{project}/MGE/phage/{input_file}_VIRSorter_aggregated.csv"
"{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
params:
outdir="{OUTDIR}"
outdir="{datadir}"
shell:
"mv {input} {output}"
# VIRFINDER Prediction
rule run_Virfinder:
rule run_VirFinder:
input:
"{OUTDIR}/{project}/contig_splitted/{input_file}/{file_i}.fna"
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fna"
output:
"{OUTDIR}/{project}/MGE/phage/{input_file}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt"
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt"
log:
"{OUTDIR}/{project}/MGE/phage/{input_file}/virfinder/{file_i}.fna_gt1bp_dvfpred.log"
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"]
conda:
"../../envs/DeepVirFinder.yaml"
threads:
12
config["pathofact"]["mem"]["big_mem_cores"]
shell:
"python {config[DeepVirFinder]} -i {input} -o {wildcards.OUTDIR}/{wildcards.project}/MGE/phage/{wildcards.input_file}/virfinder -c {threads} &> {log}"
"python {config[pathofact][deepvirfinder]} -i {input} -o {wildcards.datadir}/{wildcards.project}/MGE/phage/{wildcards.sample}/virfinder -c {threads} &> {log}"
def aggregate_virfinder(wildcards):
def aggregate_VirFinder(wildcards):
checkpoint_output= checkpoints.splitphage.get(**wildcards).output.split
return expand(
"{OUTDIR}/{project}/MGE/phage/{input_file}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt",
OUTDIR=wildcards.OUTDIR,
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt",
datadir=wildcards.datadir,
project=wildcards.project,
input_file=wildcards.input_file,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
)
rule aggregate_Virfinder:
rule aggregate_VirFinder:
input:
aggregate_virfinder
aggregate_VirFinder
output:
"{OUTDIR}/{project}/MGE/phage/{input_file}_VirFinder_aggregated.csv"
"{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} >{output}"
......@@ -3,9 +3,6 @@
import glob
import os
PROJECT = config["project"]
INPUT = config["input_file"]
##########################
# Plasmid Prediction #
##########################
......@@ -13,58 +10,67 @@ INPUT = config["input_file"]
# PlasFlow Preprocessing
rule filter_seq:
input:
"{OUTDIR}/{project}/renamed/{input_file}_Contig_ID.fna"
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output:
"{OUTDIR}/{project}/{input_file}_filtered.fna"
"{datadir}/{project}/{sample}_filtered.fna"
log:
"{OUTDIR}/{project}/{input_file}_filtered.log"
"{datadir}/{project}/{sample}_filtered.log"
conda:
"../../envs/Biopython.yaml"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
minlen=config["pathofact"]["plasflow_minlen"]
shell:
"./scripts/filter.pl 1000 {input} > {output} 2> {log}"
"{config[pathofact][scripts]}/filter.pl {params.minlen} {input} > {output} 2> {log}"
checkpoint splitplasmid:
input:
"{OUTDIR}/{project}/{input_file}_filtered.fna"
"{datadir}/{project}/{sample}_filtered.fna"
output:
split=directory("{OUTDIR}/{project}/MGE/plasmid_splitted/{input_file}/")
split=directory("{datadir}/{project}/MGE/plasmid_splitted/{sample}/")
log:
"{OUTDIR}/{project}/MGE/plasmid_splitted/{input_file}.log"
"{datadir}/{project}/MGE/plasmid_splitted/{sample}.log"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
shell:
"""
seqkit split2 -s 10000 {input} -O {wildcards.OUTDIR}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.input_file} &> {log}
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample} &> {log}
"""
# PlasFlow Plasmid prediction
rule run_PLASMID:
input:
"{OUTDIR}/{project}/MGE/plasmid_splitted/{input_file}/{file_i}.fna"
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fna"
output:
temp("{OUTDIR}/{project}/MGE/plasmid/{input_file}/{file_i}_plasflow_prediction.tsv")
temp("{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv")
log:
"{OUTDIR}/{project}/MGE/plasmid/{input_file}/{file_i}_plasflow_prediction.log"
"{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.log"
conda:
"../../envs/PlasFlow.yaml"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["long"],
mem=config["pathofact"]["mem"]["big_mem_per_core_gb"],
threshold=config["pathofact"]["plasflow_threshold"]
shell:
"""
{config[Plasflow]} --input {input} --output {output} --threshold 0.7 &> {log}
PlasFlow.py --input {input} --output {output} --threshold {params.threshold} &> {log}
"""
def aggregate_plasmid_input(wildcards):
checkpoint_output= checkpoints.splitplasmid.get(**wildcards).output.split
return expand(
"{OUTDIR}/{project}/MGE/plasmid/{input_file}/{file_i}_plasflow_prediction.tsv",
OUTDIR=wildcards.OUTDIR,
"{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv",
datadir=wildcards.datadir,
project=wildcards.project,
input_file=wildcards.input_file,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
)
......@@ -72,19 +78,23 @@ rule Plasmid_aggregate:
input:
aggregate_plasmid_input
output:
"{OUTDIR}/{project}/MGE/plasmid/{input_file}_plasflow_aggregated.tsv"
"{datadir}/{project}/MGE/plasmid/{sample}_plasflow_aggregated.tsv"
params:
outdir="{OUTDIR}"
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"]
shell:
"cat {input} > {output}"
rule select:
input:
"{OUTDIR}/{project}/MGE/plasmid/{input_file}_plasflow_aggregated.tsv"