Commit 5d05b9d0 authored by Laura Denies's avatar Laura Denies
Browse files

add Prodigal and new split script

parent d80a6ad1
# Conda environment pinning SeqKit 0.10.2 (FASTA/FASTQ manipulation toolkit)
# on a Python 3.6 base; every package carries an exact version=build pin
# for reproducibility.
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- ca-certificates=2019.5.15=0
- certifi=2019.6.16=py36_1
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- ncurses=6.1=he6710b0_1
- openssl=1.1.1c=h7b6447c_1
- pip=19.1.1=py36_0
- python=3.6.9=h265db76_0
- readline=7.0=h7b6447c_5
- seqkit=0.10.2=0
- setuptools=41.0.1=py36_0
- sqlite=3.29.0=h7b6447c_0
- tk=8.6.8=hbc83047_0
- wheel=0.33.4=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
pathofact: pathofact:
sample: ["SAMPLE_A","SAMPLE_B"] # requires user input sample: ["test_sample"] # requires user input
project: Project_A_PathoFact # requires user input project: PathoFact_update_trial # requires user input
datadir: /path/to/samples # requires user input datadir: ../test_dataset # requires user input
workflow: "complete" workflow: "Vir"
size_fasta: 10000 size_fasta: 1000
scripts: "scripts" scripts: "scripts"
signalp: "/path/to/signalp-4.1/signalp" # requires user input signalp: "/path/to/signalp-4.1/signalp" # requires user input
deeparg: "submodules/deeparg-ss/deepARG.py" deeparg: "submodules/deeparg-ss/deepARG.py"
......
# Conda environment providing Prodigal 2.6.3 (prokaryotic gene prediction),
# fully pinned for reproducibility.
name: Prodigal
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- libgcc-ng=9.2.0=h24d8f2e_2
- libgomp=9.2.0=h24d8f2e_2
- prodigal=2.6.3=h516909a_2
...@@ -9,7 +9,7 @@ import os ...@@ -9,7 +9,7 @@ import os
rule run_deepARG: rule run_deepARG:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG") temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG")
log: log:
...@@ -30,7 +30,7 @@ def aggregate_AMR(wildcards): ...@@ -30,7 +30,7 @@ def aggregate_AMR(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule aggregate_deepARG: rule aggregate_deepARG:
......
...@@ -13,7 +13,7 @@ rule combine_AMR_plasmid: ...@@ -13,7 +13,7 @@ rule combine_AMR_plasmid:
AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv", AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Plasmid="{datadir}/{project}/MGE/plasmid/{sample}_plasflow_prediction_final.tsv", Plasmid="{datadir}/{project}/MGE/plasmid/{sample}_plasflow_prediction_final.tsv",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv", Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
Contig_gene_list="{datadir}/{sample}.contig", Contig_gene_list="{datadir}/{project}/Prodigal/{sample}.contig",
VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv", VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv" VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
output: output:
......
...@@ -14,18 +14,18 @@ checkpoint splitphage: ...@@ -14,18 +14,18 @@ checkpoint splitphage:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna" "{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output: output:
split=directory("{datadir}/{project}/contig_splitted/{sample}/") split=directory("{datadir}/{project}/contig_splitted/{sample}/")
log: # log:
"{datadir}/{project}/contig_splitted/{sample}.log" # "{datadir}/{project}/contig_splitted/{sample}.log"
params: params:
outdir="{datadir}", outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"], runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"], mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"] split=config["pathofact"]["size_fasta"]
conda: conda:
"../../envs/SeqKit.yaml" "../../envs/Biopython.yaml"
shell: shell:
""" """
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log} python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample}
""" """
rule run_VirSorter: rule run_VirSorter:
...@@ -62,7 +62,7 @@ rule aggregate_VirSorter: ...@@ -62,7 +62,7 @@ rule aggregate_VirSorter:
# VIRFINDER Prediction # VIRFINDER Prediction
rule run_VirFinder: rule run_VirFinder:
input: input:
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fna" "{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt" "{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt"
log: log:
...@@ -85,7 +85,7 @@ def aggregate_VirFinder(wildcards): ...@@ -85,7 +85,7 @@ def aggregate_VirFinder(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule aggregate_VirFinder: rule aggregate_VirFinder:
......
...@@ -38,16 +38,16 @@ checkpoint splitplasmid: ...@@ -38,16 +38,16 @@ checkpoint splitplasmid:
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"], mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"] split=config["pathofact"]["size_fasta"]
conda: conda:
"../../envs/SeqKit.yaml" "../../envs/Biopython.yaml"
shell: shell:
""" """
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample} &> {log} python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/plasmid_splitted/{wildcards.sample}
""" """
# PlasFlow Plasmid prediction # PlasFlow Plasmid prediction
rule run_PLASMID: rule run_PLASMID:
input: input:
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fna" "{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fasta"
output: output:
temp("{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv") temp("{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv")
log: log:
...@@ -71,7 +71,7 @@ def aggregate_plasmid_input(wildcards): ...@@ -71,7 +71,7 @@ def aggregate_plasmid_input(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule Plasmid_aggregate: rule Plasmid_aggregate:
......
...@@ -7,7 +7,7 @@ import os ...@@ -7,7 +7,7 @@ import os
rule run_HMM_tox: rule run_HMM_tox:
input: input:
hmm=config["pathofact"]["tox_hmm"], hmm=config["pathofact"]["tox_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa" renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan" "{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
log: log:
...@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards): ...@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule HMM_correct_format_2: rule HMM_correct_format_2:
......
#Prepare fasta
import glob
import os
configfile: "config.yaml"
# Default target: request the final gene-to-contig mapping table
# (<sample>.contig) for every configured sample, expanding datadir,
# project and sample from the "pathofact" section of config.yaml.
rule all:
input: expand("{datadir}/{project}/Prodigal/{sample}.contig", datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"])
# Generate ORFs and GFF
# Generate unique ID number for each sequence
# Run Prodigal in metagenome mode (-p meta) on the per-sample assembly
# {sample}.fna: writes protein translations (ORF, .faa) and gene
# coordinates (GFF) under <datadir>/<project>/Prodigal/.
rule Prodigal:
input:
"{datadir}/{sample}.fna"
output:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
message:
"Generates ORFs and gff"
params:
# NOTE(review): outdir is declared but not referenced in the shell
# command below — presumably consumed by a cluster profile; confirm.
outdir="{datadir}"
conda:
"../../envs/Prodigal.yaml"
shell:
"""
prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
"""
# Build the gene-to-contig mapping table (<sample>.contig): trim the
# Prodigal FASTA headers down to their ID= values, drop GFF comment
# lines, then join contig IDs (GFF column 1) with gene IDs (column 9,
# ID= attribute).
# NOTE(review): both `sed -i` calls rewrite this rule's *inputs* in
# place, so the rule mutates the Prodigal outputs and is not
# idempotent on rerun — confirm this is intended.
rule mapping_file:
input:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
output:
"{datadir}/{project}/Prodigal/{sample}.contig"
message:
"Generate mapping file"
params:
outdir="{datadir}"
shell:
"""
sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
sed -i '/^#/d' {input.GFF}
cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
"""
...@@ -3,6 +3,44 @@ ...@@ -3,6 +3,44 @@
import glob import glob
import os import os
##########################################
# Generate ORFs and mapping file #
##########################################
rule Prodigal:
input:
"{datadir}/{sample}.fna"
output:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
message:
"Generates ORFs and gff"
params:
outdir="{datadir}"
conda:
"../../envs/Prodigal.yaml"
shell:
"""
prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
"""
rule mapping_file:
input:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
output:
"{datadir}/{project}/Prodigal/{sample}.contig"
message:
"Generate mapping file"
params:
outdir="{datadir}"
shell:
"""
sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
sed -i '/^#/d' {input.GFF}
cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
"""
############################## ##############################
# Modify fasta input # # Modify fasta input #
############################## ##############################
...@@ -10,7 +48,7 @@ import os ...@@ -10,7 +48,7 @@ import os
# Generate unique ID number for each sequence # Generate unique ID number for each sequence
rule generate_ID: rule generate_ID:
input: input:
"{datadir}/{sample}.faa" "{datadir}/{project}/Prodigal/{sample}.faa"
output: output:
"{datadir}/{project}/renamed/{sample}_ID.faa" "{datadir}/{project}/renamed/{sample}_ID.faa"
message: message:
...@@ -47,21 +85,22 @@ rule generate_translation: ...@@ -47,21 +85,22 @@ rule generate_translation:
############################### ###############################
# Split fasta file # Split fasta file
#checkpoint splitting:
checkpoint splitting: checkpoint splitting:
input: input:
"{datadir}/{project}/renamed/{sample}_ID.faa" "{datadir}/{project}/renamed/{sample}_ID.faa"
output: output:
splits=directory("{datadir}/{project}/splitted/{sample}/") splits=directory("{datadir}/{project}/splitted/{sample}/")
log: # log:
"{datadir}/{project}/splitted/{sample}.log" # "{datadir}/{project}/splitted/{sample}.log"
params: params:
outdir="{datadir}", outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"], runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"], mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"] split=config["pathofact"]["size_fasta"]
conda: conda:
"../../envs/SeqKit.yaml" "../../envs/Biopython.yaml"
shell: shell:
""" """
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample} &> {log} python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
""" """
...@@ -6,7 +6,7 @@ import os ...@@ -6,7 +6,7 @@ import os
#Run SignalP on split sequence files #Run SignalP on split sequence files
rule signalp: rule signalp:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/SignalP/{sample}/{file_i}.txt" "{datadir}/{project}/SignalP/{sample}/{file_i}.txt"
log: log:
...@@ -58,7 +58,7 @@ def aggregate_input(wildcards): ...@@ -58,7 +58,7 @@ def aggregate_input(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
# Join multiple SignalP files in a single file # Join multiple SignalP files in a single file
......
...@@ -7,7 +7,7 @@ import os ...@@ -7,7 +7,7 @@ import os
rule run_HMM_vir: rule run_HMM_vir:
input: input:
hmm=config["pathofact"]["vir_hmm"], hmm=config["pathofact"]["vir_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa" renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan" "{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan"
log: log:
...@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards): ...@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule HMM_correct_format_2_vir: rule HMM_correct_format_2_vir:
...@@ -158,7 +158,7 @@ rule HMM_VIR_finalformat: ...@@ -158,7 +158,7 @@ rule HMM_VIR_finalformat:
########################## ##########################
rule AAC: rule AAC:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt" "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt"
log: log:
...@@ -173,7 +173,7 @@ rule AAC: ...@@ -173,7 +173,7 @@ rule AAC:
rule DPC: rule DPC:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt" "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt"
log: log:
...@@ -188,7 +188,7 @@ rule DPC: ...@@ -188,7 +188,7 @@ rule DPC:
rule CTDC: rule CTDC:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt" "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt"
log: log:
...@@ -203,7 +203,7 @@ rule CTDC: ...@@ -203,7 +203,7 @@ rule CTDC:
rule CTDT: rule CTDT:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt" "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt"
log: log:
...@@ -218,7 +218,7 @@ rule CTDT: ...@@ -218,7 +218,7 @@ rule CTDT:
rule CTDD: rule CTDD:
input: input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa" "{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output: output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt" "{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt"
log: log:
...@@ -287,7 +287,7 @@ def aggregate_classifier(wildcards): ...@@ -287,7 +287,7 @@ def aggregate_classifier(wildcards):
datadir=wildcards.datadir, datadir=wildcards.datadir,
project=wildcards.project, project=wildcards.project,
sample=wildcards.sample, sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
) )
rule format_classifier: rule format_classifier:
......
#!/usr/bin/env python
import argparse
import itertools
import os

from Bio import SeqIO
def batch_iterator(iterator, batch_size):
    """Yield successive lists of up to ``batch_size`` items from ``iterator``.

    Parameters
    ----------
    iterator : iterable
        Any iterable; here a ``SeqIO.parse`` record iterator.
    batch_size : int
        Maximum number of items per yielded batch.

    Yields
    ------
    list
        Consecutive non-empty batches; all but possibly the last hold
        exactly ``batch_size`` items. Nothing is yielded for an empty
        input.
    """
    # Bug fix: the previous implementation used the truthiness of the
    # most recently fetched *item* as its loop condition, so any falsy
    # item (0, "", None, ...) silently truncated the stream. islice
    # batches purely on count and has no such trap.
    iterator = iter(iterator)
    while True:
        batch = list(itertools.islice(iterator, batch_size))
        if not batch:
            return
        yield batch
if __name__ == "__main__":
    # CLI: split a FASTA file into numbered chunks of at most
    # `split_size` records each, written as group_<N>.fasta in `out`.
    parser = argparse.ArgumentParser(description='Split fasta')
    parser.add_argument('infile', metavar='input', type=str,
                        help='The input file (FASTA format)')
    parser.add_argument('split_size', metavar='split_size', type=int,
                        help='Size of splitted fasta files')
    parser.add_argument('out', metavar="output_directory", type=str,
                        help='Location of the output directory')
    args = parser.parse_args()

    # makedirs handles nested paths and is race-free with exist_ok;
    # the old bare `try: os.stat / except: os.mkdir` swallowed every
    # error (permissions included) indiscriminately.
    os.makedirs(args.out, exist_ok=True)

    # Close the input handle deterministically instead of relying on
    # garbage collection of the anonymous open().
    with open(args.infile) as in_handle:
        record_iter = SeqIO.parse(in_handle, "fasta")
        for i, batch in enumerate(batch_iterator(record_iter, args.split_size), start=1):
            filename = "group_%i.fasta" % i
            complete_name = os.path.join(args.out, filename)
            with open(complete_name, "w") as handle:
                count = SeqIO.write(batch, handle, "fasta")
            print("Wrote %i records to %s" % (count, filename))
File mode changed from 100644 to 100755
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment