Commit 5d05b9d0 authored by Laura Denies's avatar Laura Denies
Browse files

add Prodigal and new split script

parent d80a6ad1
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- ca-certificates=2019.5.15=0
- certifi=2019.6.16=py36_1
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- ncurses=6.1=he6710b0_1
- openssl=1.1.1c=h7b6447c_1
- pip=19.1.1=py36_0
- python=3.6.9=h265db76_0
- readline=7.0=h7b6447c_5
- seqkit=0.10.2=0
- setuptools=41.0.1=py36_0
- sqlite=3.29.0=h7b6447c_0
- tk=8.6.8=hbc83047_0
- wheel=0.33.4=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
pathofact:
sample: ["SAMPLE_A","SAMPLE_B"] # requires user input
project: Project_A_PathoFact # requires user input
datadir: /path/to/samples # requires user input
workflow: "complete"
size_fasta: 10000
sample: ["test_sample"] # requires user input
project: PathoFact_update_trial # requires user input
datadir: ../test_dataset # requires user input
workflow: "Vir"
size_fasta: 1000
scripts: "scripts"
signalp: "/path/to/signalp-4.1/signalp" # requires user input
deeparg: "submodules/deeparg-ss/deepARG.py"
......
name: Prodigal
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- libgcc-ng=9.2.0=h24d8f2e_2
- libgomp=9.2.0=h24d8f2e_2
- prodigal=2.6.3=h516909a_2
......@@ -9,7 +9,7 @@ import os
rule run_deepARG:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG")
log:
......@@ -30,7 +30,7 @@ def aggregate_AMR(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule aggregate_deepARG:
......
......@@ -13,7 +13,7 @@ rule combine_AMR_plasmid:
AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Plasmid="{datadir}/{project}/MGE/plasmid/{sample}_plasflow_prediction_final.tsv",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
Contig_gene_list="{datadir}/{sample}.contig",
Contig_gene_list="{datadir}/{project}/Prodigal/{sample}.contig",
VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
output:
......
......@@ -14,18 +14,18 @@ checkpoint splitphage:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output:
split=directory("{datadir}/{project}/contig_splitted/{sample}/")
log:
"{datadir}/{project}/contig_splitted/{sample}.log"
# log:
# "{datadir}/{project}/contig_splitted/{sample}.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log}
python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample}
"""
rule run_VirSorter:
......@@ -62,7 +62,7 @@ rule aggregate_VirSorter:
# VIRFINDER Prediction
rule run_VirFinder:
input:
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fna"
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt"
log:
......@@ -85,7 +85,7 @@ def aggregate_VirFinder(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule aggregate_VirFinder:
......
......@@ -38,16 +38,16 @@ checkpoint splitplasmid:
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample} &> {log}
python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/plasmid_splitted/{wildcards.sample}
"""
# PlasFlow Plasmid prediction
rule run_PLASMID:
input:
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fna"
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fasta"
output:
temp("{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv")
log:
......@@ -71,7 +71,7 @@ def aggregate_plasmid_input(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule Plasmid_aggregate:
......
......@@ -7,7 +7,7 @@ import os
rule run_HMM_tox:
input:
hmm=config["pathofact"]["tox_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa"
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
log:
......@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule HMM_correct_format_2:
......
# Prepare fasta: predict ORFs with Prodigal for each sample and derive an
# ORF-to-contig mapping file used by downstream PathoFact rules.
import glob
import os

configfile: "config.yaml"

# Default target: the final .contig mapping file for every configured sample.
rule all:
    input: expand("{datadir}/{project}/Prodigal/{sample}.contig", datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"])

# Generate ORFs and GFF
# Generate unique ID number for each sequence
rule Prodigal:
    input:
        "{datadir}/{sample}.fna"
    output:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    message:
        "Generates ORFs and gff"
    params:
        outdir="{datadir}"
    conda:
        "../../envs/Prodigal.yaml"
    shell:
        """
        prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
        """

# Build a two-column (contig ID, gene ID) mapping file from the Prodigal GFF.
rule mapping_file:
    input:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    output:
        "{datadir}/{project}/Prodigal/{sample}.contig"
    message:
        "Generate mapping file"
    params:
        outdir="{datadir}"
    shell:
        """
        # NOTE(review): sed -i rewrites the Prodigal outputs IN PLACE,
        # truncating FASTA headers to the bare ID and stripping GFF comment
        # lines — rerunning this rule on already-cleaned files is a no-op,
        # but the original Prodigal headers are lost.
        sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
        sed -i '/^#/d' {input.GFF}
        cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
        """
......@@ -3,6 +3,44 @@
import glob
import os
##########################################
# Generate ORFs and mapping file #
##########################################
# Predict ORFs (protein FASTA) and gene coordinates (GFF) with Prodigal in
# metagenome mode (-p meta) for one sample's contig file.
rule Prodigal:
    input:
        "{datadir}/{sample}.fna"
    output:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    message:
        "Generates ORFs and gff"
    params:
        outdir="{datadir}"
    conda:
        "../../envs/Prodigal.yaml"
    shell:
        """
        prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
        """

# Build a two-column (contig ID, gene ID) mapping file from the Prodigal GFF.
rule mapping_file:
    input:
        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
    output:
        "{datadir}/{project}/Prodigal/{sample}.contig"
    message:
        "Generate mapping file"
    params:
        outdir="{datadir}"
    shell:
        """
        # NOTE(review): sed -i edits the declared INPUT files in place
        # (header truncation / comment stripping); Snakemake will not
        # re-trigger on this mutation, but the original files are altered.
        sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
        sed -i '/^#/d' {input.GFF}
        cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
        """
##############################
# Modify fasta input #
##############################
......@@ -10,7 +48,7 @@ import os
# Generate unique ID number for each sequence
rule generate_ID:
input:
"{datadir}/{sample}.faa"
"{datadir}/{project}/Prodigal/{sample}.faa"
output:
"{datadir}/{project}/renamed/{sample}_ID.faa"
message:
......@@ -47,21 +85,22 @@ rule generate_translation:
###############################
# Split fasta file
#checkpoint splitting:
checkpoint splitting:
input:
"{datadir}/{project}/renamed/{sample}_ID.faa"
output:
splits=directory("{datadir}/{project}/splitted/{sample}/")
log:
"{datadir}/{project}/splitted/{sample}.log"
# log:
# "{datadir}/{project}/splitted/{sample}.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample} &> {log}
python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
"""
......@@ -6,7 +6,7 @@ import os
#Run SignalP on split sequence files
rule signalp:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/SignalP/{sample}/{file_i}.txt"
log:
......@@ -58,7 +58,7 @@ def aggregate_input(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
# Join multiple SignalP files in a single file
......
......@@ -7,7 +7,7 @@ import os
rule run_HMM_vir:
input:
hmm=config["pathofact"]["vir_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa"
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan"
log:
......@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule HMM_correct_format_2_vir:
......@@ -158,7 +158,7 @@ rule HMM_VIR_finalformat:
##########################
rule AAC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt"
log:
......@@ -173,7 +173,7 @@ rule AAC:
rule DPC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt"
log:
......@@ -188,7 +188,7 @@ rule DPC:
rule CTDC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt"
log:
......@@ -203,7 +203,7 @@ rule CTDC:
rule CTDT:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt"
log:
......@@ -218,7 +218,7 @@ rule CTDT:
rule CTDD:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt"
log:
......@@ -287,7 +287,7 @@ def aggregate_classifier(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule format_classifier:
......
#!/usr/bin/env python
from Bio import SeqIO
import argparse
import os
def batch_iterator(iterator, batch_size):
    """Group items drawn from *iterator* into lists of up to *batch_size*.

    Every yielded batch is non-empty; only the final batch may contain
    fewer than *batch_size* items.

    Fix: the previous version reused the last fetched item as the outer
    loop condition (``while entry:``), so a falsy item (e.g. ``0`` or
    ``""``) ended iteration early and silently dropped all remaining
    records; it also looped forever when ``batch_size`` was 0.
    """
    while True:
        batch = []
        for _ in range(batch_size):
            try:
                # next() is the idiomatic spelling of iterator.__next__()
                batch.append(next(iterator))
            except StopIteration:
                break
        if not batch:
            # Source exhausted (or batch_size <= 0): stop cleanly.
            return
        yield batch
if __name__ == "__main__":
    # Split a FASTA file into numbered chunks of at most split_size records,
    # written as group_<n>.fasta inside the output directory.
    parser = argparse.ArgumentParser(description='Split fasta')
    parser.add_argument('infile', metavar='input', type=str, help='The input file (FASTA format)')
    parser.add_argument('split_size', metavar='split_size', type=int, help='Size of splitted fasta files')
    parser.add_argument('out', metavar="output_directory", type=str, help='Location of the output directory')
    args = parser.parse_args()

    # Replaces the racy stat-then-mkdir guarded by a bare `except:` — one
    # call, no exception swallowing, safe if the directory already exists.
    os.makedirs(args.out, exist_ok=True)

    # Pass the path directly so SeqIO opens and closes the file itself
    # (the old `open(args.infile)` handle was never closed).
    record_iter = SeqIO.parse(args.infile, "fasta")
    for i, batch in enumerate(batch_iterator(record_iter, args.split_size)):
        filename = "group_%i.fasta" % (i + 1)
        complete_name = os.path.join(args.out, filename)
        with open(complete_name, "w") as handle:
            count = SeqIO.write(batch, handle, "fasta")
        print("Wrote %i records to %s" % (count, filename))
File mode changed from 100644 to 100755
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment