Commit f997d34e authored by Laura Denies

Merge branch 'PathoFact_updates' of https://git-r3lab.uni.lu/laura.denies/PathoFact into PathoFact_updates
parents 7b3ea2ae 5d05b9d0
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=main
- ca-certificates=2019.5.15=0
- certifi=2019.6.16=py36_1
- libedit=3.1.20181209=hc058e9b_0
- libffi=3.2.1=hd88cf55_4
- libgcc-ng=9.1.0=hdf63c60_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- ncurses=6.1=he6710b0_1
- openssl=1.1.1c=h7b6447c_1
- pip=19.1.1=py36_0
- python=3.6.9=h265db76_0
- readline=7.0=h7b6447c_5
- seqkit=0.10.2=0
- setuptools=41.0.1=py36_0
- sqlite=3.29.0=h7b6447c_0
- tk=8.6.8=hbc83047_0
- wheel=0.33.4=py36_0
- xz=5.2.4=h14c3975_4
- zlib=1.2.11=h7b6447c_3
pathofact:
sample: ["SAMPLE_A","SAMPLE_B"] # requires user input
project: Project_A_PathoFact # requires user input
datadir: /path/to/samples # requires user input
workflow: "complete"
size_fasta: 10000
sample: ["test_sample"] # requires user input
project: PathoFact_update_trial # requires user input
datadir: ../test_dataset # requires user input
workflow: "Vir"
size_fasta: 1000
scripts: "scripts"
signalp: "/path/to/signalp-4.1/signalp" # requires user input
deeparg: "submodules/deeparg-ss/deepARG.py"
......
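The new "scripts" key points the workflow at its bundled helper scripts. A minimal sketch (Snakemake syntax; the rule name and paths are illustrative, not from the repository) of how the updated split steps below consume it together with "size_fasta":

# Hedged sketch: how config["pathofact"]["scripts"] and
# config["pathofact"]["size_fasta"] feed the new split step.
rule split_example:
    input:
        "renamed/sample_ID.faa"
    output:
        directory("splitted/sample/")
    shell:
        """
        python {config[pathofact][scripts]}/split.py {input} {config[pathofact][size_fasta]} {output}
        """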
name: Prodigal
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- libgcc-ng=9.2.0=h24d8f2e_2
- libgomp=9.2.0=h24d8f2e_2
- prodigal=2.6.3=h516909a_2
@@ -9,7 +9,7 @@ import os
rule run_deepARG:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
temp("{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG")
log:
@@ -30,7 +30,7 @@ def aggregate_AMR(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule aggregate_deepARG:
......
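For context, the aggregation functions these hunks touch follow Snakemake's checkpoint idiom; a hedged sketch of the assumed full shape (the surrounding rule bodies are elided in this view):

# Hedged sketch: assumed full form of aggregate_AMR; the checkpoint name
# "splitting" and its "splits" output match the checkpoint defined below.
def aggregate_AMR(wildcards):
    checkpoint_output = checkpoints.splitting.get(**wildcards).output.splits
    return expand(
        "{datadir}/{project}/AMR/deepARG_results/{sample}/{file_i}.out.mapping.ARG",
        datadir=wildcards.datadir,
        project=wildcards.project,
        sample=wildcards.sample,
        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i,
    )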
@@ -13,7 +13,7 @@ rule combine_AMR_plasmid:
AMR_translation="{datadir}/{project}/renamed/{sample}_translation.tsv",
Plasmid="{datadir}/{project}/MGE/plasmid/{sample}_plasflow_prediction_final.tsv",
Contig_translation="{datadir}/{project}/renamed/{sample}_Contig_translation.tsv",
Contig_gene_list="{datadir}/{sample}.contig",
Contig_gene_list="{datadir}/{project}/Prodigal/{sample}.contig",
VirFinder="{datadir}/{project}/MGE/phage/{sample}_VirFinder_aggregated.csv",
VirSorter="{datadir}/{project}/MGE/phage/{sample}_VIRSorter_aggregated.csv"
output:
......
@@ -14,18 +14,18 @@ checkpoint splitphage:
"{datadir}/{project}/renamed/{sample}_Contig_ID.fna"
output:
split=directory("{datadir}/{project}/contig_splitted/{sample}/")
-    log:
-        "{datadir}/{project}/contig_splitted/{sample}.log"
+    # log:
+    #     "{datadir}/{project}/contig_splitted/{sample}.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["medium"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
-        seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample} &> {log}
+        python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/contig_splitted/{wildcards.sample}
"""
rule run_VirSorter:
@@ -62,7 +62,7 @@ rule aggregate_VirSorter:
# VIRFINDER Prediction
rule run_VirFinder:
input:
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fna"
"{datadir}/{project}/contig_splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/MGE/phage/{sample}/virfinder/{file_i}.fna_gt1bp_dvfpred.txt"
log:
@@ -85,7 +85,7 @@ def aggregate_VirFinder(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule aggregate_VirFinder:
......
@@ -38,16 +38,16 @@ checkpoint splitplasmid:
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
-        seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample} &> {log}
+        python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/MGE/plasmid_splitted/{wildcards.sample}
"""
# PlasFlow Plasmid prediction
rule run_PLASMID:
input:
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fna"
"{datadir}/{project}/MGE/plasmid_splitted/{sample}/{file_i}.fasta"
output:
temp("{datadir}/{project}/MGE/plasmid/{sample}/{file_i}_plasflow_prediction.tsv")
log:
@@ -71,7 +71,7 @@ def aggregate_plasmid_input(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fna")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule Plasmid_aggregate:
......
@@ -7,7 +7,7 @@ import os
rule run_HMM_tox:
input:
hmm=config["pathofact"]["tox_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa"
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/TOXIN/HMM_toxin/{sample}/{file_i}.hmmscan"
log:
@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule HMM_correct_format_2:
......
#Prepare fasta
import glob
import os
configfile: "config.yaml"
rule all:
input: expand("{datadir}/{project}/Prodigal/{sample}.contig", datadir=config["pathofact"]["datadir"], project=config["pathofact"]["project"], sample=config["pathofact"]["sample"])
# Generate ORFs and GFF
# Generate unique ID number for each sequence
rule Prodigal:
input:
"{datadir}/{sample}.fna"
output:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
message:
"Generates ORFs and gff"
params:
outdir="{datadir}"
conda:
"../../envs/Prodigal.yaml"
shell:
"""
prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
"""
rule mapping_file:
input:
ORF="{datadir}/{project}/Prodigal/{sample}.faa",
GFF="{datadir}/{project}/Prodigal/{sample}.gff"
output:
"{datadir}/{project}/Prodigal/{sample}.contig"
message:
"Generate mapping file"
params:
outdir="{datadir}"
shell:
"""
sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
sed -i '/^#/d' {input.GFF}
cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
"""
@@ -3,6 +3,44 @@
import glob
import os
+##########################################
+# Generate ORFs and mapping file         #
+##########################################
+rule Prodigal:
+    input:
+        "{datadir}/{sample}.fna"
+    output:
+        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
+        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
+    message:
+        "Generates ORFs and gff"
+    params:
+        outdir="{datadir}"
+    conda:
+        "../../envs/Prodigal.yaml"
+    shell:
+        """
+        prodigal -i {input} -o {output.GFF} -a {output.ORF} -f gff -p meta
+        """
+rule mapping_file:
+    input:
+        ORF="{datadir}/{project}/Prodigal/{sample}.faa",
+        GFF="{datadir}/{project}/Prodigal/{sample}.gff"
+    output:
+        "{datadir}/{project}/Prodigal/{sample}.contig"
+    message:
+        "Generate mapping file"
+    params:
+        outdir="{datadir}"
+    shell:
+        """
+        sed -i 's/[^>]*ID=//;s/;.*//' {input.ORF}
+        sed -i '/^#/d' {input.GFF}
+        cut -f 1,9 {input.GFF} |cut -d';' -f1| sed 's/ID=//' > {output}
+        """
##############################
# Modify fasta input #
##############################
@@ -10,7 +48,7 @@ import os
# Generate unique ID number for each sequence
rule generate_ID:
input:
"{datadir}/{sample}.faa"
"{datadir}/{project}/Prodigal/{sample}.faa"
output:
"{datadir}/{project}/renamed/{sample}_ID.faa"
message:
@@ -47,21 +85,22 @@ rule generate_translation:
###############################
# Split fasta file
-#checkpoint splitting:
+checkpoint splitting:
input:
"{datadir}/{project}/renamed/{sample}_ID.faa"
output:
splits=directory("{datadir}/{project}/splitted/{sample}/")
-    log:
-        "{datadir}/{project}/splitted/{sample}.log"
+    # log:
+    #     "{datadir}/{project}/splitted/{sample}.log"
params:
outdir="{datadir}",
runtime=config["pathofact"]["runtime"]["short"],
mem=config["pathofact"]["mem"]["normal_mem_per_core_gb"],
split=config["pathofact"]["size_fasta"]
conda:
"../../envs/SeqKit.yaml"
"../../envs/Biopython.yaml"
shell:
"""
-        seqkit split2 -s {params.split} {input} -O {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample} &> {log}
+        python {config[pathofact][scripts]}/split.py {input} {params.split} {wildcards.datadir}/{wildcards.project}/splitted/{wildcards.sample}
"""
@@ -6,7 +6,7 @@ import os
#Run SignalP on split sequence files
rule signalp:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/SignalP/{sample}/{file_i}.txt"
log:
@@ -58,7 +58,7 @@ def aggregate_input(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
# Join multiple SignalP files in a single file
......
@@ -7,7 +7,7 @@ import os
rule run_HMM_vir:
input:
hmm=config["pathofact"]["vir_hmm"],
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.faa"
renamed="{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/HMM_virulence/{sample}/{file_i}.hmmscan"
log:
@@ -51,7 +51,7 @@ def aggregate_hmm(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule HMM_correct_format_2_vir:
@@ -158,7 +158,7 @@ rule HMM_VIR_finalformat:
##########################
rule AAC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_AAC.txt"
log:
@@ -173,7 +173,7 @@ rule AAC:
rule DPC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_DPC.txt"
log:
@@ -188,7 +188,7 @@ rule DPC:
rule CTDC:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDC.txt"
log:
@@ -203,7 +203,7 @@ rule CTDC:
rule CTDT:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDT.txt"
log:
@@ -218,7 +218,7 @@ rule CTDT:
rule CTDD:
input:
"{datadir}/{project}/splitted/{sample}/{file_i}.faa"
"{datadir}/{project}/splitted/{sample}/{file_i}.fasta"
output:
"{datadir}/{project}/VIRULENCE/classifier_virulence/{sample}/{file_i}_CTDD.txt"
log:
@@ -287,7 +287,7 @@ def aggregate_classifier(wildcards):
datadir=wildcards.datadir,
project=wildcards.project,
sample=wildcards.sample,
-        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.faa")).i
+        file_i=glob_wildcards(os.path.join(checkpoint_output, "{i}.fasta")).i
)
rule format_classifier:
......
#!/usr/bin/env python
from Bio import SeqIO
import argparse
import os
def batch_iterator(iterator, batch_size):
    # Yield successive lists of at most batch_size records from iterator
entry = True # Make sure we loop once
while entry:
batch = []
while len(batch) < batch_size:
try:
entry = iterator.__next__()
except StopIteration:
entry = None
if entry is None:
# End of file
break
batch.append(entry)
if batch:
yield batch
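# Behavior note (illustration only, not used by the script):
#   list(batch_iterator(iter(range(5)), 2)) -> [[0, 1], [2, 3], [4]]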
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Split fasta')
parser.add_argument('infile', metavar='input', type=str, help='The input file (FASTA format)')
    parser.add_argument('split_size', metavar='split_size', type=int, help='Number of records per split FASTA file')
parser.add_argument('out', metavar="output_directory", type=str, help='Location of the output directory')
args = parser.parse_args()
    # Create the output directory if it does not already exist
    os.makedirs(args.out, exist_ok=True)
    record_iter = SeqIO.parse(args.infile, "fasta")
for i, batch in enumerate(batch_iterator(record_iter, args.split_size)):
save_to_path = args.out
filename = "group_%i.fasta" % (i + 1)
complete_name = os.path.join(save_to_path, filename)
with open(complete_name, "w") as handle:
count = SeqIO.write(batch, handle, "fasta")
print("Wrote %i records to %s" % (count, filename))
File mode changed from 100644 to 100755