Commit 334acc25 authored by Valentina Galata's avatar Valentina Galata
Browse files

added circ_contigs_fasta.py (issue #53); added kraken2 (tax) (issue #55)

parent 1ebb366e
############################################################
# STEPS
# Pipeline steps
# NOTE: no binning and taxonomic analysis
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
# NOTE: currently not used
# Annotation sub-steps
# annotation_steps: ["plasmids", "crispr", "amr"]
# NOTE: currently not used
# Analysis sub-steps
# analysis_steps: ["quast", "prodigal", "cdhit", "mmseqs2"]
# Pipeline steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps: ["taxonomy"]
############################################################
# INPUT
# working directory: will contain the results
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
# working directory: will contain the results (should be writeable)
work_dir: "/scratch/users/vgalata/ont_pilot"
# Paths WITHIN the working directory
# directory containing required DBs
# directory containing required DBs (should be writeable)
db_dir: "dbs"
# results directory
# results directory (will be created in work_dir)
results_dir: "results"
# Data paths: Use absolute paths or paths relative to the working directory !!!
......@@ -48,8 +38,6 @@ data:
metap:
# TODO
# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
......@@ -209,9 +197,16 @@ mummer:
##############################
# Taxonomy
# XXX
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# https://ccb.jhu.edu/software/kraken2/
# https://github.com/DerrickWood/kraken2
kraken2:
threads: 10
db:
maxikraken: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
class:
sr: "--gzip-compressed --paired"
lr: "" # TODO
contigs: "" # TODO
# # XXX
# GTDBTK:
......
......@@ -8,7 +8,7 @@
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH --time=0-10:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......
......@@ -159,6 +159,30 @@ annotation_plasflow:
n: 1
explicit: ""
kraken2_contigs:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
kraken2_sr:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
kraken2_lr:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
# "mmseq2_compare":
# {
# "n": 1,
......
############################################################
# STEPS
# Pipeline steps
# NOTE: no binning and taxonomic analysis
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
# NOTE: currently not used
# Annotation sub-steps
# annotation_steps: ["plasmids", "crispr", "amr"]
# NOTE: currently not used
# Analysis sub-steps
# analysis_steps: ["quast", "prodigal", "cdhit", "mmseqs2"]
# Pipeline steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps: ["taxonomy"]
############################################################
# INPUT
# working directory: will contain the results
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/mock"
# Paths WITHIN the working directory
......@@ -48,8 +38,6 @@ data:
metap:
# TODO
# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
......@@ -209,9 +197,16 @@ mummer:
##############################
# Taxonomy
# XXX
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# https://ccb.jhu.edu/software/kraken2/
# https://github.com/DerrickWood/kraken2
kraken2:
threads: 10
db:
maxikraken: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
class:
sr: "--gzip-compressed --paired"
lr: "" # TODO
contigs: "" # TODO
# # XXX
# GTDBTK:
......
......@@ -8,7 +8,7 @@
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH --time=0-10:00:00
#SBATCH -p batch
#SBATCH --qos=qos-batch
......@@ -16,7 +16,7 @@
# SNAKEMAKE
# conda env name
ONTP_ENV="snakemake" # USER INPUT REQUIRED
ONTP_ENV="ONT_pilot"
# number of cores for snakemake
ONTP_CORES=60
# snakemake file
......@@ -35,6 +35,6 @@ ONTP_CLUSTER="-p {cluster.partition} -q {cluster.qos} {cluster.explicit} -N {clu
conda activate ${ONTP_ENV}
# run the pipeline
snakemake --rerun-incomplete -s ${ONTP_SMK} -rp --cores ${ONTP_CORES} --configfile ${ONTP_CONFIG} \
snakemake -s ${ONTP_SMK} -rp --cores ${ONTP_CORES} --configfile ${ONTP_CONFIG} \
--use-conda --conda-prefix ${CONDA_PREFIX}/pipeline \
--cluster-config ${ONTP_SLURM} --cluster "sbatch ${ONTP_CLUSTER}"
......@@ -159,6 +159,30 @@ annotation_plasflow:
n: 1
explicit: ""
kraken2_contigs:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
kraken2_sr:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
kraken2_lr:
time: "00-02:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
# "mmseq2_compare":
# {
# "n": 1,
......
......@@ -129,6 +129,14 @@ if "analysis" in STEPS:
"status/analysis.done"
]
# Taxonomy
if "taxonomy" in STEPS:
include:
"steps/taxonomy.smk"
TARGETS += [
"status/taxonomy.done"
]
# No targets
if len(TARGETS) == 0:
raise Exception('You are not serious. Nothing to be done? Really?')
......
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- _libgcc_mutex=0.1=conda_forge
- _openmp_mutex=4.5=0_gnu
- blast=2.5.0=hc0b0e79_3
- boost=1.73.0=py38hd103949_0
- boost-cpp=1.73.0=h7b93d67_1
- bzip2=1.0.8=h516909a_2
- ca-certificates=2020.6.20=hecda079_0
- certifi=2020.6.20=py38h32f6830_0
- gettext=0.19.8.1=hc5be6a0_1002
- icu=67.1=he1b5a44_0
- kraken2=2.0.9beta=pl526hc9558a2_0
- ld_impl_linux-64=2.34=h53a641e_5
- libblas=3.8.0=14_openblas
- libcblas=3.8.0=14_openblas
- libffi=3.2.1=he1b5a44_1007
- libgcc-ng=9.2.0=h24d8f2e_2
- libgfortran-ng=7.5.0=hdf63c60_6
- libgomp=9.2.0=h24d8f2e_2
- libiconv=1.15=h516909a_1006
- libidn2=2.3.0=h516909a_0
- liblapack=3.8.0=14_openblas
- libopenblas=0.3.7=h5ec1e0e_6
- libstdcxx-ng=9.2.0=hdf63c60_2
- libunistring=0.9.10=h14c3975_0
- lz4-c=1.9.2=he1b5a44_1
- ncurses=6.1=hf484d3e_1002
- numpy=1.18.5=py38h8854b6b_0
- openssl=1.1.1g=h516909a_0
- perl=5.26.2=h516909a_1006
- pip=20.1.1=py_1
- popt=1.16=h299ea2f_2002
- python=3.8.3=cpython_he5300dc_0
- python_abi=3.8=1_cp38
- readline=8.0=hf8c457e_0
- rsync=3.1.3=hed695b0_1002
- setuptools=47.3.1=py38h32f6830_0
- sqlite=3.32.3=hcee41ef_0
- tar=1.32=hd4ba37b_0
- tk=8.6.10=hed695b0_0
- wget=1.20.1=h22169c7_0
- wheel=0.34.2=py_1
- xz=5.2.5=h516909a_0
- zlib=1.2.11=h516909a_1006
- zstd=1.4.4=h6597ccf_3
# Taxonomic analysis
rule kraken2_contigs:
input:
contigs=os.path.join(RESULTS_DIR, "assembly/{rtype}/{tool}/ASSEMBLY.fasta"),
db=lambda wildcards: config["kraken2"]["db"][wildcards.db]
output:
labels=os.path.join(RESULTS_DIR, "taxonomy/kraken2/{rtype}_{tool}_{db}_labels.txt"),
report=os.path.join(RESULTS_DIR, "taxonomy/kraken2/{rtype}_{tool}_{db}_report.txt")
log:
out="logs/kraken2.{db}.{rtype}.{tool}.out.log",
err="logs/kraken2.{db}.{rtype}.{tool}.err.log"
wildcard_constraints:
rtype="|".join(READ_TYPES),
tool="|".join(ASSEMBLERS),
db="|".join(config["kraken2"]["db"].keys())
threads:
config["kraken2"]["threads"]
params:
params=config["kraken2"]["class"]["contigs"]
conda:
"../envs/kraken2.yaml"
message:
"Classification w/ Kraken2 ({wildcards.db})"
shell:
"(date && "
"kraken2 --db {input.db} --threads {threads} --use-names {params.params} --report {output.report} --output {output.labels} {input.contigs} && "
"date) 2> {log.err} > {log.out}"
rule kraken2_sr:
input:
r1=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/{mtype}/sr/R2.fastp.fastq.gz"),
db=lambda wildcards: config["kraken2"]["db"][wildcards.db]
output:
labels=os.path.join(RESULTS_DIR, "taxonomy/kraken2/{mtype}_sr_{db}_labels.txt"),
report=os.path.join(RESULTS_DIR, "taxonomy/kraken2/{mtype}_sr_{db}_report.txt")
log:
out="logs/kraken2.{db}.{mtype}.sr.out.log",
err="logs/kraken2.{db}.{mtype}.sr.err.log"
wildcard_constraints:
mtype="|".join(META_TYPES),
db="|".join(config["kraken2"]["db"].keys())
threads:
config["kraken2"]["threads"]
params:
params=config["kraken2"]["class"]["sr"]
conda:
"../envs/kraken2.yaml"
message:
"Classification w/ Kraken2 ({wildcards.db})"
shell:
"(date && "
"kraken2 --db {input.db} --threads {threads} --use-names {params.params} --report {output.report} --output {output.labels} {input.r1} {input.r2} && "
"date) 2> {log.err} > {log.out}"
rule kraken2_lr:
input:
lr=os.path.join(RESULTS_DIR, "preproc/metag/lr/lr.fastq.gz"),
db=lambda wildcards: config["kraken2"]["db"][wildcards.db]
output:
labels=os.path.join(RESULTS_DIR, "taxonomy/kraken2/metag_lr_{db}_labels.txt"),
report=os.path.join(RESULTS_DIR, "taxonomy/kraken2/metag_lr_{db}_report.txt")
log:
out="logs/kraken2.{db}.metag.lr.out.log",
err="logs/kraken2.{db}.metag.lr.err.log"
wildcard_constraints:
rtype="|".join(READ_TYPES),
tool="|".join(ASSEMBLERS),
db="|".join(config["kraken2"]["db"].keys())
threads:
config["kraken2"]["threads"]
params:
params=config["kraken2"]["class"]["contigs"]
conda:
"../envs/kraken2.yaml"
message:
"Classification w/ Kraken2 ({wildcards.db})"
shell:
"(date && "
"kraken2 --db {input.db} --threads {threads} --use-names {params.params} --report {output.report} --output {output.labels} {input.lr} && "
"date) 2> {log.err} > {log.out}"
\ No newline at end of file
#!/usr/bin/env python
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
with open(snakemake.input[0], "r") as ifile, open(snakemake.output.split1, "w") as ofile1, open(snakemake.output.split2, "w") as ofile2:
for record in SeqIO.parse(ifile, "fasta"):
n = round(len(record.seq)/2)
record_1 = SeqRecord(record.seq[0:n], id=record.id + "_1", name="", description="")
record_2 = SeqRecord(record.seq[n:], id=record.id + "_2", name="", description="")
SeqIO.write(record_1, ofile1, "fasta")
SeqIO.write(record_2, ofile2, "fasta")
# Annotation
include:
"../rules/taxonomy.smk"
rule TAXONOMY:
input:
"status/taxonomy_kraken2.done",
output:
touch("status/taxonomy.done")
rule TAXONOMY_KRAKEN2:
input:
expand(
os.path.join(RESULTS_DIR, "taxonomy/kraken2/{rtype_tool}_{db}_{otype}.txt"),
rtype_tool=["%s_%s" % (rtype, tool) for rtype, tool in READ_ASSEMBLERS],
db=config["kraken2"]["db"].keys(),
otype=["labels", "report"]
),
expand(
os.path.join(RESULTS_DIR, "taxonomy/kraken2/{mtype}_sr_{db}_{otype}.txt"),
mtype=META_TYPES,
db=config["kraken2"]["db"].keys(),
otype=["labels", "report"]
),
expand(
os.path.join(RESULTS_DIR, "taxonomy/kraken2/metag_lr_{db}_{otype}.txt"),
db=config["kraken2"]["db"].keys(),
otype=["labels", "report"]
),
output:
touch("status/taxonomy_kraken2.done")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment