Commit 6cbf5ed6 authored by Valentina Galata's avatar Valentina Galata
Browse files

init setup for preprocessing and assembly

parent ab685564
......@@ -39,7 +39,7 @@ INPUT_FAST5 = []
INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
for ont_d in config["data"]["ont"]["dirs"]:
INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# SR files
INPUT_SR = config["data"]["sr"]
# MetaT files
......@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"]
DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5]
assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
DATA_SR = {
"r1": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r1"])),
"r2": os.path.join(RESULTS_DIR, "input_sr", os.path.basename(INPUT_SR["r2"]))
"r1": os.path.join(RESULTS_DIR, "input_sr/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_sr/R2.fq.gz")
}
DATA_METAT = {
"r1": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r1"])),
"r2": os.path.join(RESULTS_DIR, "input_metat", os.path.basename(INPUT_METAT["r2"]))
"r1": os.path.join(RESULTS_DIR, "input_metat/R1.fq.gz"),
"r2": os.path.join(RESULTS_DIR, "input_metat/R2.fq.gz")
}
# # Tools
# ASSEMBLERS = config["assemblers"]
# HYBRID_ASSEMBLER = config["hybrid_assembler"]
# MAPPERS = ["bwa", "mmi"]
##############################
# TARGETS & RULES
# List of targets to be created
......@@ -74,11 +69,22 @@ include:
"workflow/steps/prepare_input.smk"
# TARGETS.append("status/prepare_input.done")
# Basecalling
# Preprocessing
if "preprocessing" in STEPS:
include:
"workflow/steps/preprocessing.smk"
TARGETS.append("status/preprocessing_lr.done")
TARGETS += [
"status/preprocessing_lr.done",
"status/preprocessing_sr.done"
]
# Assembly
if "assembly" in STEPS:
include:
"workflow/steps/assembly.smk"
TARGETS += [
"status/assembly.done"
]
# # Assembly annotation
# if 'assembly_annotation' in STEPS:
......
......@@ -3,7 +3,7 @@
# Pipeline steps
# steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
steps: ["preprocessing"]
steps: ["preprocessing", "assembly"]
# Analysis sub-steps
analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
......@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir: "/scratch/users/vgalata/ont_pilot"
# paths within the working directory
# Paths WITHIN the working directory
# directory containing required DBs
db_dir: "dbs"
# results directory
......@@ -38,16 +38,14 @@ data:
# List of FAST5 files
files: []
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
# Basecalling
# Preprocessing: LR: Basecalling
guppy:
config:
methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# cpu:
# path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
# bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
......@@ -63,8 +61,7 @@ guppy:
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
# threads: 28
threads: 5
threads: 20
# barcoder:
# path: "/home/users/sbusi/apps/ont-guppy/bin"
# bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
......@@ -72,132 +69,147 @@ guppy:
# records_per_fastq: 8000
# threads: 8
# assemblers: ["flye"]
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads: 4
ont_fast5_api:
single_to_multi_fast5:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
nanostats:
# Preprocessing: SR
fastp:
threads: 10
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# QC
fastqc:
threads: 10
params: "-q -f fastq"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
# Assembly
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
flye:
bin: "flye"
threads: 27
threads: 10
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
metaspades:
threads: 10
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Number of cpus or threads to use
threads: 28
# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
kraken2:
db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Binning
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# XXX
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# XXX
mmseqs:
path: "/home/users/sbusi/apps/mmseqs/bin"
createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# CRISPR
CASC:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
minced:
PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# Plasmid prediction
plasflow:
threshold: 0.7 # class. prob. threshold
minlen: 1000 # rm contigs with length below this threshold
# AMR prediction
rgi:
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND" # DIAMOND or BLAST
threads: 10
# p7zip:
# bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
# threads: 4
# ont_fast5_api:
# single_to_multi_fast5:
# bin: "single_to_multi_fast5"
# batch: 8000
# threads: 8
# minimap2:
# threads: 16
# igc:
# uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
# hg38:
# uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
# genomecov:
# bin: "bedtools genomecov"
# compute_avg_coverage:
# bin: "scripts/coverage.awk"
# bwa:
# threads: 24
# long_reads_index:
# opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# samtools:
# sort:
# threads: 4
# chunk_size: "4G"
# view:
# threads: 4
# flye:
# bin: "flye"
# threads: 27
# genome_size: "1g"
# operams:
# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
# threads: 28
# megahit:
# threads: 28
# nonpareil:
# memory: 4096
# threads: 14
# medaka:
# threads: 28
# racon:
# threads: 28
# rebaler:
# threads: 28
# diamond:
# threads: 28
# #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
# db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
# metaspades:
# threads: 28
# mmseq2:
# threads: 24
# # Hybrid assembler
# hybrid_assembler: "metaspades_hybrid"
# # Number of cpus or threads to use
# threads: 28
# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# # Binning
# DAS_Tool:
# path: "/home/users/sbusi/apps/DAS_Tool-master"
# bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# # XXX
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# # XXX
# mmseqs:
# path: "/home/users/sbusi/apps/mmseqs/bin"
# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# # CRISPR
# CASC:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
# PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
# minced:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# # Plasmid prediction
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# # AMR prediction
# rgi:
# db_url: "https://card.mcmaster.ca/latest/data"
# alignment_tool: "DIAMOND" # DIAMOND or BLAST
__default__:
time: "2-00:00:00"
time: "0-02:00:00"
partition: "batch"
qos: "qos-batch"
nodes: 1
n: 1
# ncpus: 1
job-name: "ONT_pilot.{rule}"
explicit: ""
# job-name: "ONT_pilot.{rule}"
# output: "slurm-%j.%N-%x.out"
# error: "slurm-%j.%N-%x.err"
explicit: ""
# mail-type: "end"
# Preprocessing
guppy_gpu_basecalling:
time: "01-00:00:00"
partition: "gpu"
qos: "qos-gpu"
nodes: 1
n: 1
# ncpus: 28
explicit: "--gres=gpu:1"
# "run_fastp_on_short_reads":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 3,
# "partition": "batch",
# "qos": "qos-batch",
# "mail-type": "ALL"
# },
fastp_sr:
time: "00-04:00:00"
partition: "batch"
qos: "qos-batch"
nodes: 1
n: 1
explicit: ""
# Assembly
assembly_lr_flye:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_megahit:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assembly_sr_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
assemble_hy_metaspades:
time: "01-00:00:00"
partition: "bigmem"
qos: "qos-bigmem"
nodes: 1
n: 1
explicit: ""
# "mmseq2_compare":
# {
# "n": 1,
......
......@@ -18,7 +18,7 @@
# conda env name
ONTP_ENV="ONT_pilot"
# number of cores for snakemake
ONTP_CORES=30
ONTP_CORES=10
# IMP config file
ONTP_CONFIG="config/config.yaml" # USER INPUT REQUIRED
# slurm config file
......
# Assembly
# Long reads
rule assembly_lr_flye:
input:
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna")
threads:
config["flye"]["threads"]
log:
out="logs/assembly_lr.flye.out.log",
err="logs/assembly_lr.flye.err.log"
conda:
"../envs/flye_v2_7.yaml"
message:
"Assembly: long reads: Flye"
shell:
"(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})"
# Short reads
rule assembly_sr_megahit:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna")
log:
out="logs/assembly_sr.megahit.out.log",
err="logs/assembly_sr.megahit.err.log"
threads:
config["megahit"]["threads"]
conda:
os.path.join(ENV_DIR, "megahit.yaml")
message:
"Assembly: short reads: MEGAHIT"
shell:
"(date && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})"
rule assembly_sr_metaspades:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna")
log:
out="logs/assembly_sr.metaspades.out.log",
err="logs/assembly_sr.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: short reads: MetaSPAdes"
shell:
"(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Hybrid
rule assemble_hy_metaspades:
input:
lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna")
log:
out="logs/assembly_hy.metaspades.out.log",
err="logs/assembly_hy.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: hybrid: MetaSPAdes"
shell:
"(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Preprocessing of long reads
# Basecalling
# Guppy
checkpoint guppy_gpu_basecalling:
input:
DATA_FAST5
output:
directory(os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints"))
directory(os.path.join(RESULTS_DIR, "preproc/lr/checkpoints"))
log:
out="logs/basecalling.{guppy_config}.out.log",
err="logs/basecalling.{guppy_config}.err.log"
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
out="logs/preproc_lr.guppy.out.log",
err="logs/preproc_lr.guppy.err.log"
threads:
config["guppy"]["gpu"]["threads"]
params:
config=lambda wildcards: config["guppy"]["config"][wildcards.guppy_config]
message:
"Preprocessing long reads: Basecalling w/ Guppy"
shell:
"""
(date && \
{config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \
--config {params.config} \
--config config[guppy][config] \
--disable_pings --compress_fastq \
--cpu_threads_per_caller {threads} \
-x {config[guppy][gpu][gpu_device]} \
......@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling:
def aggregate_guppy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0]
return expand(
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
guppy_config=wildcards.guppy_config,
os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j,
)
......@@ -42,22 +40,24 @@ rule merge_guppy_basecalling:
input:
aggregate_guppy_basecalling
output:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
message:
"Preprocessing long reads: Cat FASTQ"
shell:
"cat $(echo \"{input}\" | sort) > {output}"
# QC
rule nanostat:
rule nanostat_guppy_basecalling:
input:
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/lr.fastq.gz")
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "qc/lr/{guppy_config}/NanoStats.txt")
os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
log:
out="logs/nanostats.{guppy_config}.out.log",
err="logs/nanostats.{guppy_config}.err.log"
out="logs/preproc_lr.nanostats.out.log",
err="logs/preproc_lr.nanostats.err.log"
conda:
os.path.join(ENV_DIR, "nanostat.yaml")
message:
"Preprocessing long reads: NanoStats"
shell:
"(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}"
# Preprocessing of short reads
# Preprocess the short reads using fastp
rule fastp_sr:
input:
r1=DATA_SR["r1"],
r2=DATA_SR["r2"]
output:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"),
html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"),
json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json")
log:
out="logs/preproc_sr.fastp.out.log",
err="logs/preproc_sr.fastp.err.log"
threads:
config["fastp"]["threads"]
conda:
os.path.join(ENV_DIR, "fastp.yaml")
message:
"Preprocessing short reads: FastP"
shell:
"(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}"
rule fastqc_fastp_sr:
input:
os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz")