Commit 5feedaab authored by Valentina Galata's avatar Valentina Galata
Browse files

initial fast5 setup, basecalling and nanostat

parent 23ad860f
......@@ -3,17 +3,29 @@
##############################
# MODULES
import os
from pathlib import Path
from tempfile import TemporaryDirectory
##############################
# CONFIG
# can be overwritten by using --configfile <path to config> when calling snakemake
configfile: "config/CONFIG.yaml"
configfile: "config/config.yaml"
# Paths
SRC_DIR = srcdir("workflow/scripts")
ENV_DIR = srcdir("workflow/envs")
DATA_DIR = config["data_dir"]
# default executable for snakmake
shell.executable("bash")
# working directory
workdir:
config["work_dir"]
##############################
# DATA & TOOLS
# DATA_DIR = config["data_dir"]
RESULTS_DIR = config["results_dir"]
DB_DIR = config["db_dir"]
......@@ -22,31 +34,42 @@ STEPS = config['steps']
ANALYSIS_STEPS = config["analysis_steps"]
# Input
BARCODES = config["barcodes"]
RUNS = [
config["runs"]["first"],
config["runs"]["second"],
# config["runs"]["third"]
]
SAMPLES = config["samples"]
BINNING_SAMPLES = config["binning_samples"]
# References
IGC_URI = config["igc"]["uri"]
HG38_URI = config["hg38"]["uri"]
REFERENCES = ["igc", "hg38"]
# Tools
ASSEMBLERS = config["assemblers"]
HYBRID_ASSEMBLER = config["hybrid_assembler"]
MAPPERS = ["bwa", "mmi"]
# default executable for snakmake
shell.executable("bash")
# working directory
workdir:
config["work_dir"]
# ONT files (i.e. FAST5 files): from given directories and files
INPUT_FAST5 = []
INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
for ont_d in config["data"]["ont"]["dirs"]:
INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# SR files
INPUT_SR = config["data"]["sr"]
# MetaT files
INPUT_METAT = config["data"]["metat"]
# Data from input
DATA_FAST5 = [os.path.join(RESULTS_DIR, "input", os.path.basename(f)) for f in INPUT_FAST5]
assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
# DATA_SR = [os.path.join(RESULTS_DIR, "input", os.path.basename(f)) for f in INPUT_SR]
# assert len(DATA_SR) == len(set(DATA_SR)), "Created link names for SR files are NOT unique: {}".format(DATA_FAST5)
# # Input
# BARCODES = config["barcodes"]
# RUNS = [
# config["runs"]["first"],
# config["runs"]["second"],
# # config["runs"]["third"]
# ]
# SAMPLES = config["samples"]
# BINNING_SAMPLES = config["binning_samples"]
# # References
# IGC_URI = config["igc"]["uri"]
# HG38_URI = config["hg38"]["uri"]
# REFERENCES = ["igc", "hg38"]
# # Tools
# ASSEMBLERS = config["assemblers"]
# HYBRID_ASSEMBLER = config["hybrid_assembler"]
# MAPPERS = ["bwa", "mmi"]
##############################
# TARGETS & RULES
......@@ -55,84 +78,95 @@ TARGETS = []
# Include rules and add targets based on the config file
# Assembly annotation
if 'assembly_annotation' in STEPS:
include:
"workflow/steps/assembly_annotation.smk"
TARGETS += [
"assemble_and_coverage.done",
"annotate.done",
"basecall_merge_qc.done",
"coverage_of_references.done",
"prodigal_gene_call.done",
"diamond_proteins.done"
]
if 'checkpoint_assembly_annotation' in STEPS:
include:
"workflow/steps/checkpoint_assembly_annotation.smk"
TARGETS += [
"assemble_and_coverage.done",
"annotate.done",
"basecall_merge_qc.done",
"basecall_merge_qc_NO_MOD.done",
"coverage_of_references.done",
"prodigal_gene_call.done",
"diamond_proteins.done",
"dummy_folders_created.done"
]
# MMSEQ2
if 'mmseq' in STEPS:
include:
"workflow/steps/mmseq.smk"
TARGETS += [
"mmseq_comparison_for_ont.done"
]
# MetaT
if 'metaT' in STEPS:
include:
"workflow/steps/metat.smk"
TARGETS += ["metaT_mapping_for_ONT.done"]
# Mapping
if 'mapping' in STEPS:
include:
"workflow/steps/mapping.smk"
TARGETS += ["mapping_for_binning.done"]
# Binning
if 'binning' in STEPS:
include:
"workflow/steps/binning.smk"
TARGETS += ["binning_for_ont.done"]
# Taxonomy
if 'taxonomy' in STEPS:
include:
"workflow/steps/taxonomy.smk"
TARGETS += ["taxonomy_for_ont.done"]
# Prepare input files
include:
"workflow/steps/prepare_input.smk"
# TARGETS.append("status/prepare_input.done")
# Analysis
if 'analysis' in STEPS:
# Basecalling
if "preprocessing" in STEPS:
include:
"workflow/steps/analysis.smk"
# CD-HIT
if "cdhit" in ANALYSIS_STEPS:
TARGETS.append("cdhit_analysis.done")
# Mappability
if "mappability" in ANALYSIS_STEPS:
TARGETS.append("mappability_index.done")
# CRISPR
if "crispr" in ANALYSIS_STEPS:
TARGETS.append("crispr_analysis.done")
# Plasmid prediction
if "plasmids" in ANALYSIS_STEPS:
TARGETS.append("plasmids_analysis.done")
# AMR prediction
if "amr" in ANALYSIS_STEPS:
TARGETS.append("amr_analysis.done")
"workflow/steps/preprocessing.smk"
TARGETS.append("status/preprocessing_lr.done")
# # Assembly annotation
# if 'assembly_annotation' in STEPS:
# include:
# "workflow/steps/assembly_annotation.smk"
# TARGETS += [
# "assemble_and_coverage.done",
# "annotate.done",
# "basecall_merge_qc.done",
# "coverage_of_references.done",
# "prodigal_gene_call.done",
# "diamond_proteins.done"
# ]
# if 'checkpoint_assembly_annotation' in STEPS:
# include:
# "workflow/steps/checkpoint_assembly_annotation.smk"
# TARGETS += [
# "assemble_and_coverage.done",
# "annotate.done",
# "basecall_merge_qc.done",
# "basecall_merge_qc_NO_MOD.done",
# "coverage_of_references.done",
# "prodigal_gene_call.done",
# "diamond_proteins.done",
# "dummy_folders_created.done"
# ]
# # MMSEQ2
# if 'mmseq' in STEPS:
# include:
# "workflow/steps/mmseq.smk"
# TARGETS += [
# "mmseq_comparison_for_ont.done"
# ]
# # MetaT
# if 'metaT' in STEPS:
# include:
# "workflow/steps/metat.smk"
# TARGETS += ["metaT_mapping_for_ONT.done"]
# # Mapping
# if 'mapping' in STEPS:
# include:
# "workflow/steps/mapping.smk"
# TARGETS += ["mapping_for_binning.done"]
# # Binning
# if 'binning' in STEPS:
# include:
# "workflow/steps/binning.smk"
# TARGETS += ["binning_for_ont.done"]
# # Taxonomy
# if 'taxonomy' in STEPS:
# include:
# "workflow/steps/taxonomy.smk"
# TARGETS += ["taxonomy_for_ont.done"]
# # Analysis
# if 'analysis' in STEPS:
# include:
# "workflow/steps/analysis.smk"
# # CD-HIT
# if "cdhit" in ANALYSIS_STEPS:
# TARGETS.append("cdhit_analysis.done")
# # Mappability
# if "mappability" in ANALYSIS_STEPS:
# TARGETS.append("mappability_index.done")
# # CRISPR
# if "crispr" in ANALYSIS_STEPS:
# TARGETS.append("crispr_analysis.done")
# # Plasmid prediction
# if "plasmids" in ANALYSIS_STEPS:
# TARGETS.append("plasmids_analysis.done")
# # AMR prediction
# if "amr" in ANALYSIS_STEPS:
# TARGETS.append("amr_analysis.done")
# No targets
if len(TARGETS) == 0:
......
steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
############################################################
# STEPS
# Pipeline steps
# steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
steps: ["preprocessing"]
# Analysis sub-steps
analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# working directory containing all relevant data,
# i.e. prefix for data, results, DBs etc.
work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
data_dir: "data"
results_dir: "results"
############################################################
# INPUT
# working directory: will contain the results
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir: "/scratch/users/vgalata/ont_pilot"
# paths within the working directory
# directory containing required DBs
db_dir: "dbs"
# results directory
results_dir: "results"
# Data paths: Use absolute paths or paths relative to the working directory !!!
data:
# MetaT data: R1 and R2
metat:
r1: "data/metaT/FastSelectFull1_MT_Rashi_S14_R1_001.fastq.gz"
r2: "data/metaT/FastSelectFull1_MT_Rashi_S14_R2_001.fastq.gz"
# Short reads data: R1 and R2
sr:
r1: "data/raw/short_reads/ONT3_MG_xx_Rashi_S11_R1_001.fastq.gz"
r2: "data/raw/short_reads/ONT3_MG_xx_Rashi_S11_R2_001.fastq.gz"
# ONT data
ont:
# List of directories containing FAST5 files
dirs: ["data/multifast5"]
# List of FAST5 files
files: []
runs:
first: "S1_SizeSelected"
second: "S3_Gtube"
# third: "20181108_0827_test"
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
# Basecalling
guppy:
config:
methylation_aware: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
# cpu:
# path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
# bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
# version: "cpu-3.1.5"
# threads: 28
gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
# threads: 28
threads: 5
# barcoder:
# path: "/home/users/sbusi/apps/ont-guppy/bin"
# bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
# version: "3.4.5+fb1fbfb"
# records_per_fastq: 8000
# threads: 8
# assemblers: ["flye"]
assemblers: ["flye", "megahit", "metaspades", "metaspades_hybrid"]
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye"]
hy: ["metaspadeshybrid"]
p7zip:
bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
......@@ -25,143 +86,98 @@ ont_fast5_api:
bin: "single_to_multi_fast5"
batch: 8000
threads: 8
flowcell: "FLO-MIN106"
kit: "SQK-LSK109"
#barcodes: ["barcode06", "barcode07", "barcode08", "barcode09", "barcode10"]
barcodes: ["no_barcode"]
guppy_cpu:
path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
version: "cpu-3.1.5"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
cpu_threads: 28
guppy_gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
hac_config: "dna_r9.4.1_450bps_hac.cfg"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
cpu_threads: 28
guppy_barcoder:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
version: "3.4.5+fb1fbfb"
records_per_fastq: 8000
threads: 8
nanostats:
short_reads_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/raw/short_reads"
#short_reads_prefix: "/mnt/isilon/projects/lcsb_sequencing/transfer/bioecosystem/Rashi/2019/Apr/fastq"
metaT_prefix: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/data/metaT"
fastp:
min_length: 40
minimap2:
threads: 16
igc:
uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38:
uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov:
bin: "bedtools genomecov"
compute_avg_coverage:
bin: "scripts/coverage.awk"
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
flye:
bin: "flye"
threads: 27
genome_size: "1g"
operams:
bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads: 28
megahit:
threads: 28
nonpareil:
memory: 4096
threads: 14
medaka:
threads: 28
racon:
threads: 28
threads: 28
rebaler:
threads: 28
diamond:
threads: 28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
metaspades:
threads: 28
mmseq2:
threads: 24
# Define sample names
samples: ["ONT3_MG_xx_Rashi_S11"]
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
# Hybrid assembler
hybrid_assembler: "metaspades_hybrid"
# Directory where fastq files are
#data_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Directory to save the output to
#results_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/Binning"
# Number of cpus or threads to use
threads: 28
# Path to the the 140GB Kraken2 database
kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
kraken2:
db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Path to DAS_Tool
# Binning
DAS_Tool:
path: "/home/users/sbusi/apps/DAS_Tool-master"
bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Path to DAS_Tool database
# TODO: mv to DAS_Tool
dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Mapping options
bwa:
threads: 24
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
samtools:
sort:
threads: 4
chunk_size: "4G"
view:
threads: 4
minimap2:
threads: 24
# Path to GTDBTK database
# XXX
GTDBTK:
DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# Rscript path
# TODO: mv to DAS_Tool
Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# XXX
mmseqs:
path: "/home/users/sbusi/apps/mmseqs/bin"
......
__default__:
time: "2-00:00:00"
partition: "batch"
qos: "qos-batch"
nodes: 1
n: 1
# ncpus: 1
job-name: "ONT_pilot.{rule}"
# output: "slurm-%j.%N-%x.out"
# error: "slurm-%j.%N-%x.err"
explicit: ""
# mail-type: "end"
guppy_gpu_basecalling:
time: "01-00:00:00"
partition: "gpu"
qos: "qos-gpu"
nodes: 1
n: 1
# ncpus: 28
explicit: "--gres=gpu:1"
# "run_fastp_on_short_reads":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 3,
# "partition": "batch",
# "qos": "qos-batch",
# "mail-type": "ALL"
# },
# "mmseq2_compare":
# {
# "n": 1,
# "ncpus": 12,
# "time": "00-04:00:00",
# "partition": "bigmem",
# "qos": "qos-bigmem",
# "mail-type": "end"
# },
# "call_genes_w_prodigal":
# {
# "time": "01-12:00:00"
# },
# "compute_avg_genomecov":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 8,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "build_minimap2_ont_index":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 12,
# "partition": "batch"
# },
# "build_minimap2_sr_index":
# {
# "time": "1-00:00:00",
# "n": 1,
# "ncpus": 24,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "build_bwa_index":
# {
# "time": "01-00:00:00",
# "n": 1,
# "ncpus": 24,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "map_long_reads_to_reference_w_minimap2":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 14,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "map_short_reads_to_reference_w_minimap2":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 28,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "map_short_reads_to_reference_w_bwa_mem":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 28,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "run_genomecov":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 8,
# "partition": "batch",
# "qos": "qos-batch"
# },
# "map_short_reads_to_long_reads":
# {
# "time": "05-00:00:00",
# "n": 1,
# "ncpus": 24,
# "partition": "batch",
# "qos": "qos-batch"
# },