Commit c0b0ba64 authored by Valentina Galata's avatar Valentina Galata
Browse files

config: major update

parent 23bf58a1
# Settings for the FAST5 conversion / basecalling workflow (Aquifer data set).
# NOTE(review): the key hierarchy below was reconstructed from a listing that
# had lost its indentation - confirm against the workflow's config lookups.
work_dir: "/scratch/users/vgalata/Aquifer"

# input/output
single_fast5_dir: "/scratch/users/sbusi/aquifer/"
multi_fast5_dir: "data_multi_fast5"  # relative to work_dir
basecalling_dir: "basecalling"  # relative to work_dir

# https://github.com/nanoporetech/ont_fast5_api
ont_fast5_api:
  single_to_multi_fast5:
    batch_size: 8000
    threads: 20

# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy:
  config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
  # NOTE(review): all keys from `path` to `threads` are assumed to belong to
  # `gpu` - verify.
  gpu:
    path: "/home/users/sbusi/apps/ont-guppy/bin"
    bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
    version: "3.6.0+98ff765"
    records_per_fastq: 8000
    chunk_size: 1000
    chunks_per_runner: 1000
    num_callers: 4
    runners_per_device: 2
    gpu_device: "cuda:0"
    threads: 20
\ No newline at end of file
############################################################
# STEPS

# Steps to be done; the commented-out line below shows a longer step list.
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps:
  - "assembly"

# Annotation sub-steps (prodigal is run in any case)
steps_annotation:
  - "rgi"
  - "plasflow"
  - "minced"
  - "barrnap"
  - "kegg"

# Analysis sub-steps
steps_analysis:
  - "quast"
  - "mash"
  - "mashmap"
  - "fastani"
  - "mummer"
  - "cdhit"
  - "diamond"
  - "cov"

# Taxonomy sub-steps
steps_taxonomy:
  - "kraken2"
  - "kaiju"
############################################################
# INPUT
# NOTE(review): nesting under `data` was reconstructed from a listing that had
# lost its indentation - confirm against the workflow's config lookups.

# working directory: will contain the results (should be writeable)
work_dir: "/scratch/users/vgalata/Aquifer"

# Paths WITHIN the working directory
# directory containing required DBs (should be writeable)
db_dir: "/mnt/lscratch/users/vgalata/ONT_pilot_DBs"
# results directory (will be created in work_dir)
results_dir: "results"

# Data paths: Use absolute paths or paths relative to the working directory !!!
data:
  # Meta-genomics
  metag:
    sr:
      r1: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_1.fastq.gz"
      r2: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_2.fastq.gz"
    ont:
      fastq: "/scratch/users/vgalata/Aquifer/basecalling/lr.fastq.gz"
  # Meta-transcriptomics
  metat:
    sr:
      r1: ""  # leave empty if no data, i.e. ""
      r2: ""  # leave empty if no data, i.e. ""
  # Meta-proteomics
  # metap:
  #   TODO
############################################################
# TOOLS

##############################
# Preprocessing
# Each tool gets its own mapping so that the repeated `threads` keys do not
# collide at the top level.

# https://github.com/OpenGene/fastp
fastp:
  threads: 10
  min_length: 40

# https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
fastqc:
  threads: 10
##############################
# Assembly
# Each tool gets its own mapping so that the repeated `threads` keys do not
# collide at the top level.

# List of assemblers for different read types: assembler names MUST be UNIQUE
assemblers:
  sr: ["megahit", "metaspades"]
  lr: ["flye", "canu"]
  hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]

# https://github.com/fenderglass/Flye
flye:
  threads: 10

# https://canu.readthedocs.io/en/latest/
canu:
  threads: 24
  genome_size: "1g"

# https://github.com/ablab/spades
metaspades:
  threads: 10

# https://github.com/voutcn/megahit
megahit:
  threads: 10

# https://github.com/CSB5/OPERA-MS
operams:
  threads: 10
##############################
# Assembly polishing
# Each tool gets its own mapping so that the repeated `threads` keys do not
# collide at the top level.

# https://nanoporetech.github.io/medaka/index.html
medaka:
  threads: 10  # NOTE: avoid large values !!! e.g. 30 did not work
  model: r941_min_high  # the MinION model, high accuracy

# https://github.com/isovic/racon
racon:
  threads: 30
##############################
# Mapping

# http://bio-bwa.sourceforge.net/
bwa:
  threads: 10

# http://www.htslib.org/doc/samtools.html
# NOTE(review): both chunk sizes are assumed to sit under `samtools.sort`;
# the nesting was reconstructed from a flattened listing - verify.
samtools:
  sort:
    chunk_size: "4G"
    chunk_size_bigmem: "16G"
##############################
# Annotation
# Each tool gets its own mapping so that the repeated `threads` keys do not
# collide at the top level.

# https://github.com/bbuchfink/diamond
diamond:
  threads: 20
  db: "nr_uniprot_trembl.dmnd"  # file name in "dbs" folder

# https://github.com/dnasko/CASC
casc:
  threads: 10

# https://github.com/ctSkennerton/minced
# minced:

# https://github.com/smaegol/PlasFlow
# plasflow:
#   threshold: 0.7 # class. prob. threshold
#   minlen: 1000 # rm contigs with length below this threshold

# https://github.com/arpcard/rgi
rgi:
  threads: 5
  db_url: "https://card.mcmaster.ca/latest/data"

# https://github.com/tseemann/barrnap
barrnap:
  threads: 5
  kingdom: ["bac", "arc", "euk", "mito"]
##############################
# Analysis
# Each tool gets its own mapping so that the repeated `threads` keys do not
# collide at the top level.

# https://github.com/weizhongli/cdhit --> wiki
cdhit:
  threads: 10

# https://sourceforge.net/projects/bbmap/
# https://github.com/BioInfoTools/BBMap/
# NOTE(review): `rrna_refs` and `host_refs` are assumed to belong to `bbmap`;
# the nesting was reconstructed from a flattened listing - verify.
bbmap:
  threads: 10
  # file names in "dbs" folder
  rrna_refs:
    - "sortmerna/rfam-5.8s-database-id98.fasta"
    - "sortmerna/rfam-5s-database-id98.fasta"
    - "sortmerna/silva-arc-16s-id95.fasta"
    - "sortmerna/silva-arc-23s-id98.fasta"
    - "sortmerna/silva-bac-16s-id90.fasta"
    - "sortmerna/silva-bac-23s-id98.fasta"
    - "sortmerna/silva-euk-18s-id95.fasta"
    - "sortmerna/silva-euk-28s-id98.fasta"
  host_refs: null

# HMMs
hmm:
  threads: 10
  kegg: "KO_cdhitGe10000_160314.hmm"

# Assembly quality
# https://github.com/ablab/quast
quast:
  threads: 10

# https://github.com/marbl/mash
mash:
  threads: 10

# https://github.com/marbl/MashMap
mashmap:
  threads: 10

# https://github.com/ParBLiSS/FastANI
fastani:
  threads: 10
##############################
# Taxonomy
# NOTE(review): nesting was reconstructed from a flattened listing; in
# particular `ranks` is assumed to belong to `kaiju` - verify.

# https://ccb.jhu.edu/software/kraken2/
# https://github.com/DerrickWood/kraken2
kraken2:
  threads: 10
  db:  # dir. name in "dbs" folder
    maxikraken: "maxikraken2_1903_140GB"

# http://kaiju.binf.ku.dk/
# http://kaiju.binf.ku.dk/server
# https://github.com/bioinformatics-centre/kaiju
kaiju:
  threads: 10
  db:  # dir. name in "dbs" folder
    # key = basename of *.fmi
    kaiju_db_nr_euk: "kaiju_db_nr_euk_2020-05-25"
  ranks: ["phylum", "class", "order", "family", "genus", "species"]

# https://github.com/Ecogenomics/GTDBTk
GTDBTK:  # dir. name in "dbs" folder
  DATA: "gtdbtk_release89"
#!/bin/bash -l
# Launcher: activates the conda environment and runs the
# workflow_fast5/Snakefile workflow, submitting jobs through sbatch.

# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH -p batch

# conda env name or path
SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed

# config files
SMK_CONFIG="config/Aquifer/config.fast5.yaml"
SMK_SLURM="config/Aquifer/slurm.fast5.yaml"

# slurm cluster call
# The {cluster.*} and {threads} placeholders contain no "$", so bash leaves
# them untouched; the string is handed over verbatim via --cluster.
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# All expansions are quoted so paths containing spaces or glob characters
# are passed as single arguments.
conda activate "${SMK_ENV}" && \
snakemake -s workflow_fast5/Snakefile -rp --jobs 5 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${CONDA_PREFIX}/pipeline" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
\ No newline at end of file
#!/bin/bash -l
# Launcher: activates the conda environment and runs the workflow/Snakefile
# workflow, submitting jobs through sbatch.

# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=5-00:00:00
#SBATCH -p batch
#SBATCH -q long

# conda env name or path
SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed

# config files
SMK_CONFIG="config/Aquifer/config.yaml"
SMK_SLURM="config/Aquifer/slurm.yaml"

# slurm cluster call
# The {cluster.*} and {threads} placeholders contain no "$", so bash leaves
# them untouched; the string is handed over verbatim via --cluster.
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# All expansions are quoted so paths containing spaces or glob characters
# are passed as single arguments.
conda activate "${SMK_ENV}" && \
snakemake -s workflow/Snakefile -rp --jobs 5 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${CONDA_PREFIX}/pipeline" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
# Per-rule Slurm settings for the FAST5 workflow.
# NOTE(review): nesting was reconstructed from a flattened listing; the keys
# presumably feed the {cluster.*} placeholders of the launcher's sbatch call,
# with `__default__` as the fallback entry - confirm.
__default__:
  time: "0-02:00:00"
  partition: "batch"
  qos: ""
  nodes: 1
  ntasks: 1
  explicit: ""
  job-name: "ONT_Aquifer_FAST5.{rule}"

create_multifast5s:
  time: "00-04:00:00"
  partition: "batch"

guppy_gpu_basecalling:
  time: "00-08:00:00"
  partition: "gpu"
  explicit: "--gres=gpu:1"
\ No newline at end of file
# Per-rule Slurm settings.
# NOTE(review): nesting was reconstructed from a flattened listing; the keys
# presumably feed the {cluster.*} placeholders of the launcher's sbatch call,
# with `__default__` as the fallback entry - confirm.
__default__:
  time: "0-02:00:00"
  partition: "batch"
  qos: ""
  nodes: 1
  ntasks: 1
  explicit: ""
  job-name: "ONT_Aquifer.{rule}"

# Preprocessing
fastp_sr:
  time: "00-01:00:00"
  partition: "batch"

rm_rrna_bbmap:
  time: "00-4:00:00"
  partition: "batch"

rm_host_bbmap_sr_metat:
  time: "00-4:00:00"
  partition: "bigmem"

rm_host_bbmap_sr_metag:
  time: "00-4:00:00"
  partition: "bigmem"

rm_host_bbmap_lr_metag:
  time: "00-4:00:00"
  partition: "bigmem"
# Assembly
# One mapping per rule, so the repeated `time`/`partition` keys do not
# collide at the top level.
assembly_lr_flye:
  time: "00-8:00:00"
  partition: "bigmem"

assembly_lr_canu:
  time: "05-00:00:00"
  partition: "bigmem"
  qos: "--qos long"

assembly_sr_megahit:
  time: "01-4:00:00"
  partition: "bigmem"

assembly_sr_metaspades:
  time: "01-8:00:00"
  partition: "bigmem"

assembly_hy_metaspades:
  time: "01-12:00:00"
  partition: "bigmem"

assembly_hy_operams:
  time: "00-16:00:00"
  partition: "bigmem"

# Assembly polishing
polishing_racon_lr:
  time: "00-6:00:00"
  partition: "bigmem"

polishing_racon_sr:
  time: "00-6:00:00"
  partition: "bigmem"

polishing_medaka_lr:
  time: "01-00:00:00"
  partition: "bigmem"
# Mapping
# One mapping per rule, so the repeated `time`/`partition` keys do not
# collide at the top level.
mapping_bwa_idx_asm:
  time: "00-02:00:00"
  partition: "batch"

mapping_bwa_mem_asm_sr:
  time: "00-02:00:00"
  partition: "batch"

mapping_bwa_mem_asm_lr:
  time: "00-12:00:00"
  partition: "batch"

mapping_bwa_mem_asm_hy:
  time: "00-02:00:00"
  partition: "batch"

# Annotation
annotation_prodigal:
  time: "00-2:00:00"
  partition: "batch"

annotation_hmm_kegg:
  time: "00-8:00:00"
  partition: "batch"

annotation_plasflow:
  time: "00-01:00:00"
  partition: "bigmem"

# Analysis
analysis_mashmap_one2one:
  time: "00-00:30:00"
  partition: "batch"

analysis_diamond_db:
  time: "00-6:00:00"
  partition: "batch"
# Taxonomy
# One mapping per rule, so the repeated `time`/`partition` keys do not
# collide at the top level.
tax_kraken2_contigs:
  time: "00-00:30:00"
  partition: "bigmem"

tax_kraken2_sr:
  time: "00-00:30:00"
  partition: "bigmem"

tax_kraken2_lr:
  time: "00-00:30:00"
  partition: "bigmem"

tax_kaiju:
  time: "00-00:30:00"
  partition: "bigmem"

tax_kaiju_summary:
  time: "00-00:10:00"
  partition: "batch"
# Settings for the FAST5 conversion / basecalling workflow (GDB data set).
# NOTE(review): the key hierarchy below was reconstructed from a listing that
# had lost its indentation - confirm against the workflow's config lookups.
work_dir: "/scratch/users/vgalata/GDB"

# input/output
single_fast5_dir: ""  # NOTE: started w/ multi-FAST5 files
multi_fast5_dir: "data_multi_fast5"  # relative to work_dir
basecalling_dir: "basecalling"  # relative to work_dir

# https://github.com/nanoporetech/ont_fast5_api
ont_fast5_api:
  single_to_multi_fast5:
    batch_size: 8000
    threads: 20

# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy:
  config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
  # NOTE(review): all keys from `path` to `threads` are assumed to belong to
  # `gpu` - verify.
  gpu:
    path: "/home/users/sbusi/apps/ont-guppy/bin"
    bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
    version: "3.6.0+98ff765"
    records_per_fastq: 8000
    chunk_size: 1000
    chunks_per_runner: 1000
    num_callers: 4
    runners_per_device: 2
    gpu_device: "cuda:0"
    threads: 20
\ No newline at end of file
......@@ -2,17 +2,16 @@
# STEPS
# Steps to be done
# steps: ["preprocessing"] # execute first so raw input can be archived/moved etc.
steps: ["assembly", "mapping", "annotation", "analysis", "taxonomy"] # w/o "preprocessing"
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond", "cov"]
steps_taxonomy: ["kraken2", "kaiju"]
############################################################
# INPUT
# working directory: will contain the results (should be writeable)
work_dir: "/scratch/users/vgalata/ont_pilot"
work_dir: "/scratch/users/vgalata/GDB"
# Paths WITHIN the working directory
# directory containing required DBs (should be writeable)
......@@ -25,20 +24,15 @@ data:
# Meta-genomics
metag:
sr:
r1: "data/raw/short_reads/ONT3_MG_xx_Rashi_S11_R1_001.fastq.gz"
r2: "data/raw/short_reads/ONT3_MG_xx_Rashi_S11_R2_001.fastq.gz"
r1: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/GDB_2019/metag/sr/ONT3_MG_xx_Rashi_S11_R1_001.fastq.gz"
r2: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/GDB_2019/metag/sr/ONT3_MG_xx_Rashi_S11_R2_001.fastq.gz"
ont:
# List of directories containing FAST5 files
dirs: ["data/multifast5"] # leave empty if no data, i.e. []
# List of FAST5 files
files: [] # leave empty if no data, i.e. []
# FastQ: if given NO basecalling will be done !!!
fastq: "" # leave empty if no data, i.e. ""
fastq: "/scratch/users/vgalata/GDB/basecalling/lr.fastq.gz"
# Meta-transcriptomics
metat:
sr:
r1: "data/metaT/FastSelectFull1_MT_Rashi_S14_R1_001.fastq.gz" # leave empty if no data, i.e. ""
r2: "data/metaT/FastSelectFull1_MT_Rashi_S14_R2_001.fastq.gz" # leave empty if no data, i.e. ""
r1: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/GDB_2019/metat/sr/FastSelectFull1_MT_Rashi_S14_R1_001.fastq.gz" # leave empty if no data, i.e. ""
r2: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/GDB_2019/metat/sr/FastSelectFull1_MT_Rashi_S14_R2_001.fastq.gz" # leave empty if no data, i.e. ""
# Meta-proteomics
# metap:
# TODO
......@@ -49,21 +43,6 @@ data:
##############################
# Preprocessing
# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy:
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
gpu:
path: "/home/users/sbusi/apps/ont-guppy/bin"
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
threads: 20
# https://github.com/OpenGene/fastp
fastp:
threads: 10
......
#!/bin/bash -l
# Launcher: activates the conda environment and runs the
# workflow_fast5/Snakefile workflow, submitting jobs through sbatch.

# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH -p batch

# conda env name or path
SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed

# config files
SMK_CONFIG="config/GDB/config.fast5.yaml"
SMK_SLURM="config/GDB/slurm.fast5.yaml"

# slurm cluster call
# The {cluster.*} and {threads} placeholders contain no "$", so bash leaves
# them untouched; the string is handed over verbatim via --cluster.
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# All expansions are quoted so paths containing spaces or glob characters
# are passed as single arguments.
conda activate "${SMK_ENV}" && \
snakemake -s workflow_fast5/Snakefile -rp --jobs 5 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${CONDA_PREFIX}/pipeline" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
\ No newline at end of file
# Per-rule Slurm settings for the FAST5 workflow.
# NOTE(review): nesting was reconstructed from a flattened listing; the keys
# presumably feed the {cluster.*} placeholders of the launcher's sbatch call,
# with `__default__` as the fallback entry - confirm.
__default__:
  time: "0-02:00:00"
  partition: "batch"
  qos: ""
  nodes: 1
  ntasks: 1
  explicit: ""
  job-name: "ONT_GDB_FAST5.{rule}"

create_multifast5s:
  time: "00-04:00:00"
  partition: "batch"

guppy_gpu_basecalling:
  time: "00-08:00:00"
  partition: "gpu"
  explicit: "--gres=gpu:1"
\ No newline at end of file
......@@ -8,11 +8,6 @@ __default__:
job-name: "ONT_GDB.{rule}"
# Preprocessing
guppy_gpu_basecalling:
time: "01-00:00:00"
partition: "gpu"
explicit: "--gres=gpu:1"
fastp_sr:
time: "00-01:00:00"
partition: "batch"
......
......@@ -2,10 +2,9 @@
# STEPS
# Steps to be done
# steps: ["preprocessing"] # execute first so raw input can be archived/moved etc.
steps: ["assembly", "mapping", "annotation", "analysis"] # w/o "preprocessing"
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond", "cov"]
steps_taxonomy: ["kraken2", "kaiju"]
############################################################
......
......@@ -8,11 +8,6 @@ __default__:
job-name: "ONT_NWC.{rule}"
# Preprocessing
guppy_gpu_basecalling:
time: "01-00:00:00"
partition: "gpu"
explicit: "--gres=gpu:1"
fastp_sr:
time: "00-01:00:00"
partition: "batch"
......
############################################################
# STEPS
# Steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]