Commit 778fc6a9 authored by Valentina Galata
Browse files

config: updated GDB and Zymo

parent f54b9e60
......@@ -3,9 +3,9 @@
# Steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps: ["preprocessing", "assembly", "mapping"]
steps_annotation: ["rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case
steps_analysis: ["quast", "cdhit", "diamond", "mash"]
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis"]
steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
steps_taxonomy: ["kraken2", "kaiju"]
############################################################
......@@ -183,6 +183,11 @@ bbmap:
# key: url of GZ archive
GCF_000001405.38_GRCh38.p12: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
# HMMs
hmm:
threads: 10
kegg: "KO_cdhitGe10000_160314.hmm"
# Assembly quality
# https://github.com/ablab/quast
quast:
......
......@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
SMK_CONFIG="config/GDB/config.yaml"
SMK_SLURM="config/GDB/slurm.yaml"
# slurm cluster call
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
conda activate ${SMK_ENV} && \
snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \
......
......@@ -56,7 +56,7 @@ assembly_hy_metaspades:
partition: "bigmem"
assembly_hy_operams:
time: "00-12:00:00"
time: "00-16:00:00"
partition: "bigmem"
# Assembly polishing
......@@ -95,7 +95,7 @@ annotation_prodigal:
partition: "batch"
annotation_hmm_kegg:
time: "00-6:00:00"
time: "00-8:00:00"
partition: "batch"
annotation_plasflow:
......
......@@ -4,8 +4,8 @@
# Steps to be done
# steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
steps_annotation: ["diamond", "rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case
steps_analysis: ["quast", "cdhit", "mash_dist"]
steps_annotation: ["rgi", "plasflow", "minced", "barrnap", "kegg"] # prodigal is run in any case
steps_analysis: ["quast", "mash", "mashmap", "fastani", "mummer", "cdhit", "diamond"]
steps_taxonomy: ["kraken2", "kaiju"]
############################################################
......@@ -16,8 +16,8 @@ work_dir: "/scratch/users/vgalata/Zymo"
# Paths WITHIN the working directory
# directory containing required DBs (should be writeable)
db_dir: "dbs"
# results directory
db_dir: "/mnt/lscratch/users/vgalata/ONT_pilot_DBs"
# results directory (will be created in work_dir)
results_dir: "results"
# Data paths: Use absolute paths or paths relative to the working directory !!!
......@@ -49,9 +49,7 @@ data:
##############################
# Preprocessing
# TODO: installation ???
# Preprocessing: LR: Basecalling
# XXX
# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy:
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
gpu:
......@@ -66,49 +64,31 @@ guppy:
gpu_device: "cuda:0"
threads: 20
# Preprocessing: SR
# https://github.com/OpenGene/fastp
fastp:
threads: 10
min_length: 40
# FastQ QC
# https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
fastqc:
threads: 10
params: "-q -f fastq"
##############################
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
# Allowed values:
# SR: megahit, metaspades
# Hy: metaspadeshybrid (metaspades w/ LR), operams
# LR: flye, wtdbg2, canu
# HyHy: imp3 (IMP3 assembly using LR and SR from metaT and metaG), assembly is not part of the pipeline
# Polishing w/ SR: suffix "_sr" for hybrid and LR assemblers
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye", "wtdbg2", "flye_sr", "wtdbg2_sr"]
hy: ["metaspadeshybrid", "operams", "metaspadeshybrid_sr", "operams_sr"]
hyhy: []
lr: ["flye", "canu"]
hy: ["metaspadeshybrid", "operamsmegahit", "operamsmetaspades"]
# https://github.com/fenderglass/Flye
flye:
threads: 10
genome_size: "1g"
# https://github.com/ruanjue/wtdbg2
wtdbg2:
threads: 10
bin: "/scratch/users/sbusi/tools/wtdbg2/"
genome_size: "1g"
# https://canu.readthedocs.io/en/latest/
canu:
threads: 24
# mem: "64g"
genome_size: "1g"
# https://github.com/ablab/spades
......@@ -119,14 +99,12 @@ metaspades:
megahit:
threads: 10
# TODO: installation
# https://github.com/CSB5/OPERA-MS
operams:
threads: 10
bin: "/home/users/sbusi/apps/miniconda3/envs/operams/OPERA-MS/OPERA-MS.pl"
##############################
# Long-read assembly polishing
# Assembly polishing
# https://nanoporetech.github.io/medaka/index.html
medaka:
......@@ -140,14 +118,10 @@ racon:
##############################
# Mapping
# Mapper
# http://bio-bwa.sourceforge.net/
bwa:
threads: 10
long_reads_index:
opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# SAM utils
# http://www.htslib.org/doc/samtools.html
samtools:
sort:
......@@ -157,35 +131,28 @@ samtools:
##############################
# Annotation
# Sequence search
# https://github.com/bbuchfink/diamond
diamond:
threads: 20
db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd" # TODO: data download
db: "nr_uniprot_trembl.dmnd" # file name in "dbs" folder
# CRISPR
# https://github.com/dnasko/CASC
casc:
threads: 10
# CRISPR
# https://github.com/ctSkennerton/minced
# minced:
# Plasmid prediction
# https://github.com/smaegol/PlasFlow
plasflow:
threshold: 0.7 # class. prob. threshold
minlen: 1000 # rm contigs with length below this threshold
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# AMR prediction
# https://github.com/arpcard/rgi
rgi:
threads: 5
db_url: "https://card.mcmaster.ca/latest/data"
alignment_tool: "DIAMOND"
# rRNA genes prediction
# https://github.com/tseemann/barrnap
barrnap:
threads: 5
......@@ -202,42 +169,39 @@ cdhit:
# https://github.com/BioInfoTools/BBMap/
bbmap:
threads: 10
# References to be used (w/ md5sums)
rrna_refs: [
# c0cd2aa2e84e3e3977859c34feb63cd5 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5.8s-database-id98.fasta
# 703e4c270ab0a578deb4800c33b36367 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5s-database-id98.fasta
# 8b4e6c6f17f6f35444a60fdc915e052c /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-16s-id95.fasta
# ca4edcdddb98d7868f93e2308e297704 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-23s-id98.fasta
# db6e72022cf650c4b33bd888b92a0391 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-16s-id90.fasta
# f347d2f8f8ffbfa28c785e3a9fe3db79 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-23s-id98.fasta
# 878a413765d09c3ec75409fb1d1573f1 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-18s-id95.fasta
# cbb973e63f52981bd591de0404df5839 /mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-28s-id98.fasta
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5.8s-database-id98.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/rfam-5s-database-id98.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-16s-id95.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-arc-23s-id98.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-16s-id90.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-bac-23s-id98.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-18s-id95.fasta",
"/mnt/irisgpfs/projects/ecosystem_biology/local_tools/IMP3/databases/sortmerna/silva-euk-28s-id98.fasta"
rrna_refs: [ # file names in "dbs" folder
"sortmerna/rfam-5.8s-database-id98.fasta",
"sortmerna/rfam-5s-database-id98.fasta",
"sortmerna/silva-arc-16s-id95.fasta",
"sortmerna/silva-arc-23s-id98.fasta",
"sortmerna/silva-bac-16s-id90.fasta",
"sortmerna/silva-bac-23s-id98.fasta",
"sortmerna/silva-euk-18s-id95.fasta",
"sortmerna/silva-euk-28s-id98.fasta"
]
host_refs: null
# HMMs
hmm:
threads: 10
kegg: "KO_cdhitGe10000_160314.hmm"
# Assembly quality
# https://github.com/ablab/quast
quast:
threads: 10
# Sequence search and clustering
# https://github.com/soedinglab/MMseqs2
# mmseqs2:
# threads: 30
# createdb: "--dbtype 2 --shuffle -v"
# easycluster: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
# easylinclust: "--kmer-per-seq-scale 0.5 --cov-mode 0 -c 0.5 --min-seq-id 0.9"
# path: "/home/users/sbusi/apps/mmseqs/bin"
# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# https://github.com/marbl/mash
mash:
threads: 10
# https://github.com/marbl/MashMap
mashmap:
threads: 10
# https://github.com/ParBLiSS/FastANI
fastani:
threads: 10
##############################
# Taxonomy
......@@ -246,49 +210,19 @@ quast:
# https://github.com/DerrickWood/kraken2
kraken2:
threads: 10
db:
maxikraken: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
class:
sr: "--gzip-compressed --paired"
lr: ""
contigs: ""
db: # dir. name in "dbs" folder
maxikraken: "maxikraken2_1903_140GB"
# http://kaiju.binf.ku.dk/
# http://kaiju.binf.ku.dk/server
# https://github.com/bioinformatics-centre/kaiju
kaiju:
threads: 10
db: # key = basename of *.fmi
kaiju_db_nr_euk: "/mnt/isilon/projects/ecosystem_biology/databases/kaiju/kaiju_db_nr_euk_2020-05-25"
db: # dir. name in "dbs" folder
# key = basename of *.fmi
kaiju_db_nr_euk: "kaiju_db_nr_euk_2020-05-25"
ranks: ["phylum", "class", "order", "family", "genus", "species"]
# # XXX
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
##############################
# MISC
# https://github.com/marbl/mash
mash:
threads: 10
##############################
# Binning
# DAS_Tool:
# path: "/home/users/sbusi/apps/DAS_Tool-master"
# bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
##############################
# ???
# nonpareil:
# memory: 4096
# threads: 14
# rebaler:
# threads: 28
# https://github.com/Ecogenomics/GTDBTk
GTDBTK: # dir. name in "dbs" folder
DATA: "gtdbtk_release89"
......@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
SMK_CONFIG="config/Zymo/config.fast5.yaml"
SMK_SLURM="config/Zymo/slurm.fast5.yaml"
# slurm cluster call
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
conda activate ${SMK_ENV} && \
snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \
......
......@@ -15,7 +15,8 @@ SMK_ENV="/scratch/users/vgalata/miniconda3/ONT_pilot" # CHANGE as needed
SMK_CONFIG="config/Zymo/config.yaml"
SMK_SLURM="config/Zymo/slurm.yaml"
# slurm cluster call
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} --ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"
conda activate ${SMK_ENV} && \
snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment