Commit e47143ef authored by Valentina Galata's avatar Valentina Galata
Browse files

rm not relevant folders and files (issue #127)

parent 90a677ba
# Settings for converting single-FAST5 files to multi-FAST5 and for Guppy basecalling.
# NOTE(review): indentation is not visible in this view (every key is at column 0);
# the grouping described in the comments below is presumed from key order - confirm
# against the original file.
# working directory; the relative paths below are resolved against it
work_dir: "/scratch/users/vgalata/aquifer"
# input/output (can be relative to work_dir)
# NOTE(review): this absolute path is in a different user's scratch space than work_dir - confirm it is intended
single_fast5_dir: "/scratch/users/sbusi/aquifer/" # leave empty, i.e. "", if input are multi-FAST5 files
multi_fast5_dir: "data_multi_fast5" # if used as input it should not contain any sub-folders
basecalling_dir: "basecalling"
# https://github.com/nanoporetech/ont_fast5_api
ont_fast5_api:
# presumably batch_size and threads belong to single_to_multi_fast5 - TODO confirm nesting
single_to_multi_fast5:
batch_size: 8000
threads: 20
# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy:
# Guppy configuration file name
config: "dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# presumably all keys from here to the end of this file belong to the gpu entry - TODO confirm nesting
gpu:
# directory of the Guppy binaries; the same directory is hard-coded again in "bin" below
path: "/home/users/sbusi/apps/ont-guppy/bin"
# command prefix: sources ~/.bashrc with "nounset" temporarily disabled, loads the
# compiler/LLVM and system/CUDA modules, then calls guppy_basecaller
bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version: "3.6.0+98ff765"
records_per_fastq: 8000
chunk_size: 1000
chunks_per_runner: 1000
num_callers: 4
runners_per_device: 2
gpu_device: "cuda:0"
threads: 20
\ No newline at end of file
############################################################
# STEPS
# Steps to be done
# main workflow steps
steps: ["preprocessing", "assembly", "mapping", "annotation", "analysis", "taxonomy"]
# tools selected per step (presumably only used when the matching step is listed above - TODO confirm)
steps_annotation: ["rgi", "plasflow", "minced", "barrnap"] # prodigal is run in any case, no kegg
steps_analysis: ["quast", "mash", "mashmap", "fastani", "cdhit", "diamond"] # no mummer, cov
steps_taxonomy: ["kraken2", "kaiju"]
############################################################
# INPUT
# NOTE(review): indentation is not visible in this view; presumably metag/metat are
# children of "data", sr/ont are children of those, and r1/r2/fastq are leaves -
# confirm against the original file.
# working directory: will contain the results (should be writeable)
work_dir: "/scratch/users/sbusi/aquifer"
# Paths WITHIN the working directory
# directory containing required DBs (should be writeable)
# NOTE(review): this value is an absolute path, not a path within work_dir as the heading above says
db_dir: "/mnt/lscratch/users/vgalata/ONT_pilot_DBs"
# results directory (will be created in work_dir)
results_dir: "results"
# Data paths: Use absolute paths or paths relative to the working directory !!!
data:
# Meta-genomics
metag:
# short reads: paired-end FASTQ files (r1/r2)
sr:
r1: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_1.fastq.gz"
r2: "/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_2.fastq.gz"
# ONT reads: single FASTQ file
# NOTE(review): this path is in a different user's scratch space than work_dir - confirm it is intended
ont:
fastq: "/scratch/users/vgalata/aquifer/basecalling/lr.fastq.gz"
# Meta-transcriptomics
metat:
sr:
r1: "" # leave empty if no data, i.e. ""
r2: "" # leave empty if no data, i.e. ""
# Meta-proteomics
# metap:
# TODO
############################################################
# TOOLS
# Per-tool settings; presumably each "threads"/option key is nested under the
# tool name that precedes it (indentation is not visible here) - TODO confirm.
##############################
# Preprocessing
# https://github.com/OpenGene/fastp
fastp:
threads: 10
min_length: 40
# https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
fastqc:
threads: 10
##############################
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
# (sr/lr/hy presumably stand for short-read, long-read and hybrid - TODO confirm)
assemblers:
sr: ["megahit", "metaspades"]
lr: ["flye", "raven"]
hy: ["metaspadeshybrid"] # excluded: "operamsmegahit", "operamsmetaspades"
# Per-assembler settings. canu and operams are configured below although they do
# not appear in the "assemblers" lists above.
# https://github.com/fenderglass/Flye
flye:
threads: 10
# https://canu.readthedocs.io/en/latest/
canu:
threads: 24
genome_size: "1g"
# https://github.com/lbcb-sci/raven
raven:
threads: 10
# https://github.com/ablab/spades
metaspades:
threads: 10
# https://github.com/voutcn/megahit
megahit:
threads: 10
# https://github.com/CSB5/OPERA-MS
operams:
threads: 12
##############################
# Assembly polishing
# https://nanoporetech.github.io/medaka/index.html
medaka:
threads: 10 # NOTE: avoid large values !!! e.g. 30 did not work
model: r941_min_high # the MinION model, high accuracy
# https://github.com/isovic/racon
racon:
threads: 30
##############################
# Mapping
# http://bio-bwa.sourceforge.net/
bwa:
threads: 10
# http://www.htslib.org/doc/samtools.html
samtools:
# presumably chunk_size and chunk_size_bigmem are children of "sort" - TODO confirm nesting;
# chunk_size_bigmem is presumably the value used for jobs on the bigmem partition - verify in the workflow
sort:
chunk_size: "4G"
chunk_size_bigmem: "16G"
##############################
# Annotation
# https://github.com/bbuchfink/diamond
diamond:
threads: 20
db: "nr_uniprot_trembl.dmnd" # file name in "dbs" folder
# https://github.com/dnasko/CASC
casc:
threads: 10
# minced has no settings (entry kept commented out)
# https://github.com/ctSkennerton/minced
# minced:
# NOTE(review): the plasflow settings are commented out although "plasflow" is listed
# in steps_annotation - confirm the workflow does not read these keys
# https://github.com/smaegol/PlasFlow
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# https://github.com/arpcard/rgi
rgi:
threads: 5
db_url: "https://card.mcmaster.ca/latest/data"
# https://github.com/tseemann/barrnap
barrnap:
threads: 5
# kingdoms to run barrnap for (presumably one run per entry - TODO confirm)
kingdom: ["bac", "arc", "euk", "mito"]
##############################
# Analysis
# https://github.com/weizhongli/cdhit --> wiki
cdhit:
threads: 10
# https://sourceforge.net/projects/bbmap/
# https://github.com/BioInfoTools/BBMap/
bbmap:
threads: 10
# rRNA reference FASTA files (the Slurm rule names suggest they are used for rRNA removal - TODO confirm)
rrna_refs: [ # file names in "dbs" folder
"sortmerna/rfam-5.8s-database-id98.fasta",
"sortmerna/rfam-5s-database-id98.fasta",
"sortmerna/silva-arc-16s-id95.fasta",
"sortmerna/silva-arc-23s-id98.fasta",
"sortmerna/silva-bac-16s-id90.fasta",
"sortmerna/silva-bac-23s-id98.fasta",
"sortmerna/silva-euk-18s-id95.fasta",
"sortmerna/silva-euk-28s-id98.fasta"
]
# no host references configured (presumably disables host read removal - TODO confirm)
host_refs: null
# HMMs
hmm:
threads: 10
# NOTE(review): the comment on steps_annotation says "no kegg"; this entry is presumably unused in this run
kegg: "KO_cdhitGe10000_160314.hmm"
# Assembly quality
# https://github.com/ablab/quast
quast:
threads: 10
# https://github.com/marbl/mash
mash:
threads: 10
# https://github.com/marbl/MashMap
mashmap:
threads: 10
# https://github.com/ParBLiSS/FastANI
fastani:
threads: 10
##############################
# Taxonomy
# NOTE(review): indentation is not visible in this view; presumably each "db" entry
# is a mapping of DB label -> directory name, and "ranks" is a sibling of "db"
# under kaiju - confirm against the original file.
# https://ccb.jhu.edu/software/kraken2/
# https://github.com/DerrickWood/kraken2
kraken2:
threads: 10
db: # dir. name in "dbs" folder
maxikraken: "maxikraken2_1903_140GB"
# http://kaiju.binf.ku.dk/
# http://kaiju.binf.ku.dk/server
# https://github.com/bioinformatics-centre/kaiju
kaiju:
threads: 10
db: # dir. name in "dbs" folder
# key = basename of *.fmi
kaiju_db_nr_euk: "kaiju_db_nr_euk_2020-05-25"
# taxonomic ranks (presumably one summary per rank - TODO confirm)
ranks: ["phylum", "class", "order", "family", "genus", "species"]
# https://github.com/Ecogenomics/GTDBTk
GTDBTK: # dir. name in "dbs" folder
DATA: "gtdbtk_release89"
#!/bin/bash -l
# Launcher for the FAST5 workflow (workflow_fast5/Snakefile): activates the given
# conda environment and runs snakemake, which submits the rule jobs via sbatch.
#
# Usage: <this script> <conda env name or path> <path for snakemake's conda envs>
#
# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH -p batch

# Both arguments are required: with an empty second argument "--conda-prefix"
# would consume the following option ("--cluster-config") as its value.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
    echo "Usage: $0 <conda env name or path> <path for snakemake's conda envs>" >&2
    exit 1
fi

# conda env name or path
SMK_ENV="$1"
# where to create snakemake's conda env.s (path)
SMK_CONDA="$2"
# config files
SMK_CONFIG="config/aquifer/config.fast5.yaml"
SMK_SLURM="config/aquifer/slurm.fast5.yaml"
# slurm cluster call; the {cluster.*} and {threads} placeholders are left as-is
# for snakemake to fill in per job
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# variables are quoted so that paths containing spaces are passed as single arguments
conda activate "${SMK_ENV}" && \
snakemake -s workflow_fast5/Snakefile -rp --jobs 5 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${SMK_CONDA}" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
#!/bin/bash -l
# Launcher for the main workflow (workflow/Snakefile): activates the given conda
# environment and runs snakemake, which submits the rule jobs via sbatch.
#
# Usage: <this script> <conda env name or path> <path for snakemake's conda envs>
#
# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=5-00:00:00
#SBATCH -p batch
#SBATCH -q long

# Both arguments are required: with an empty second argument "--conda-prefix"
# would consume the following option ("--cluster-config") as its value.
if [[ $# -lt 2 || -z "$1" || -z "$2" ]]; then
    echo "Usage: $0 <conda env name or path> <path for snakemake's conda envs>" >&2
    exit 1
fi

# conda env name or path
SMK_ENV="$1"
# where to create snakemake's conda env.s (path)
SMK_CONDA="$2"
# config files
SMK_CONFIG="config/aquifer/config.yaml"
SMK_SLURM="config/aquifer/slurm.yaml"
# slurm cluster call; the {cluster.*} and {threads} placeholders are left as-is
# for snakemake to fill in per job
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# variables are quoted so that paths containing spaces are passed as single arguments
conda activate "${SMK_ENV}" && \
snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${SMK_CONDA}" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
# Slurm settings per rule, in Snakemake cluster-config form: "__default__" holds the
# fallback values and the rule-named entries override individual keys.
# NOTE(review): indentation is not visible in this view; presumably the keys following
# each entry name are its children - confirm against the original file.
__default__:
# time limit, written as days-hours:minutes:seconds
time: "0-02:00:00"
partition: "batch"
# optional extra sbatch arguments; empty by default
qos: ""
nodes: 1
ntasks: 1
explicit: ""
# "{rule}" is presumably replaced with the rule name by Snakemake - TODO confirm
job-name: "ONT_aquifer_FAST5.{rule}"
create_multifast5s:
time: "00-04:00:00"
partition: "batch"
guppy_gpu_basecalling:
time: "00-08:00:00"
partition: "gpu"
# requests one GPU for the job
explicit: "--gres=gpu:1"
\ No newline at end of file
# Fallback Slurm settings (Snakemake cluster-config form); rule-named entries
# elsewhere in this file override individual keys.
# NOTE(review): indentation is not visible in this view; presumably the keys below
# are children of "__default__" - confirm against the original file.
__default__:
# time limit, written as days-hours:minutes:seconds
time: "0-02:00:00"
partition: "batch"
# optional extra sbatch arguments; empty by default
qos: ""
nodes: 1
ntasks: 1
explicit: ""
# "{rule}" is presumably replaced with the rule name by Snakemake - TODO confirm
job-name: "ONT_aquifer.{rule}"
# Preprocessing
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
fastp_sr:
time: "00-00:30:00"
partition: "batch"
rm_rrna_bbmap:
time: "00-01:30:00"
partition: "batch"
rm_host_bbmap_sr_metat:
time: "00-02:00:00"
partition: "bigmem"
rm_host_bbmap_sr_metag:
time: "00-02:00:00"
partition: "bigmem"
rm_host_bbmap_lr_metag:
time: "00-03:00:00"
partition: "bigmem"
# Assembly
# Per-rule time limit and partition; presumably each group of keys belongs to the
# rule name above it (indentation is not visible here).
assembly_lr_flye:
time: "00-14:00:00"
partition: "bigmem"
assembly_lr_canu:
time: "05-00:00:00"
partition: "bigmem"
# multi-day job: sets the "long" QOS as an extra sbatch argument
qos: "--qos long"
assembly_lr_raven:
time: "00-08:00:00"
partition: "bigmem"
assembly_sr_megahit:
time: "00-08:00:00"
partition: "bigmem"
assembly_sr_metaspades:
time: "01-00:00:00"
partition: "bigmem"
assembly_hy_metaspades:
time: "01-00:00:00"
partition: "bigmem"
assembly_hy_operams:
# NOTE(review): 8 days is the longest limit in this file - confirm it fits within
# the time limit of the job that runs snakemake itself
time: "08-00:00:00"
partition: "bigmem"
qos: "--qos long"
# Assembly polishing
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
polishing_racon_lr:
time: "00-06:00:00"
partition: "bigmem"
polishing_racon_sr:
time: "00-06:00:00"
partition: "bigmem"
polishing_medaka_lr:
time: "01-12:00:00"
partition: "bigmem"
# Mapping
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
mapping_bwa_idx_asm:
time: "00-01:00:00"
partition: "batch"
mapping_bwa_mem_asm_sr:
time: "00-06:00:00"
partition: "bigmem"
mapping_bwa_mem_asm_lr:
time: "00-12:00:00"
partition: "bigmem"
mapping_bwa_mem_asm_hy:
time: "00-04:00:00"
partition: "batch"
# Annotation
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
annotation_prodigal:
time: "00-03:00:00"
partition: "batch"
annotation_rgi:
time: "00-02:00:00"
partition: "batch"
annotation_minced:
time: "00-00:40:00"
partition: "batch"
annotation_plasflow:
time: "00-01:00:00"
partition: "bigmem"
annotation_barrnap:
time: "00-00:40:00"
partition: "batch"
annotation_hmm_kegg:
time: "00-12:00:00"
partition: "bigmem"
# Analysis
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
analysis_quast:
time: "00-00:20:00"
partition: "batch"
analysis_mash_sketch_sr:
time: "00-01:00:00"
partition: "batch"
analysis_mash_sketch_lr:
time: "00-01:00:00"
partition: "batch"
analysis_mash_reads:
time: "00-00:30:00"
partition: "batch"
analysis_mash_sketch_asm:
time: "00-00:20:00"
partition: "batch"
analysis_mash_asm:
time: "00-00:20:00"
partition: "batch"
analysis_mashmap_one2one:
time: "00-02:00:00"
partition: "batch"
analysis_fastani_one2one:
time: "00-02:00:00"
partition: "batch"
analysis_mummer_dnadiff:
time: "00-04:00:00"
partition: "bigmem"
analysis_cdhit:
time: "00-02:00:00"
partition: "batch"
analysis_diamond:
time: "00-02:00:00"
partition: "batch"
analysis_diamond_db:
time: "00-12:00:00"
partition: "batch"
analysis_genomecov_pergene:
time: "01-00:00:00"
partition: "batch"
analysis_genomecov_segmentation:
time: "00-12:00:00"
partition: "bigmem"
# Taxonomy
# Per-rule time limit and partition; presumably each time/partition pair belongs
# to the rule name above it (indentation is not visible here).
tax_kraken2_contigs:
time: "00-00:30:00"
partition: "bigmem"
tax_kraken2_sr:
time: "00-01:00:00"
partition: "bigmem"
tax_kraken2_lr:
time: "00-01:00:00"
partition: "bigmem"
tax_kaiju:
time: "00-00:30:00"
partition: "bigmem"
tax_kaiju_summary:
time: "00-00:30:00"
partition: "batch"
# Settings for the gene calling benchmark on reference genomes.
# NOTE(review): indentation is not visible in this view; presumably the accession
# entries are children of "refs" and each "threads" belongs to the tool name above
# it - confirm against the original file.
# working directory
work_dir: "/mnt/lscratch/users/vgalata/gene_calling_benchmark"
# reference genomes
# key = assembly accession and name, value = URL of the NCBI FTP directory for that assembly
refs:
GCF_011456075.1_ASM1145607v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/456/075/GCF_011456075.1_ASM1145607v1"
GCA_000006765.1_ASM676v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/006/765/GCA_000006765.1_ASM676v1"
GCA_000005845.2_ASM584v2: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2"
GCA_014334155.1_ASM1433415v1: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/334/155/GCA_014334155.1_ASM1433415v1"
# # file extensions for reference genomes
# refs_ext:
# fna: "_genomic.fna.gz"
# cds: "_cds_from_genomic.fna.gz"
# faa: "_translated_cds.faa.gz"
diamond:
threads: 5
blast:
threads: 5
\ No newline at end of file
# Pipeline for gene calling performance analysis using ref. data (indep. of other workflows)
#
# Example call:
# snakemake -s workflow_gcall/Snakefile --configfile config/gcall/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
##############################
# MODULES
import os
import re
import pandas
##############################
# CONFIG
# Paths
# "scripts" and "envs" are resolved with srcdir()
ENV_DIR = srcdir("envs")
SRC_DIR = srcdir("scripts")

# working directory: taken from the "work_dir" entry of the config
workdir: config["work_dir"]
##################################################
# RULES
##################################################
# Target rule: has no outputs or commands, it only requests the BLASTn result table
rule all:
    input:
        # blastn
        blastn="blastn/ref_vs_prodigal.tsv",
# Download one gzipped reference file ("<acc>_<ext>.gz") from the URL configured
# for the accession and decompress it to "refs/<acc>_<ext>".
rule download_ref:
    output:
        "refs/{acc}_{ext}"
    log:
        "refs/{acc}_{ext}.log"
    wildcard_constraints:
        # Restrict "acc" to the configured accessions. The keys contain "." which
        # is a regex metacharacter, so they are escaped to be matched literally.
        acc="|".join(re.escape(acc) for acc in config["refs"])
    params:
        # URL = <configured base URL>/<acc>_<ext>.gz; built with plain string
        # formatting because os.path.join is meant for file system paths, not URLs
        url=lambda wildcards: "%s/%s_%s.gz" % (
            config["refs"][wildcards.acc].rstrip("/"),
            wildcards.acc,
            wildcards.ext,
        )
    message:
        "Gene calling: download reference {wildcards.acc} ({wildcards.ext})"
    shell:
        # the archive is saved next to the output and replaced by gunzip with the
        # decompressed file; start/end dates and tool messages go to the log
        "(date && wget {params.url} -O {output}.gz && gunzip {output}.gz && date) &> {log}"
# Concatenate the per-accession reference files sharing the same extension into
# one file "refs/ref_<ext>".
rule concat_ref:
    input:
        # one file per configured accession; "ext" stays a wildcard
        expand(
            "refs/{acc}_{{ext}}",
            acc=list(config["refs"])
        )
    output:
        "refs/ref_{ext}"
    message:
        "Gene calling: concat references ({wildcards.ext})"
    shell:
        "cat {input} > {output}"
# Run Prodigal in "meta" mode on the concatenated reference genomes and write the
# predicted protein sequences; start/end dates and tool messages go to the log.
rule prodigal_ref:
    input:
        "refs/ref_genomic.fna"
    output:
        "refs/ref_genomic.prodigal.faa"
    log:
        "refs/ref_genomic.prodigal.log"
    conda:
        os.path.join(ENV_DIR, "prodigal.yaml")
    message:
        "Gene calling: run Prodigal on references"
    shell:
        "(date && "
        "prodigal -a {output} -p meta -i {input} && "
        "date) &> {log}"
# Run the helper script genes_from_prodigal_prots.py with the reference genomes
# and the Prodigal proteins as inputs.
# NOTE(review): the script is not visible here; per the rule message it extracts
# the gene sequences belonging to the Prodigal proteins - confirm.
rule prodigal_genes_ref:
    input:
        fna="refs/ref_genomic.fna",
        faa="refs/ref_genomic.prodigal.faa",
    output:
        "refs/ref_genomic.prodigal.fa"
    params:
        # empty prefix; how the script uses it is not visible here
        prefix="",
    conda:
        os.path.join(ENV_DIR, "biopython.yaml")
    message:
        "Gene calling: extract reference genes from Prodigal proteins"
    script:
        os.path.join(SRC_DIR, "genes_from_prodigal_prots.py")
# Build a nucleotide BLAST database from the concatenated reference CDS file.
rule blastn_db_ref:
    input:
        "refs/ref_cds_from_genomic.fna"
    output:
        expand(
            "refs/ref_cds_from_genomic.blast.{ext}",
            ext=["ndb","nhr","nin","not","nsq","ntf","nto"]
        )
    log:
        "refs/ref_cds_from_genomic.blast.log"
    threads:
        config["blast"]["threads"]
    conda:
        os.path.join(ENV_DIR, "blast.yaml")
    message:
        "Gene calling: reference BLAST nucl. DB"
    shell:
        # The DB name passed to -out is the first output file without its extension.
        # makeblastdb is not given "-logfile {log}": the shell redirect already
        # writes to {log}, and two independent writers on the same file overwrite
        # each other's content. The program log is captured by the redirect instead.
        "(date && db={output[0]} && "
        "makeblastdb -dbtype 'nucl' -in {input} -input_type 'fasta' -out ${{db%.*}} && "
        "date) &> {log}"
rule diamond_db_ref: