Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ESB
ONT_pilot_gitlab
Commits
e47143ef
Commit
e47143ef
authored
Apr 13, 2021
by
Valentina Galata
Browse files
rm not relevant folders and files (issue
#127
)
parent
90a677ba
Changes
28
Expand all
Hide whitespace changes
Inline
Side-by-side
config/aquifer/config.fast5.yaml
deleted
100644 → 0
View file @
90a677ba
work_dir
:
"
/scratch/users/vgalata/aquifer"
# input/output (can be relative to work_dir)
single_fast5_dir
:
"
/scratch/users/sbusi/aquifer/"
# leave empty, i.e. "", if input are multi-FAST5 files
multi_fast5_dir
:
"
data_multi_fast5"
# if used as input it should not contain any sub-folders
basecalling_dir
:
"
basecalling"
# https://github.com/nanoporetech/ont_fast5_api
ont_fast5_api
:
single_to_multi_fast5
:
batch_size
:
8000
threads
:
20
# https://community.nanoporetech.com/protocols/Guppy-protocol/v/GPB_2003_v1_revT_14Dec2018
guppy
:
config
:
"
dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
gpu
:
path
:
"
/home/users/sbusi/apps/ont-guppy/bin"
bin
:
"
set
+u;
source
~/.bashrc;
set
-u;
ml
compiler/LLVM
system/CUDA
&&
/home/users/sbusi/apps/ont-guppy/bin/guppy_basecaller"
version
:
"
3.6.0+98ff765"
records_per_fastq
:
8000
chunk_size
:
1000
chunks_per_runner
:
1000
num_callers
:
4
runners_per_device
:
2
gpu_device
:
"
cuda:0"
threads
:
20
\ No newline at end of file
config/aquifer/config.yaml
deleted
100644 → 0
View file @
90a677ba
############################################################
# STEPS
# Steps to be done
steps
:
[
"
preprocessing"
,
"
assembly"
,
"
mapping"
,
"
annotation"
,
"
analysis"
,
"
taxonomy"
]
steps_annotation
:
[
"
rgi"
,
"
plasflow"
,
"
minced"
,
"
barrnap"
]
# prodigal is run in any case, no kegg
steps_analysis
:
[
"
quast"
,
"
mash"
,
"
mashmap"
,
"
fastani"
,
"
cdhit"
,
"
diamond"
]
# no mummer, cov
steps_taxonomy
:
[
"
kraken2"
,
"
kaiju"
]
############################################################
# INPUT
# working directory: will contain the results (should be writeable)
work_dir
:
"
/scratch/users/sbusi/aquifer"
# Paths WITHIN the working directory
# directory containing required DBs (should be writeable)
db_dir
:
"
/mnt/lscratch/users/vgalata/ONT_pilot_DBs"
# results directory (will be created in work_dir)
results_dir
:
"
results"
# Data paths: Use absolute paths or paths relative to the working directory !!!
data
:
# Meta-genomics
metag
:
sr
:
r1
:
"
/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_1.fastq.gz"
r2
:
"
/mnt/isilon/projects/ecosystem_biology/ONT_pilot/external_data/aquifer/sr/ERR3654041/ERR3654041_2.fastq.gz"
ont
:
fastq
:
"
/scratch/users/vgalata/aquifer/basecalling/lr.fastq.gz"
# Meta-transcriptomics
metat
:
sr
:
r1
:
"
"
# leave empty if no data, i.e. ""
r2
:
"
"
# leave empty if no data, i.e. ""
# Meta-proteomics
# metap:
# TODO
############################################################
# TOOLS
##############################
# Preprocessing
# https://github.com/OpenGene/fastp
fastp
:
threads
:
10
min_length
:
40
# https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
fastqc
:
threads
:
10
##############################
# Assembly
# List of assemblers for different read types: assembler names MUST be UNIQUE
assemblers
:
sr
:
[
"
megahit"
,
"
metaspades"
]
lr
:
[
"
flye"
,
"
raven"
]
hy
:
[
"
metaspadeshybrid"
]
# excluded: "operamsmegahit", "operamsmetaspades"
# https://github.com/fenderglass/Flye
flye
:
threads
:
10
# https://canu.readthedocs.io/en/latest/
canu
:
threads
:
24
genome_size
:
"
1g"
# https://github.com/lbcb-sci/raven
raven
:
threads
:
10
# https://github.com/ablab/spades
metaspades
:
threads
:
10
# https://github.com/voutcn/megahit
megahit
:
threads
:
10
# https://github.com/CSB5/OPERA-MS
operams
:
threads
:
12
##############################
# Assembly polishing
# https://nanoporetech.github.io/medaka/index.html
medaka
:
threads
:
10
# NOTE: avoid large values !!! e.g. 30 did not work
model
:
r941_min_high
# the MinION model, high accuracy
# https://github.com/isovic/racon
racon
:
threads
:
30
##############################
# Mapping
# http://bio-bwa.sourceforge.net/
bwa
:
threads
:
10
# http://www.htslib.org/doc/samtools.html
samtools
:
sort
:
chunk_size
:
"
4G"
chunk_size_bigmem
:
"
16G"
##############################
# Annotation
# https://github.com/bbuchfink/diamond
diamond
:
threads
:
20
db
:
"
nr_uniprot_trembl.dmnd"
# file name in "dbs" folder
# https://github.com/dnasko/CASC
casc
:
threads
:
10
# https://github.com/ctSkennerton/minced
# minced:
# https://github.com/smaegol/PlasFlow
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# https://github.com/arpcard/rgi
rgi
:
threads
:
5
db_url
:
"
https://card.mcmaster.ca/latest/data"
# https://github.com/tseemann/barrnap
barrnap
:
threads
:
5
kingdom
:
[
"
bac"
,
"
arc"
,
"
euk"
,
"
mito"
]
##############################
# Analysis
# https://github.com/weizhongli/cdhit --> wiki
cdhit
:
threads
:
10
# https://sourceforge.net/projects/bbmap/
# https://github.com/BioInfoTools/BBMap/
bbmap
:
threads
:
10
rrna_refs
:
[
# file names in "dbs" folder
"
sortmerna/rfam-5.8s-database-id98.fasta"
,
"
sortmerna/rfam-5s-database-id98.fasta"
,
"
sortmerna/silva-arc-16s-id95.fasta"
,
"
sortmerna/silva-arc-23s-id98.fasta"
,
"
sortmerna/silva-bac-16s-id90.fasta"
,
"
sortmerna/silva-bac-23s-id98.fasta"
,
"
sortmerna/silva-euk-18s-id95.fasta"
,
"
sortmerna/silva-euk-28s-id98.fasta"
]
host_refs
:
null
# HMMs
hmm
:
threads
:
10
kegg
:
"
KO_cdhitGe10000_160314.hmm"
# Assembly quality
# https://github.com/ablab/quast
quast
:
threads
:
10
# https://github.com/marbl/mash
mash
:
threads
:
10
# https://github.com/marbl/MashMap
mashmap
:
threads
:
10
# https://github.com/ParBLiSS/FastANI
fastani
:
threads
:
10
##############################
# Taxonomy
# https://ccb.jhu.edu/software/kraken2/
# https://github.com/DerrickWood/kraken2
kraken2
:
threads
:
10
db
:
# dir. name in "dbs" folder
maxikraken
:
"
maxikraken2_1903_140GB"
# http://kaiju.binf.ku.dk/
# http://kaiju.binf.ku.dk/server
# https://github.com/bioinformatics-centre/kaiju
kaiju
:
threads
:
10
db
:
# dir. name in "dbs" folder
# key = basename of *.fmi
kaiju_db_nr_euk
:
"
kaiju_db_nr_euk_2020-05-25"
ranks
:
[
"
phylum"
,
"
class"
,
"
order"
,
"
family"
,
"
genus"
,
"
species"
]
# https://github.com/Ecogenomics/GTDBTk
GTDBTK
:
# dir. name in "dbs" folder
DATA
:
"
gtdbtk_release89"
config/aquifer/sbatch.fast5.sh
deleted
100755 → 0
View file @
90a677ba
#!/bin/bash -l

# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=2-00:00:00
#SBATCH -p batch

# Launcher for workflow_fast5/Snakefile: activates the given conda environment
# and, only if that succeeds, starts snakemake with SLURM job submission.
#
# Positional arguments:
#   $1  conda env name or path
#   $2  where to create snakemake's conda env.s (path)

# conda env name or path
SMK_ENV="$1"
# where to create snakemake's conda env.s (path)
SMK_CONDA="$2"

# config files
SMK_CONFIG="config/aquifer/config.fast5.yaml"
SMK_SLURM="config/aquifer/slurm.fast5.yaml"

# slurm cluster call; the {cluster.*} and {threads} placeholders are left
# verbatim for snakemake to fill in per job
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# All expansions are quoted so that paths containing spaces or glob
# characters are passed to conda/snakemake as single arguments.
conda activate "${SMK_ENV}" && \
snakemake -s workflow_fast5/Snakefile -rp --jobs 5 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${SMK_CONDA}" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
config/aquifer/sbatch.sh
deleted
100755 → 0
View file @
90a677ba
#!/bin/bash -l

# slurm settings if called using sbatch
#SBATCH -J ONT_SMK
#SBATCH -N 1
#SBATCH -n 1
#SBATCH -c 1
#SBATCH --time=5-00:00:00
#SBATCH -p batch
#SBATCH -q long

# Launcher for workflow/Snakefile: activates the given conda environment
# and, only if that succeeds, starts snakemake with SLURM job submission.
#
# Positional arguments:
#   $1  conda env name or path
#   $2  where to create snakemake's conda env.s (path)

# conda env name or path
SMK_ENV="$1"
# where to create snakemake's conda env.s (path)
SMK_CONDA="$2"

# config files
SMK_CONFIG="config/aquifer/config.yaml"
SMK_SLURM="config/aquifer/slurm.yaml"

# slurm cluster call; the {cluster.*} and {threads} placeholders are left
# verbatim for snakemake to fill in per job
SMK_CLUSTER="sbatch --partition {cluster.partition} {cluster.qos} {cluster.explicit} --nodes {cluster.nodes} \
--ntasks {cluster.ntasks} --cpus-per-task {threads} --time {cluster.time} --job-name={cluster.job-name}"

# All expansions are quoted so that paths containing spaces or glob
# characters are passed to conda/snakemake as single arguments.
conda activate "${SMK_ENV}" && \
snakemake -s workflow/Snakefile -rp --jobs 7 --local-cores 1 \
    --configfile "${SMK_CONFIG}" --use-conda --conda-prefix "${SMK_CONDA}" \
    --cluster-config "${SMK_SLURM}" --cluster "${SMK_CLUSTER}"
config/aquifer/slurm.fast5.yaml
deleted
100644 → 0
View file @
90a677ba
__default__
:
time
:
"
0-02:00:00"
partition
:
"
batch"
qos
:
"
"
nodes
:
1
ntasks
:
1
explicit
:
"
"
job-name
:
"
ONT_aquifer_FAST5.{rule}"
create_multifast5s
:
time
:
"
00-04:00:00"
partition
:
"
batch"
guppy_gpu_basecalling
:
time
:
"
00-08:00:00"
partition
:
"
gpu"
explicit
:
"
--gres=gpu:1"
\ No newline at end of file
config/aquifer/slurm.yaml
deleted
100644 → 0
View file @
90a677ba
__default__
:
time
:
"
0-02:00:00"
partition
:
"
batch"
qos
:
"
"
nodes
:
1
ntasks
:
1
explicit
:
"
"
job-name
:
"
ONT_aquifer.{rule}"
# Preprocessing
fastp_sr
:
time
:
"
00-00:30:00"
partition
:
"
batch"
rm_rrna_bbmap
:
time
:
"
00-01:30:00"
partition
:
"
batch"
rm_host_bbmap_sr_metat
:
time
:
"
00-02:00:00"
partition
:
"
bigmem"
rm_host_bbmap_sr_metag
:
time
:
"
00-02:00:00"
partition
:
"
bigmem"
rm_host_bbmap_lr_metag
:
time
:
"
00-03:00:00"
partition
:
"
bigmem"
# Assembly
assembly_lr_flye
:
time
:
"
00-14:00:00"
partition
:
"
bigmem"
assembly_lr_canu
:
time
:
"
05-00:00:00"
partition
:
"
bigmem"
qos
:
"
--qos
long"
assembly_lr_raven
:
time
:
"
00-08:00:00"
partition
:
"
bigmem"
assembly_sr_megahit
:
time
:
"
00-08:00:00"
partition
:
"
bigmem"
assembly_sr_metaspades
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
assembly_hy_metaspades
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
assembly_hy_operams
:
time
:
"
08-00:00:00"
partition
:
"
bigmem"
qos
:
"
--qos
long"
# Assembly polishing
polishing_racon_lr
:
time
:
"
00-06:00:00"
partition
:
"
bigmem"
polishing_racon_sr
:
time
:
"
00-06:00:00"
partition
:
"
bigmem"
polishing_medaka_lr
:
time
:
"
01-12:00:00"
partition
:
"
bigmem"
# Mapping
mapping_bwa_idx_asm
:
time
:
"
00-01:00:00"
partition
:
"
batch"
mapping_bwa_mem_asm_sr
:
time
:
"
00-06:00:00"
partition
:
"
bigmem"
mapping_bwa_mem_asm_lr
:
time
:
"
00-12:00:00"
partition
:
"
bigmem"
mapping_bwa_mem_asm_hy
:
time
:
"
00-04:00:00"
partition
:
"
batch"
# Annotation
annotation_prodigal
:
time
:
"
00-03:00:00"
partition
:
"
batch"
annotation_rgi
:
time
:
"
00-02:00:00"
partition
:
"
batch"
annotation_minced
:
time
:
"
00-00:40:00"
partition
:
"
batch"
annotation_plasflow
:
time
:
"
00-01:00:00"
partition
:
"
bigmem"
annotation_barrnap
:
time
:
"
00-00:40:00"
partition
:
"
batch"
annotation_hmm_kegg
:
time
:
"
00-12:00:00"
partition
:
"
bigmem"
# Analysis
analysis_quast
:
time
:
"
00-00:20:00"
partition
:
"
batch"
analysis_mash_sketch_sr
:
time
:
"
00-01:00:00"
partition
:
"
batch"
analysis_mash_sketch_lr
:
time
:
"
00-01:00:00"
partition
:
"
batch"
analysis_mash_reads
:
time
:
"
00-00:30:00"
partition
:
"
batch"
analysis_mash_sketch_asm
:
time
:
"
00-00:20:00"
partition
:
"
batch"
analysis_mash_asm
:
time
:
"
00-00:20:00"
partition
:
"
batch"
analysis_mashmap_one2one
:
time
:
"
00-02:00:00"
partition
:
"
batch"
analysis_fastani_one2one
:
time
:
"
00-02:00:00"
partition
:
"
batch"
analysis_mummer_dnadiff
:
time
:
"
00-04:00:00"
partition
:
"
bigmem"
analysis_cdhit
:
time
:
"
00-02:00:00"
partition
:
"
batch"
analysis_diamond
:
time
:
"
00-02:00:00"
partition
:
"
batch"
analysis_diamond_db
:
time
:
"
00-12:00:00"
partition
:
"
batch"
analysis_genomecov_pergene
:
time
:
"
01-00:00:00"
partition
:
"
batch"
analysis_genomecov_segmentation
:
time
:
"
00-12:00:00"
partition
:
"
bigmem"
# Taxonomy
tax_kraken2_contigs
:
time
:
"
00-00:30:00"
partition
:
"
bigmem"
tax_kraken2_sr
:
time
:
"
00-01:00:00"
partition
:
"
bigmem"
tax_kraken2_lr
:
time
:
"
00-01:00:00"
partition
:
"
bigmem"
tax_kaiju
:
time
:
"
00-00:30:00"
partition
:
"
bigmem"
tax_kaiju_summary
:
time
:
"
00-00:30:00"
partition
:
"
batch"
config/gcall/config.yaml
deleted
100644 → 0
View file @
90a677ba
work_dir
:
"
/mnt/lscratch/users/vgalata/gene_calling_benchmark"
# reference genomes
refs
:
GCF_011456075.1_ASM1145607v1
:
"
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/011/456/075/GCF_011456075.1_ASM1145607v1"
GCA_000006765.1_ASM676v1
:
"
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/006/765/GCA_000006765.1_ASM676v1"
GCA_000005845.2_ASM584v2
:
"
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/005/845/GCA_000005845.2_ASM584v2"
GCA_014334155.1_ASM1433415v1
:
"
https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/014/334/155/GCA_014334155.1_ASM1433415v1"
# # file extensions for reference genomes
# refs_ext:
# fna: "_genomic.fna.gz"
# cds: "_cds_from_genomic.fna.gz"
# faa: "_translated_cds.faa.gz"
diamond
:
threads
:
5
blast
:
threads
:
5
\ No newline at end of file
workflow_gcall/Snakefile
deleted
100644 → 0
View file @
90a677ba
# Pipeline for gene calling performance analysis using ref. data (indep. of other workflows)
#
# Example call:
# snakemake -s workflow_gcall/Snakefile --configfile config/gcall/config.yaml --use-conda --conda-prefix /scratch/users/vgalata/miniconda3/ONT_pilot --cores 1 -rpn
##############################
# MODULES
import os
import re
import pandas
##############################
# CONFIG
# Paths
# Locations of the conda environment specs and helper scripts, obtained via srcdir()
ENV_DIR = srcdir("envs")
SRC_DIR = srcdir("scripts")

# Working directory of the workflow, taken from the config file
workdir: config["work_dir"]
##################################################
# RULES
##################################################
# Target rule: requests the final blastn result table
rule all:
    input:
        # blastn
        blastn="blastn/ref_vs_prodigal.tsv"
# Download one gzipped reference file "<acc>_<ext>.gz" from the base URL that
# config["refs"] lists for the accession, then decompress it to refs/<acc>_<ext>.
# Timestamps and all wget/gunzip output are collected in the log file.
rule download_ref:
    output:
        "refs/{acc}_{ext}"
    log:
        "refs/{acc}_{ext}.log"
    wildcard_constraints:
        # Restrict {acc} to the configured accession IDs. Each ID is escaped so
        # that regex metacharacters in it (e.g. ".") are matched literally.
        acc="|".join(re.escape(acc) for acc in config["refs"])
    params:
        # Explicit "/" join: this is a URL, so a filesystem path helper with a
        # platform-dependent separator is the wrong tool.
        url=lambda wildcards: "%s/%s_%s.gz" % (
            config["refs"][wildcards.acc].rstrip("/"),
            wildcards.acc,
            wildcards.ext,
        )
    message:
        "Gene calling: download reference {wildcards.acc} ({wildcards.ext})"
    shell:
        "(date && wget {params.url} -O {output}.gz && gunzip {output}.gz && date) &> {log}"
# Concatenate the per-accession files of one type ({ext}) into refs/ref_{ext}
rule concat_ref:
    input:
        # one input per configured accession; {ext} stays a wildcard
        expand("refs/{acc}_{{ext}}", acc=list(config["refs"]))
    output:
        "refs/ref_{ext}"
    message:
        "Gene calling: concat references ({wildcards.ext})"
    shell:
        "cat {input} > {output}"
# Run Prodigal (-p meta) on the concatenated reference genomes; the -a output
# goes to the .faa file, everything printed by the command goes to the log
rule prodigal_ref:
    input:
        "refs/ref_genomic.fna"
    output:
        "refs/ref_genomic.prodigal.faa"
    conda:
        os.path.join(ENV_DIR, "prodigal.yaml")
    log:
        "refs/ref_genomic.prodigal.log"
    message:
        "Gene calling: run Prodigal on references"
    shell:
        "(date && prodigal -a {output} -p meta -i {input} && date) &> {log}"
# Hand the reference genomes and the Prodigal protein file to the helper
# script genes_from_prodigal_prots.py, which produces the .fa output
rule prodigal_genes_ref:
    input:
        fna="refs/ref_genomic.fna",
        faa="refs/ref_genomic.prodigal.faa"
    output:
        "refs/ref_genomic.prodigal.fa"
    conda:
        os.path.join(ENV_DIR, "biopython.yaml")
    params:
        prefix=""
    message:
        "Gene calling: extract reference genes from Prodigal proteins"
    script:
        os.path.join(SRC_DIR, "genes_from_prodigal_prots.py")
# Build a nucleotide BLAST database (makeblastdb -dbtype nucl) from the
# concatenated reference CDS FASTA file.
rule blastn_db_ref:
input:
"refs/ref_cds_from_genomic.fna"
output:
# one declared output file per listed database file extension
expand(
"refs/ref_cds_from_genomic.blast.{ext}",
ext=["ndb","nhr","nin","not","nsq","ntf","nto"]
)
log:
"refs/ref_cds_from_genomic.blast.log"
threads:
config["blast"]["threads"]
conda:
os.path.join(ENV_DIR, "blast.yaml")
message:
"Gene calling: reference BLAST nucl. DB"
shell:
# db is set to the first output file; ${db%.*} (braces doubled so Snakemake
# leaves them for the shell) strips its last extension to form the -out prefix.
# NOTE(review): makeblastdb's -logfile and the surrounding "&>" redirection
# both write to {log} - confirm that one does not overwrite the other.
"(date && db={output[0]} && makeblastdb -dbtype 'nucl' -in {input} -input_type 'fasta' -out ${{db%.*}} -logfile {log} && date) &> {log}"