Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
ESB
ONT_pilot_gitlab
Commits
6cbf5ed6
Commit
6cbf5ed6
authored
Jun 10, 2020
by
Valentina Galata
Browse files
init setup for preprocessing and assembly
parent
ab685564
Changes
9
Hide whitespace changes
Inline
Side-by-side
Snakefile
View file @
6cbf5ed6
...
...
@@ -39,7 +39,7 @@ INPUT_FAST5 = []
INPUT_FAST5 += [os.path.abspath(f) for f in config["data"]["ont"]["files"]]
for ont_d in config["data"]["ont"]["dirs"]:
INPUT_FAST5 += [os.path.abspath(f) for f in Path(ont_d).rglob('*.fast5')]
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
#
INPUT_FAST5 = INPUT_FAST5[0:10] # NOTE TEST
# SR files
INPUT_SR = config["data"]["sr"]
# MetaT files
...
...
@@ -49,19 +49,14 @@ INPUT_METAT = config["data"]["metat"]
DATA_FAST5 = [os.path.join(RESULTS_DIR, "input_ont", os.path.basename(f)) for f in INPUT_FAST5]
assert len(DATA_FAST5) == len(set(DATA_FAST5)), "Created link names for FAST5 files are NOT unique: {}".format(DATA_FAST5)
DATA_SR = {
"r1": os.path.join(RESULTS_DIR, "input_sr
", os.path.basename(INPUT_SR["r1"])
),
"r2": os.path.join(RESULTS_DIR, "input_sr
", os.path.basename(INPUT_SR["r2"])
)
"r1": os.path.join(RESULTS_DIR, "input_sr
/R1.fq.gz"
),
"r2": os.path.join(RESULTS_DIR, "input_sr
/R2.fq.gz"
)
}
DATA_METAT = {
"r1": os.path.join(RESULTS_DIR, "input_metat
", os.path.basename(INPUT_METAT["r1"])
),
"r2": os.path.join(RESULTS_DIR, "input_metat
", os.path.basename(INPUT_METAT["r2"])
)
"r1": os.path.join(RESULTS_DIR, "input_metat
/R1.fq.gz"
),
"r2": os.path.join(RESULTS_DIR, "input_metat
/R2.fq.gz"
)
}
# # Tools
# ASSEMBLERS = config["assemblers"]
# HYBRID_ASSEMBLER = config["hybrid_assembler"]
# MAPPERS = ["bwa", "mmi"]
##############################
# TARGETS & RULES
# List of targets to be created
...
...
@@ -74,11 +69,22 @@ include:
"workflow/steps/prepare_input.smk"
# TARGETS.append("status/prepare_input.done")
#
Basecall
ing
#
Preprocess
ing
if "preprocessing" in STEPS:
include:
"workflow/steps/preprocessing.smk"
TARGETS.append("status/preprocessing_lr.done")
TARGETS += [
"status/preprocessing_lr.done",
"status/preprocessing_sr.done"
]
# Assembly
if "assembly" in STEPS:
include:
"workflow/steps/assembly.smk"
TARGETS += [
"status/assembly.done"
]
# # Assembly annotation
# if 'assembly_annotation' in STEPS:
...
...
config/config.yaml
View file @
6cbf5ed6
...
...
@@ -3,7 +3,7 @@
# Pipeline steps
# steps: ["assembly_annotation", "mapping", "metaT", "mmseq", "binning", "taxonomy", "analysis"]
steps
:
[
"
preprocessing"
]
steps
:
[
"
preprocessing"
,
"
assembly"
]
# Analysis sub-steps
analysis_steps
:
[
"
cdhit"
,
"
mappability"
,
"
crispr"
,
"
plasmids"
,
"
amr"
]
...
...
@@ -15,7 +15,7 @@ analysis_steps: ["cdhit", "mappability", "crispr", "plasmids", "amr"]
# work_dir: "/scratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB"
work_dir
:
"
/scratch/users/vgalata/ont_pilot"
#
p
aths
within
the working directory
#
P
aths
WITHIN
the working directory
# directory containing required DBs
db_dir
:
"
dbs"
# results directory
...
...
@@ -38,16 +38,14 @@ data:
# List of FAST5 files
files
:
[]
binning_samples
:
[
"
flye"
,
"
megahit"
,
"
bwa_sr_metaspades_hybrid"
,
"
bwa_lr_metaspades_hybrid"
,
"
bwa_merged_metaspades_hybrid"
,
"
mmi_sr_metaspades_hybrid"
,
"
mmi_lr_metaspades_hybrid"
,
"
mmi_merged_metaspades_hybrid"
]
#
binning_samples: ["flye", "megahit", "bwa_sr_metaspades_hybrid", "bwa_lr_metaspades_hybrid", "bwa_merged_metaspades_hybrid", "mmi_sr_metaspades_hybrid", "mmi_lr_metaspades_hybrid", "mmi_merged_metaspades_hybrid"]
############################################################
# TOOLS
# Basecalling
#
Preprocessing: LR:
Basecalling
guppy
:
config
:
methylation_aware
:
"
dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# not_methylation_aware: "dna_r9.4.1_450bps_hac.cfg"
config
:
"
dna_r9.4.1_450bps_modbases_dam-dcm-cpg_hac.cfg"
# cpu:
# path: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin"
# bin: "/scratch/users/claczny/ont/apps/software/ont-guppy-cpu-3.1.5_linux64/bin/guppy_basecaller"
...
...
@@ -63,8 +61,7 @@ guppy:
num_callers
:
4
runners_per_device
:
2
gpu_device
:
"
cuda:0"
# threads: 28
threads
:
5
threads
:
20
# barcoder:
# path: "/home/users/sbusi/apps/ont-guppy/bin"
# bin: "set +u; source ~/.bashrc; set -u; ml compiler/LLVM system/CUDA && /home/users/sbusi/apps/ont-guppy/bin/guppy_barcoder"
...
...
@@ -72,132 +69,147 @@ guppy:
# records_per_fastq: 8000
# threads: 8
# assemblers: ["flye"]
assemblers
:
sr
:
[
"
megahit"
,
"
metaspades"
]
lr
:
[
"
flye"
]
hy
:
[
"
metaspadeshybrid"
]
p7zip
:
bin
:
"
/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
threads
:
4
ont_fast5_api
:
single_to_multi_fast5
:
bin
:
"
single_to_multi_fast5"
batch
:
8000
threads
:
8
nanostats
:
# Preprocessing: SR
fastp
:
threads
:
10
min_length
:
40
minimap2
:
threads
:
16
igc
:
uri
:
"
parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
hg38
:
uri
:
"
ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
genomecov
:
bin
:
"
bedtools
genomecov"
compute_avg_coverage
:
bin
:
"
scripts/coverage.awk"
bwa
:
threads
:
24
long_reads_index
:
opts
:
"
-aY
-A
5
-B
11
-O
2,1
-E
4,3
-k
8
-W
16
-w
40
-r
1
-D
0
-y
20
-L
30,30
-T
2.5"
# QC
fastqc
:
threads
:
10
params
:
"
-q
-f
fastq"
samtools
:
sort
:
threads
:
4
chunk_size
:
"
4G"
view
:
threads
:
4
# Assembly
assemblers
:
sr
:
[
"
megahit"
,
"
metaspades"
]
lr
:
[
"
flye"
]
hy
:
[
"
metaspadeshybrid"
]
flye
:
bin
:
"
flye"
threads
:
27
threads
:
10
genome_size
:
"
1g"
operams
:
bin
:
"
set
+u;
source
~/.bashrc;
set
-u;
ml
lang/Perl
lang/R
&&
perl
/scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
threads
:
28
metaspades
:
threads
:
10
megahit
:
threads
:
28
nonpareil
:
memory
:
4096
threads
:
14
medaka
:
threads
:
28
racon
:
threads
:
28
rebaler
:
threads
:
28
diamond
:
threads
:
28
#db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
db
:
"
/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
metaspades
:
threads
:
28
mmseq2
:
threads
:
24
# Hybrid assembler
hybrid_assembler
:
"
metaspades_hybrid"
# Number of cpus or threads to use
threads
:
28
# kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
kraken2
:
db
:
"
/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# Binning
DAS_Tool
:
path
:
"
/home/users/sbusi/apps/DAS_Tool-master"
bin
:
"
/home/users/sbusi/apps/DAS_Tool-master/src/"
db
:
"
/home/users/sbusi/apps/DAS_Tool-master/db/"
Rscript
:
"
/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# XXX
GTDBTK
:
DATA
:
"
/home/users/sbusi/apps/db/gtdbtk/release89"
# XXX
mmseqs
:
path
:
"
/home/users/sbusi/apps/mmseqs/bin"
createdb
:
"
/home/users/sbusi/apps/mmseqs/bin/mmseqs
createdb"
rbh
:
"
/home/users/sbusi/apps/mmseqs/bin/mmseqs
rbh"
convertalis
:
"
/home/users/sbusi/apps/mmseqs/bin/mmseqs
convertalis"
# CRISPR
CASC
:
PATH
:
"
$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
PERL5LIB
:
"
/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
minced
:
PATH
:
"
$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# Plasmid prediction
plasflow
:
threshold
:
0.7
# class. prob. threshold
minlen
:
1000
# rm contigs with length below this threshold
# AMR prediction
rgi
:
db_url
:
"
https://card.mcmaster.ca/latest/data"
alignment_tool
:
"
DIAMOND"
# DIAMOND or BLAST
threads
:
10
# p7zip:
# bin: "/home/users/claczny/apps/software/p7zip_16.02/bin/7za"
# threads: 4
# ont_fast5_api:
# single_to_multi_fast5:
# bin: "single_to_multi_fast5"
# batch: 8000
# threads: 8
# minimap2:
# threads: 16
# igc:
# uri: "parrot.genomics.cn/gigadb/pub/10.5524/100001_101000/100064/1.GeneCatalogs/IGC.fa.gz"
# hg38:
# uri: "ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/405/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.fna.gz"
# genomecov:
# bin: "bedtools genomecov"
# compute_avg_coverage:
# bin: "scripts/coverage.awk"
# bwa:
# threads: 24
# long_reads_index:
# opts: "-aY -A 5 -B 11 -O 2,1 -E 4,3 -k 8 -W 16 -w 40 -r 1 -D 0 -y 20 -L 30,30 -T 2.5"
# samtools:
# sort:
# threads: 4
# chunk_size: "4G"
# view:
# threads: 4
# flye:
# bin: "flye"
# threads: 27
# genome_size: "1g"
# operams:
# bin: "set +u; source ~/.bashrc; set -u; ml lang/Perl lang/R && perl /scratch/users/claczny/ont/apps/software/OPERA-MS/OPERA-MS.pl"
# threads: 28
# megahit:
# threads: 28
# nonpareil:
# memory: 4096
# threads: 14
# medaka:
# threads: 28
# racon:
# threads: 28
# rebaler:
# threads: 28
# diamond:
# threads: 28
# #db: "/mnt/isilon/projects/ecosystem_biology/NOMIS/DIAMOND/new_nr.dmnd"
# db: "/work/projects/ecosystem_biology/local_tools/databases/nr_uniprot_trembl.dmnd"
# metaspades:
# threads: 28
# mmseq2:
# threads: 24
# # Hybrid assembler
# hybrid_assembler: "metaspades_hybrid"
# # Number of cpus or threads to use
# threads: 28
# # kraken2_database: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# kraken2:
# db: "/scratch/users/bkunath/Kraken2/maxikraken2_1903_140GB/"
# # Binning
# DAS_Tool:
# path: "/home/users/sbusi/apps/DAS_Tool-master"
# bin: "/home/users/sbusi/apps/DAS_Tool-master/src/"
# db: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # Rscript: "/home/users/sbusi/apps/miniconda3/envs/dastool/bin/"
# # dastool_database: "/home/users/sbusi/apps/DAS_Tool-master/db/"
# # XXX
# GTDBTK:
# DATA: "/home/users/sbusi/apps/db/gtdbtk/release89"
# # XXX
# mmseqs:
# path: "/home/users/sbusi/apps/mmseqs/bin"
# createdb: "/home/users/sbusi/apps/mmseqs/bin/mmseqs createdb"
# rbh: "/home/users/sbusi/apps/mmseqs/bin/mmseqs rbh"
# convertalis: "/home/users/sbusi/apps/mmseqs/bin/mmseqs convertalis"
# # CRISPR
# CASC:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/bin"
# PERL5LIB: "/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/lib/site_perl"
# minced:
# PATH: "$PATH:/mnt/lscratch/users/sbusi/ONT/cedric_ont_basecalling/2019_GDB/crispr/minced/"
# # Plasmid prediction
# plasflow:
# threshold: 0.7 # class. prob. threshold
# minlen: 1000 # rm contigs with length below this threshold
# # AMR prediction
# rgi:
# db_url: "https://card.mcmaster.ca/latest/data"
# alignment_tool: "DIAMOND" # DIAMOND or BLAST
config/slurm.yaml
View file @
6cbf5ed6
__default__
:
time
:
"
2
-0
0
:00:00"
time
:
"
0
-0
2
:00:00"
partition
:
"
batch"
qos
:
"
qos-batch"
nodes
:
1
n
:
1
# ncpus: 1
job-name
:
"
ONT_pilot.{rule}"
explicit
:
"
"
#
job-name: "ONT_pilot.{rule}"
# output: "slurm-%j.%N-%x.out"
# error: "slurm-%j.%N-%x.err"
explicit
:
"
"
# mail-type: "end"
# Preprocessing
guppy_gpu_basecalling
:
time
:
"
01-00:00:00"
partition
:
"
gpu"
qos
:
"
qos-gpu"
nodes
:
1
n
:
1
# ncpus: 28
explicit
:
"
--gres=gpu:1"
# "run_fastp_on_short_reads":
# {
# "time": "00-04:00:00",
# "n": 1,
# "ncpus": 3,
# "partition": "batch",
# "qos": "qos-batch",
# "mail-type": "ALL"
# },
fastp_sr
:
time
:
"
00-04:00:00"
partition
:
"
batch"
qos
:
"
qos-batch"
nodes
:
1
n
:
1
explicit
:
"
"
# Assembly
assembly_lr_flye
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
qos
:
"
qos-bigmem"
nodes
:
1
n
:
1
explicit
:
"
"
assembly_sr_megahit
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
qos
:
"
qos-bigmem"
nodes
:
1
n
:
1
explicit
:
"
"
assembly_sr_metaspades
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
qos
:
"
qos-bigmem"
nodes
:
1
n
:
1
explicit
:
"
"
assemble_hy_metaspades
:
time
:
"
01-00:00:00"
partition
:
"
bigmem"
qos
:
"
qos-bigmem"
nodes
:
1
n
:
1
explicit
:
"
"
# "mmseq2_compare":
# {
# "n": 1,
...
...
sbatch.sh
View file @
6cbf5ed6
...
...
@@ -18,7 +18,7 @@
# conda env name
ONTP_ENV
=
"ONT_pilot"
# number of cores for snakemake
ONTP_CORES
=
3
0
ONTP_CORES
=
1
0
# IMP config file
ONTP_CONFIG
=
"config/config.yaml"
# USER INPUT REQUIRED
# slurm config file
...
...
workflow/rules/assembly.smk
0 → 100644
View file @
6cbf5ed6
# Assembly
# Long reads
rule assembly_lr_flye:
input:
os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/lr/flye/assembly.fna")
threads:
config["flye"]["threads"]
log:
out="logs/assembly_lr.flye.out.log",
err="logs/assembly_lr.flye.err.log"
conda:
"../envs/flye_v2_7.yaml"
message:
"Assembly: long reads: Flye"
shell:
"(date && flye --nano-raw {input} --meta --out-dir $(dirname {output}) --genome-size {config[flye][genome_size]} --threads {threads} && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf assembly.fasta $(basename {output})"
# Short reads
rule assembly_sr_megahit:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/megahit/assembly.fna")
log:
out="logs/assembly_sr.megahit.out.log",
err="logs/assembly_sr.megahit.err.log"
threads:
config["megahit"]["threads"]
conda:
os.path.join(ENV_DIR, "megahit.yaml")
message:
"Assembly: short reads: MEGAHIT"
shell:
"(date && megahit -1 {input.r1} -2 {input.r2} -t {threads} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf final.contigs.fa $(basename {output})"
rule assembly_sr_metaspades:
input:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/sr/metaspades/assembly.fna")
log:
out="logs/assembly_sr.metaspades.out.log",
err="logs/assembly_sr.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: short reads: MetaSPAdes"
shell:
"(date && metaspades.py -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
# Hybrid
rule assemble_hy_metaspades:
input:
lr=os.path.join(RESULTS_DIR, "preproc/lr/lr.fastq.gz"),
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz")
output:
os.path.join(RESULTS_DIR, "assembly/hy/metaspadeshybrid/assembly.fna")
log:
out="logs/assembly_hy.metaspades.out.log",
err="logs/assembly_hy.metaspades.err.log"
threads:
config["metaspades"]["threads"]
conda:
os.path.join(ENV_DIR, "spades.yaml")
message:
"Assembly: hybrid: MetaSPAdes"
shell:
"(date && spades.py --meta -k 21,33,55,77 -t {threads} -1 {input.r1} -2 {input.r2} --nanopore {input.lr} -o $(dirname {output}) && date) 2> {log.err} > {log.out} && "
"cd $(dirname {output}) && ln -sf contigs.fasta $(basename {output})"
workflow/rules/preprocessing_lr.smk
View file @
6cbf5ed6
# Preprocessing of long reads
# Basecalling
# Guppy
checkpoint guppy_gpu_basecalling:
input:
DATA_FAST5
output:
directory(os.path.join(RESULTS_DIR, "
basecalling/{guppy_config}
/checkpoints"))
directory(os.path.join(RESULTS_DIR, "
preproc/lr
/checkpoints"))
log:
out="logs/basecalling.{guppy_config}.out.log",
err="logs/basecalling.{guppy_config}.err.log"
wildcard_constraints:
guppy_config="|".join(config["guppy"]["config"].keys())
out="logs/preproc_lr.guppy.out.log",
err="logs/preproc_lr.guppy.err.log"
threads:
config["guppy"]["gpu"]["threads"]
params
:
config=lambda wildcards: config["guppy"]["config"][wildcards.guppy_config]
message
:
"Preprocessing long reads: Basecalling w/ Guppy"
shell:
"""
(date && \
{config[guppy][gpu][bin]} --input_path $(dirname {input[0]}) --save_path $(dirname {output}) \
--config
{params.
config
}
\
--config
config[guppy][
config
]
\
--disable_pings --compress_fastq \
--cpu_threads_per_caller {threads} \
-x {config[guppy][gpu][gpu_device]} \
...
...
@@ -33,8 +32,7 @@ checkpoint guppy_gpu_basecalling:
def aggregate_guppy_basecalling(wildcards):
checkpoint_output = checkpoints.guppy_gpu_basecalling.get(**wildcards).output[0]
return expand(
os.path.join(RESULTS_DIR, "basecalling/{guppy_config}/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
guppy_config=wildcards.guppy_config,
os.path.join(RESULTS_DIR, "preproc/lr/checkpoints/fastq_runid_{runid_i_j}.fastq.gz"),
runid_i_j=glob_wildcards(os.path.join(checkpoint_output, "fastq_runid_{runid_i_j}.fastq.gz")).runid_i_j,
)
...
...
@@ -42,22 +40,24 @@ rule merge_guppy_basecalling:
input:
aggregate_guppy_basecalling
output:
os.path.join(RESULTS_DIR, "
basecalling/{guppy_config}
/lr.fastq.gz")
wildcard_constraints
:
guppy_config="|".join(config["guppy"]["config"].keys())
os.path.join(RESULTS_DIR, "
preproc/lr
/lr.fastq.gz")
message
:
"Preprocessing long reads: Cat FASTQ"
shell:
"cat $(echo \"{input}\" | sort) > {output}"
# QC
rule nanostat:
rule nanostat
_guppy_basecalling
:
input:
os.path.join(RESULTS_DIR, "
basecalling/{guppy_config}
/lr.fastq.gz")
os.path.join(RESULTS_DIR, "
preproc/lr
/lr.fastq.gz")
output:
os.path.join(RESULTS_DIR, "qc/lr/
{guppy_config}/
NanoStats.txt")
os.path.join(RESULTS_DIR, "qc/lr/NanoStats.txt")
log:
out="logs/
nanostats.{guppy_config}
.out.log",
err="logs/
nanostats.{guppy_config}
.err.log"
out="logs/
preproc_lr.nanostats
.out.log",
err="logs/
preproc_lr.nanostats
.err.log"
conda:
os.path.join(ENV_DIR, "nanostat.yaml")
message:
"Preprocessing long reads: NanoStats"
shell:
"(date && NanoStat --fastq {input} --outdir $(dirname {output}) -n $(basename {output}) && date) 2> {log.err} > {log.out}"
workflow/rules/preprocessing_sr.smk
0 → 100644
View file @
6cbf5ed6
# Preprocessing of short reads
# Preprocess the short reads using fastp
rule fastp_sr:
input:
r1=DATA_SR["r1"],
r2=DATA_SR["r2"]
output:
r1=os.path.join(RESULTS_DIR, "preproc/sr/R1.fastp.fastq.gz"),
r2=os.path.join(RESULTS_DIR, "preproc/sr/R2.fastp.fastq.gz"),
html=os.path.join(RESULTS_DIR, "preproc/sr/fastp.html"),
json=os.path.join(RESULTS_DIR, "preproc/sr/fastp.json")
log:
out="logs/preproc_sr.fastp.out.log",
err="logs/preproc_sr.fastp.err.log"
threads:
config["fastp"]["threads"]
conda:
os.path.join(ENV_DIR, "fastp.yaml")
message:
"Preprocessing short reads: FastP"
shell:
"(date && fastp -l {config[fastp][min_length]} -i {input.r1} -I {input.r2} -o {output.r1} -O {output.r2} -h {output.html} -j {output.json} -w {threads} && date) 2> {log.err} > {log.out}"
rule fastqc_fastp_sr:
input:
os.path.join(RESULTS_DIR, "preproc/sr/{rid}.fastp.fastq.gz")