Unverified Commit 3f3ed4f5 authored by AntonieV's avatar AntonieV Committed by GitHub
Browse files

trimming with cutadapt added (#1)

* trimming with cutadapt added

* additions to cutadapt according to View #1, BWA added as draft

* adding cutadapt comments back, bwa mem will be moved to a new branch
parent 8c4ebe50
......@@ -3,3 +3,23 @@
# one row per sample. It can be parsed easily via pandas.
samples: "config/samples.tsv"
units: "config/units.tsv"
params:
# these cutadapt parameters need to contain the required flag(s) for
# the type of adapter(s) to trim, i.e.:
# * https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types
# * `-a` for 3' adapter in the forward reads
# * `-g` for 5' adapter in the forward reads
# * `-b` for adapters anywhere in the forward reads
# also, separate capitalised letter flags are required for adapters in
# the reverse reads of paired end sequencing:
# * https://cutadapt.readthedocs.io/en/stable/guide.html#trimming-paired-end-reads
cutadapt-se: "-g AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
# reasoning behind the parameters in `cutadapt-others`:
# * `-e 0.005`: the default cutadapt maximum error rate of `0.1` is far too high; for Illumina
# data the error rate is more in the range of `0.005`, and setting it accordingly should avoid
# false positive adapter matches
# * `--overlap 7`: the cutadapt default minimum overlap of `3` led to trimming on the level
# of expected adapter matches by chance
cutadapt-pe: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA -g AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT -G AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT"
cutadapt-others: "-e 0.005 --overlap 7"
......@@ -3,6 +3,7 @@
include: "rules/common.smk"
include: "rules/qc.smk"
include: "rules/cutadapt.smk"
def all_input(wildcards):
......@@ -10,6 +11,30 @@ def all_input(wildcards):
wanted_input.extend(["results/qc/multiqc/multiqc.html"])
for (sample, unit) in units.index:
if is_single_end(sample, unit):
wanted_input.extend(expand(
[
"results/trimmed/{sample}-{unit}.fastq.gz",
"results/trimmed/{sample}-{unit}.se.qc.txt"
],
sample = sample,
unit = unit
)
)
else:
wanted_input.extend(
expand (
[
"results/trimmed/{sample}-{unit}.1.fastq.gz",
"results/trimmed/{sample}-{unit}.2.fastq.gz",
"results/trimmed/{sample}-{unit}.pe.qc.txt"
],
sample = sample,
unit = unit
)
)
return wanted_input
rule all:
......
......@@ -70,3 +70,11 @@ def get_multiqc_input(wildcards):
)
)
return multiqc_input
def get_fastqs(wildcards):
    """Look up the raw FASTQ file(s) of one sample/unit in the unit sheet.

    For a single-end unit the `fq1` entry is returned as-is; otherwise a
    two-element list holding the `fq1` and `fq2` entries as strings.
    """
    unit_key = (wildcards.sample, wildcards.unit)
    if is_single_end(*unit_key):
        return units.loc[unit_key, "fq1"]

    # Paired-end: select both mate columns and discard missing entries.
    mates = units.loc[unit_key, ["fq1", "fq2"]].dropna()
    return [str(mate) for mate in (mates.fq1, mates.fq2)]
# Trim the paired-end reads of one sample/unit via the cutadapt wrapper.
# The raw FASTQ files are resolved per sample/unit by `get_fastqs`.
rule cutadapt_pe:
    input:
        get_fastqs,
    output:
        fastq1="results/trimmed/{sample}-{unit}.1.fastq.gz",
        fastq2="results/trimmed/{sample}-{unit}.2.fastq.gz",
        qc="results/trimmed/{sample}-{unit}.pe.qc.txt",
    params:
        # Adapter flags and the remaining cutadapt options both come from
        # the `params` section of the workflow config.
        adapters=config["params"]["cutadapt-pe"],
        others=config["params"]["cutadapt-others"],
    log:
        "results/logs/cutadapt/{sample}-{unit}.log",
    wrapper:
        "0.52.0/bio/cutadapt/pe"
# Trim the single-end reads of one sample/unit via the cutadapt wrapper.
# The raw FASTQ file is resolved per sample/unit by `get_fastqs`.
rule cutadapt_se:
    input:
        get_fastqs,
    output:
        fastq="results/trimmed/{sample}-{unit}.fastq.gz",
        qc="results/trimmed/{sample}-{unit}.se.qc.txt",
    params:
        # A single positional string: the single-end adapter flags followed
        # by the shared cutadapt options, separated by one space.
        " ".join(
            str(config["params"][key])
            for key in ("cutadapt-se", "cutadapt-others")
        ),
    log:
        "results/logs/cutadapt/{sample}-{unit}.log",
    wrapper:
        "0.52.0/bio/cutadapt/se"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment