Commit f639b76c authored by AntonieV's avatar AntonieV
Browse files

fastqc integration in workflow

parent 394bdf80
......@@ -2,3 +2,7 @@
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
samples: "config/samples.tsv"
units: "config/units.tsv"
# Directory containing the read files (*.fq) that FastQC is run on
reads_dir: "data/reads"
sample condition
A untreated
B treated
sample condition batch_effect
A treated batch1
B untreated batch1
C treated batch2
D untreated batch2
sample unit fragment_len_mean fragment_len_sd fq1 fq2
A 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
B 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
B 2 300 14 raw/b.chr21.1.fq
C 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
D 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
# The main entry point of your workflow.
# The main entry point of the workflow.
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
from pathlib import Path
import glob
include: "rules/common.smk"
include: "rules/qc.smk"
report: "report/workflow.rst"
# Collect the base name (file name without directory and without the final
# ".fq" extension) of every *.fq file in the configured reads directory.
path = glob.glob(config["reads_dir"] + "/*.fq")
reads_names = {Path(fq_file).stem for fq_file in path}
# Allow users to fix the underlying OS via singularity.
singularity: "docker://continuumio/miniconda3"
def all_input(wildcards):
    """Return the default target files of the workflow.

    For every read name collected in ``reads_names`` this requests the FastQC
    zip archive and the FastQC HTML report.

    Args:
        wildcards: Passed in by Snakemake when the function is used as rule
            input; not used here.

    Returns:
        list: Paths of all requested result files.
    """
    wanted_input = []

    # Further targets can be appended to wanted_input below.
    wanted_input.extend(
        expand(
            [
                "results/qc/fastqc/{filename}.fq_fastqc.zip",
                "results/qc/fastqc/reports/{filename}.fq.html",
            ],
            # sorted so the target order does not depend on set iteration order
            filename=sorted(reads_names),
        )
    )
    return wanted_input
include: "rules/common.smk"
include: "rules/other.smk"
# Default target rule: its inputs are the files returned by all_input.
rule all:
    input:
        all_input
......@@ -10,6 +10,44 @@ singularity: "docker://continuumio/miniconda3"
configfile: "config/config.yaml"
validate(config, schema="../schemas/config.schema.yaml")
# Sample sheet: every column is read as a string (dtype=str), the "sample"
# column becomes the index (and stays available as a column), and each row is
# validated against the samples schema.
samples = pd.read_csv(config["samples"], sep="\t", dtype=str).set_index(
    "sample", drop=False
)
samples.index.names = ["sample_id"]
validate(samples, schema="../schemas/samples.schema.yaml")

# Unit sheet: indexed by the (sample, unit) pair, again read as strings and
# validated against the units schema.
units = pd.read_csv(config["units"], sep="\t", dtype=str).set_index(
    ["sample", "unit"], drop=False
)
units.index.names = ["sample_id", "unit_id"]
# enforce str in index
units.index = units.index.set_levels([i.astype(str) for i in units.index.levels])
validate(units, schema="../schemas/units.schema.yaml")
report: "../report/workflow.rst"
##### wildcard constraints #####
# Restrict the {sample} and {unit} wildcards to the values listed in the
# sample sheet index and the "unit" column of the unit sheet. The same unit id
# may occur in many rows, so each id is listed only once in the alternation.
wildcard_constraints:
    sample="|".join(samples.index),
    unit="|".join(units["unit"].unique())
####### helpers ###########
def is_single_end(sample, unit):
    """Tell whether a unit has no second FASTQ file, i.e. is single-end.

    Args:
        sample: Sample id, first level of the ``units`` index.
        unit: Unit id, second level of the ``units`` index.

    Returns:
        The result of ``pd.isnull`` on the unit's ``fq2`` entry: true when
        ``fq2`` is empty.

    Raises:
        ValueError: If the lookup yields a Series instead of a single value,
            which happens when the sample-unit combination occurs more than
            once in the unit sheet.
    """
    fq2_missing = pd.isnull(units.loc[(sample, unit), "fq2"])
    if not isinstance(fq2_missing, pd.Series):
        return fq2_missing
    # A Series means several rows matched; get_fastqs cannot work properly then.
    raise ValueError(
        f"Multiple fq2 entries found for sample-unit combination {sample}-{unit}.\n"
        "This is most likely due to a faulty units.tsv file, e.g. "
        "a unit name is used twice for the same sample.\n"
        "Try checking your units.tsv for duplicates."
    )
def get_fastqs(wildcards):
    """Look up the raw FASTQ file(s) of a unit in the unit sheet.

    Args:
        wildcards: Object providing the ``sample`` and ``unit`` attributes
            that select the row of ``units``.

    Returns:
        The ``fq1`` entry for a single-end unit, otherwise a two-element list
        with the ``fq1`` and ``fq2`` entries as strings.
    """
    key = (wildcards.sample, wildcards.unit)
    if is_single_end(*key):
        return units.loc[key, "fq1"]
    pair = units.loc[key, ["fq1", "fq2"]].dropna()
    return [f"{pair.fq1}", f"{pair.fq2}"]
# An example collection of Snakemake rules imported in the main Snakefile.
# Run the FastQC wrapper on one read file from the configured reads directory,
# producing an HTML report and a zip archive per read file.
rule fastqc:
    input:
        # {path} is filled in by expand(); the doubled braces keep {read}
        # as a wildcard of the rule.
        expand("{path}/{{read}}.fq", path=config["reads_dir"])
    output:
        html="results/qc/fastqc/reports/{read}.fq.html",
        zip="results/qc/fastqc/{read}.fq_fastqc.zip"
    params:
        ""
    log:
        "logs/fastqc/{read}.log"
    wrapper:
        "0.51.2/bio/fastqc"
......@@ -8,7 +8,10 @@ type: object
properties:
  samples:
    type: string
    description: path to the sample sheet (TSV)
  units:
    type: string
    description: path to the unit sheet (TSV)
  reads_dir:
    type: string
    description: directory containing the read files (*.fq)

# entries that have to be in the config file for successful validation
required:
  - samples
  - units
  - reads_dir
$schema: "http://json-schema.org/draft-04/schema#"
description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data
type: object
properties:
sample:
type: string
description: sample name/id the unit has been sequenced from
unit:
type: string
description: unit id
fq1:
type: string
description: path to FASTQ file
fq2:
type: string
description: path to second FASTQ file (leave empty in case of single-end)
required:
- sample
- unit
- fq1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment