Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
Aurélien Ginolhac
snakemake-chip-seq
Commits
f639b76c
Commit
f639b76c
authored
Apr 22, 2020
by
AntonieV
Browse files
fastqc integration in workflow
parent
394bdf80
Changes
9
Hide whitespace changes
Inline
Side-by-side
config/config.yaml
View file @
f639b76c
...
...
@@ -2,3 +2,7 @@
# In case of sample based data, it should be complemented by a samples.tsv file that contains
# one row per sample. It can be parsed easily via pandas.
samples
:
"
config/samples.tsv"
units
:
"
config/units.tsv"
# directory where the reads are
reads_dir
:
"
data/reads"
config/samples.tsv
View file @
f639b76c
sample condition
A untreated
B treated
sample condition batch_effect
A treated batch1
B untreated batch1
C treated batch2
D untreated batch2
config/units.tsv
0 → 100644
View file @
f639b76c
sample unit fragment_len_mean fragment_len_sd fq1 fq2
A 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
B 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
B 2 300 14 raw/b.chr21.1.fq
C 1 raw/a.chr21.1.fq raw/a.chr21.2.fq
D 1 raw/b.chr21.1.fq raw/b.chr21.2.fq
workflow/Snakefile
View file @
f639b76c
# The main entry point of
your
workflow.
# The main entry point of the workflow.
# After configuring, running snakemake -n in a clone of this repository should successfully execute a dry-run of the workflow.
from pathlib import Path
import glob
include: "rules/common.smk"
include: "rules/qc.smk"
report: "report/workflow.rst"
# Discover the read files: every "*.fq" file directly inside the configured
# reads directory contributes its stem (file name without the final ".fq").
path = glob.glob(config["reads_dir"]+"/*.fq")
reads_names = {Path(fq_file).stem for fq_file in path}
# Allow users to fix the underlying OS via singularity.
singularity: "docker://continuumio/miniconda3"
def all_input(wildcards):
wanted_input = []
rule all:
input:
# The first rule should define the default target files
# Subsequent target rules can be specified below. They should start with all_*.
wanted_input.extend(
expand (
[
"results/qc/fastqc/{filename}.fq_fastqc.zip",
"results/qc/fastqc/reports/{filename}.fq.html"
],
filename=reads_names
)
)
return wanted_input
include: "rules/common.smk"
include: "rules/other.smk"
rule all:
input: all_input
workflow/rules/common.smk
View file @
f639b76c
...
...
@@ -10,6 +10,44 @@ singularity: "docker://continuumio/miniconda3"
configfile: "config/config.yaml"
validate(config, schema="../schemas/config.schema.yaml")
samples = pd.read_csv(config["samples"], sep="\t").set_index("sample", drop=False)
samples = pd.read_csv(config["samples"], sep="\t"
, dtype = str
).set_index("sample", drop=False)
samples.index.names = ["sample_id"]
validate(samples, schema="../schemas/samples.schema.yaml")
# Load the unit sheet named in the config as a tab-separated table, reading
# every column as str, and index it by (sample, unit) while keeping both
# columns available as regular columns (drop=False).
units = pd.read_csv(
config["units"], dtype=str, sep="\t").set_index(["sample", "unit"], drop=False)
# Give the index levels their own names, distinct from the column names.
# NOTE(review): presumably to avoid column/index-level name clashes -- confirm.
units.index.names = ["sample_id", "unit_id"]
units.index = units.index.set_levels(
[i.astype(str) for i in units.index.levels]) # enforce str in index
# Check the loaded unit sheet against the units schema.
validate(units, schema="../schemas/units.schema.yaml")
report: "../report/workflow.rst"
##### wildcard constraints #####
wildcard_constraints:
sample = "|".join(samples.index),
unit = "|".join(units["unit"])
####### helpers ###########
def is_single_end(sample, unit):
    """Tell whether a sample/unit combination has no second FASTQ file.

    Looks up the ``fq2`` entry of ``units`` for ``(sample, unit)`` and
    reports whether it is null.

    Raises:
        ValueError: if the lookup yields several ``fq2`` entries instead of
            a single value, i.e. the sample/unit pair is not unique.
    """
    fq2_missing = pd.isnull(units.loc[(sample, unit), "fq2"])
    # A scalar answer means the (sample, unit) pair matched exactly one row.
    if not isinstance(fq2_missing, pd.core.series.Series):
        return fq2_missing
    # A Series means several rows matched; get_fastqs cannot work with that.
    raise ValueError(
        f"Multiple fq2 entries found for sample-unit combination {sample}-{unit}.\n"
        "This is most likely due to a faulty units.tsv file, e.g. "
        "a unit name is used twice for the same sample.\n"
        "Try checking your units.tsv for duplicates."
    )
def get_fastqs(wildcards):
    """Look up the raw FASTQ file(s) of a unit in the unit sheet.

    Uses ``wildcards.sample`` and ``wildcards.unit`` as the row key of
    ``units``. Returns the ``fq1`` entry alone for a single-end unit, and a
    two-element list ``[fq1, fq2]`` of strings otherwise.
    """
    row_key = (wildcards.sample, wildcards.unit)
    if not is_single_end(*row_key):
        # Paired-end: read both columns and format each entry as a string.
        mates = units.loc[row_key, ["fq1", "fq2"]].dropna()
        return [f"{mates.fq1}", f"{mates.fq2}"]
    return units.loc[row_key, "fq1"]
workflow/rules/other.smk
deleted
100644 → 0
View file @
394bdf80
# An example collection of Snakemake rules imported in the main Snakefile.
workflow/rules/qc.smk
0 → 100644
View file @
f639b76c
# Run FastQC on one read file via the Snakemake wrapper "0.51.2/bio/fastqc".
# The {read} wildcard names a "<read>.fq" file inside config["reads_dir"].
rule fastqc:
input:
# {{read}} is escaped so expand() fills in only the reads directory and
# leaves {read} as a rule wildcard.
expand("{path}/{{read}}.fq", path=config["reads_dir"])
output:
# HTML report and zipped FastQC results, one pair per read file.
html="results/qc/fastqc/reports/{read}.fq.html",
zip="results/qc/fastqc/{read}.fq_fastqc.zip"
# Empty string: no extra parameters are passed on.
params: ""
log:
"logs/fastqc/{read}.log"
wrapper:
"0.51.2/bio/fastqc"
workflow/schemas/config.schema.yaml
View file @
f639b76c
...
...
@@ -8,7 +8,10 @@ type: object
properties
:
samples
:
type
:
string
units
:
type
:
string
# entries that have to be in the config file for successful validation
required
:
-
samples
-
units
workflow/schemas/units.schema.yaml
0 → 100644
View file @
f639b76c
$schema
:
"
http://json-schema.org/draft-04/schema#"
description
:
row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data
type
:
object
properties
:
sample
:
type
:
string
description
:
sample name/id the unit has been sequenced from
unit
:
type
:
string
description
:
unit id
fq1
:
type
:
string
description
:
path to FASTQ file
fq2
:
type
:
string
description
:
path to second FASTQ file (leave empty in case of single-end)
required
:
-
sample
-
unit
-
fq1
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment