common.smk 11.5 KB
Newer Older
AntonieV's avatar
AntonieV committed
1
2
3
4
5
from snakemake.utils import validate
import pandas as pd

# this container defines the underlying OS for each job when using the workflow
# with --use-conda --use-singularity
Antonie Vietor's avatar
Antonie Vietor committed
6
container: "docker://continuumio/miniconda3"
AntonieV's avatar
AntonieV committed
7
8
9
10
11
12

##### load config and sample sheets #####

configfile: "config/config.yaml"
validate(config, schema="../schemas/config.schema.yaml")

AntonieV's avatar
AntonieV committed
13
samples = pd.read_csv(config["samples"], sep="\t", dtype = str).set_index("sample", drop=False)
AntonieV's avatar
AntonieV committed
14
15
samples.index.names = ["sample_id"]
validate(samples, schema="../schemas/samples.schema.yaml")
AntonieV's avatar
AntonieV committed
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

units = pd.read_csv(
    config["units"], dtype=str, sep="\t").set_index(["sample", "unit"], drop=False)
units.index.names = ["sample_id", "unit_id"]
units.index = units.index.set_levels(
    [i.astype(str) for i in units.index.levels])  # enforce str in index
validate(units, schema="../schemas/units.schema.yaml")

report: "../report/workflow.rst"

##### wildcard constraints #####

wildcard_constraints:
    sample = "|".join(samples.index),
    unit = "|".join(units["unit"])

####### helpers ###########

def is_single_end(sample, unit):
    """Determine whether unit is single-end."""
    fq2_present = pd.isnull(units.loc[(sample, unit), "fq2"])
    if isinstance(fq2_present, pd.core.series.Series):
        # if this is the case, get_fastqs cannot work properly
        raise ValueError(
            f"Multiple fq2 entries found for sample-unit combination {sample}-{unit}.\n"
            "This is most likely due to a faulty units.tsv file, e.g. "
            "a unit name is used twice for the same sample.\n"
            "Try checking your units.tsv for duplicates."
        )
    return fq2_present

47
48
49
def get_individual_fastq(wildcards):
    """Get individual raw FASTQ files from unit sheet, based on a read (end) wildcard"""
    if ( wildcards.read == "0" or wildcards.read == "1" ):
AntonieV's avatar
AntonieV committed
50
        return units.loc[ (wildcards.sample, wildcards.unit), "fq1" ]
51
52
53
    elif wildcards.read == "2":
        return units.loc[ (wildcards.sample, wildcards.unit), "fq2" ]

Antonie Vietor's avatar
Antonie Vietor committed
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
def get_fastqs(wildcards):
    """Get raw FASTQ files from unit sheet."""
    if is_single_end(wildcards.sample, wildcards.unit):
        return units.loc[ (wildcards.sample, wildcards.unit), "fq1" ]
    else:
        u = units.loc[ (wildcards.sample, wildcards.unit), ["fq1", "fq2"] ].dropna()
        return [ f"{u.fq1}", f"{u.fq2}" ]

def is_control(sample):
    control = samples.loc[sample]["control"]
    return pd.isna(control) or pd.isnull(control)

def get_sample_control_peak_combinations_list():
    sam_contr = []
    for sample in samples.index:
        if not is_control(sample):
            sam_contr.extend(expand(["{sample}-{control}.{peak}"], sample = sample, control = samples.loc[sample]["control"], peak = config["params"]["peak-analysis"]))
    return sam_contr

def get_peaks_count_plot_input():
    return expand(
        "results/macs2_callpeak/peaks_count/{sam_contr_peak}.peaks_count.tsv",
        sam_contr_peak = get_sample_control_peak_combinations_list()
    )

def get_frip_score_input():
    return expand(
        "results/intersect/{sam_contr_peak}.peaks_frip.tsv",
        sam_contr_peak = get_sample_control_peak_combinations_list()
    )

def get_plot_macs_qc_input():
    return expand(
        "results/macs2_callpeak/{sam_contr_peak}_peaks.{peak}Peak",
        sam_contr_peak = get_sample_control_peak_combinations_list(), peak =config["params"]["peak-analysis"]
    )

def get_plot_homer_annotatepeaks_input():
    return expand("results/homer/annotate_peaks/{sam_contr_peak}_peaks.annotatePeaks.txt",
        sam_contr_peak = get_sample_control_peak_combinations_list()
    )

def get_map_reads_input(wildcards):
    if is_single_end(wildcards.sample, wildcards.unit):
        return "results/trimmed/{sample}-{unit}.fastq.gz"
    return ["results/trimmed/{sample}-{unit}.1.fastq.gz", "results/trimmed/{sample}-{unit}.2.fastq.gz"]

def get_read_group(wildcards):
    """Denote sample name and platform in read group."""
    return r"-R '@RG\tID:{sample}-{unit}\tSM:{sample}-{unit}\tPL:{platform}'".format(
        sample=wildcards.sample,
        unit=wildcards.unit,
        platform=units.loc[(wildcards.sample, wildcards.unit), "platform"])

108
109
110
111
112
113
def get_multiqc_input(wildcards):
    multiqc_input = []
    for (sample, unit) in units.index:
        reads = [ "1", "2" ]
        if is_single_end(sample, unit):
            reads = [ "0" ]
114
115
116
117
118
            multiqc_input.extend(expand (["logs/cutadapt/{sample}-{unit}.se.log"],
            sample = sample, unit = unit))
        else:
            multiqc_input.extend(expand (["logs/cutadapt/{sample}-{unit}.pe.log"],
            sample = sample, unit = unit))
119
120
121
122
123

        multiqc_input.extend(
            expand (
                [
                    "results/qc/fastqc/{sample}.{unit}.{reads}_fastqc.zip",
Antonie Vietor's avatar
Antonie Vietor committed
124
125
126
127
                    "results/qc/fastqc/{sample}.{unit}.{reads}.html",
                    "results/mapped/{sample}-{unit}.mapped.flagstat",
                    "results/mapped/{sample}-{unit}.mapped.idxstats",
                    "results/mapped/{sample}-{unit}.mapped.stats.txt"
128
129
130
131
132
                ],
                sample = sample,
                unit = unit,
                reads = reads
            )
AntonieV's avatar
multiqc    
AntonieV committed
133
        )
134
135
136
137
    for sample in samples.index:
        multiqc_input.extend(
            expand (
                [
Antonie Vietor's avatar
Antonie Vietor committed
138
139
140
141
142
143
144
145
146
                    "results/picard_dedup/{sample}.metrics.txt",
                    "results/picard_dedup/{sample}.picard_dedup.flagstat",
                    "results/picard_dedup/{sample}.picard_dedup.idxstats",
                    "results/picard_dedup/{sample}.picard_dedup.stats.txt",
                    "results/filtered/{sample}.filtered.flagstat",
                    "results/filtered/{sample}.filtered.idxstats",
                    "results/filtered/{sample}.filtered.stats.txt",
                    "results/orphan_rm_sorted/{sample}.orphan_rm_sorted.idxstats",
                    "results/orphan_rm_sorted/{sample}.orphan_rm_sorted.flagstat",
Antonie Vietor's avatar
Antonie Vietor committed
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
                    "results/orphan_rm_sorted/{sample}.orphan_rm_sorted.stats.txt",
                    "results/qc/multiple_metrics/{sample}.alignment_summary_metrics",
                    "results/qc/multiple_metrics/{sample}.base_distribution_by_cycle_metrics",
                    "results/qc/multiple_metrics/{sample}.base_distribution_by_cycle.pdf",
                    "results/qc/multiple_metrics/{sample}.insert_size_metrics",
                    "results/qc/multiple_metrics/{sample}.insert_size_histogram.pdf",
                    "results/qc/multiple_metrics/{sample}.quality_by_cycle_metrics",
                    "results/qc/multiple_metrics/{sample}.quality_by_cycle.pdf",
                    "results/qc/multiple_metrics/{sample}.quality_distribution_metrics",
                    "results/qc/multiple_metrics/{sample}.quality_distribution.pdf",
                    "results/deeptools/plot_profile_data.tab",
                    "results/phantompeakqualtools/{sample}.phantompeak.spp.out",
                    "results/phantompeakqualtools/{sample}.spp_correlation_mqc.tsv",
                    "results/phantompeakqualtools/{sample}.spp_nsc_mqc.tsv",
                    "results/phantompeakqualtools/{sample}.spp_rsc_mqc.tsv"
162
163
164
165
                ],
                sample = sample
            )
        )
Antonie Vietor's avatar
Antonie Vietor committed
166
167
168
169
170
171
172
173
174
175
176
177
178
        if not is_control(sample):
            multiqc_input.extend(
                expand (
                    [
                        "results/deeptools/{sample}-{control}.fingerprint_qcmetrics.txt",
                        "results/deeptools/{sample}-{control}.fingerprint_counts.txt",
                        "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.xls"
                    ],
                sample = sample,
                control = samples.loc[sample]["control"],
                peak = config["params"]["peak-analysis"]
            )
        )
Antonie Vietor's avatar
Antonie Vietor committed
179
180
        if config["params"]["lc_extrap"]:
                multiqc_input.extend( expand(["results/preseq/{sample}.lc_extrap"], sample = sample))
181
    return multiqc_input
AntonieV's avatar
AntonieV committed
182

Antonie Vietor's avatar
Antonie Vietor committed
183
184
185
186
187
def all_input(wildcards):

    wanted_input = []

    # QC with fastQC and multiQC
Antonie Vietor's avatar
Antonie Vietor committed
188
189
190
    wanted_input.extend([
        "results/qc/multiqc/multiqc.html"
    ])
Antonie Vietor's avatar
Antonie Vietor committed
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221

    # trimming reads
    for (sample, unit) in units.index:
        if is_single_end(sample, unit):
            wanted_input.extend(expand(
                    [
                        "results/trimmed/{sample}-{unit}.fastq.gz",
                        "results/trimmed/{sample}-{unit}.se.qc.txt"
                    ],
                    sample = sample,
                    unit = unit
                )
            )
        else:
            wanted_input.extend(
                expand (
                    [
                        "results/trimmed/{sample}-{unit}.1.fastq.gz",
                        "results/trimmed/{sample}-{unit}.2.fastq.gz",
                        "results/trimmed/{sample}-{unit}.pe.qc.txt"
                    ],
                    sample = sample,
                    unit = unit
            )
        )

    # mapping, merging and filtering bam-files
    for sample in samples.index:
        wanted_input.extend(
            expand (
                [
Antonie Vietor's avatar
Antonie Vietor committed
222
                    "results/IGV/big_wig/merged_library.bigWig.igv.txt",
Antonie Vietor's avatar
Antonie Vietor committed
223
224
225
226
227
228
229
230
                    "results/deeptools/plot_profile.pdf",
                    "results/deeptools/heatmap.pdf",
                    "results/deeptools/heatmap_matrix.tab",
                    "results/phantompeakqualtools/{sample}.phantompeak.pdf"
                ],
                sample = sample
            )
        )
Antonie Vietor's avatar
Antonie Vietor committed
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
        if not is_control(sample):
            wanted_input.extend(
                expand(
                    [
                        "results/deeptools/{sample}-{control}.plot_fingerprint.pdf",
                        "results/macs2_callpeak/{sample}-{control}.{peak}_treat_pileup.bdg",
                        "results/macs2_callpeak/{sample}-{control}.{peak}_control_lambda.bdg",
                        "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.{peak}Peak",
                        "results/IGV/macs2_callpeak/{peak}/merged_library.{sample}-{control}.{peak}_peaks.igv.txt",
                        "results/macs2_callpeak/plots/plot_{peak}_peaks_count.pdf",
                        "results/macs2_callpeak/plots/plot_{peak}_peaks_frip_score.pdf",
                        "results/macs2_callpeak/plots/plot_{peak}_peaks_macs2.pdf",
                        "results/homer/plots/plot_{peak}_annotatepeaks.pdf",
                        "results/homer/plots/plot_{peak}_annotatepeaks_summary.pdf"
                    ],
                    sample = sample,
                    control = samples.loc[sample]["control"],
                    peak = config["params"]["peak-analysis"]
                )
            )
            if config["params"]["peak-analysis"] == "broad":
                wanted_input.extend(
                    expand(
                        [
                            "results/macs2_callpeak/{sample}-{control}.{peak}_peaks.gappedPeak"
                        ],
                        sample = sample,
                        control = samples.loc[sample]["control"],
                        peak = config["params"]["peak-analysis"]
                    )
                )
            if config["params"]["peak-analysis"] == "narrow":
                wanted_input.extend(
                    expand(
                        [
                            "results/macs2_callpeak/{sample}-{control}.{peak}_summits.bed"
Antonie Vietor's avatar
Antonie Vietor committed
267

Antonie Vietor's avatar
Antonie Vietor committed
268
269
270
271
272
273
                        ],
                        sample = sample,
                        control = samples.loc[sample]["control"],
                        peak = config["params"]["peak-analysis"]
                    )
                )
Antonie Vietor's avatar
Antonie Vietor committed
274
    return wanted_input