diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..e6a2389a13e110f2943c43d4dd4ef261bbcad0cf --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Aurélien Ginolhac + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/config/config.yaml b/config/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0c90bac9fec62a0e9c9a962cf556babcc4b5c07 --- /dev/null +++ b/config/config.yaml @@ -0,0 +1,47 @@ +#coding=utf-8 +# path or URL to sample sheet (TSV format, columns: sample, condition, ...) +samples: config/samples.tsv +# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2, +# strandedness). Units are technical replicates (e.g. lanes, or resequencing of the +# same biological sample).If the column "strandedness" is present (which is optional), +# can be empty or has one of these values: none, yes or reverse. none is for unstranded +# protocols, yes an reverse follow the nomenclature used in `htseq-count --reverse` +# which is referenced in STAR manual section 7, "Counting number of reads per gene". + +units: config/units.tsv + +# Adapter Removal +trimming: + # skip trimming: false or true + skip: false + threads: 2 + # Adapter from https://emea.support.illumina.com/bulletins/2016/12/what-sequences-do-i-use-for-adapter-trimming.html + adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCA + adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT + min_length: 35 + +ref: + species: "saccharomyces_cerevisiae" + build: "R64-1-1" + release: "99" + +pca: + labels: + # columns of sample sheet to use for PCA + - condition + +diffexp: + contrasts: + # contrasts for the deseq2 results method + # write down EFFECT_vs_CONTROL: + # - FULLNAME_CONTROL + # - FULLNAME_EFFECT + KO_vs_WT: + - WT + - KO + + +params: + star: " --twopassMode Basic --outSAMtype BAM SortedByCoordinate --limitOutSJcollapsed 1000000 --limitSjdbInsertNsj 1000000 --outFilterMultimapNmax 100 --outFilterMismatchNmax 33 --outFilterMismatchNoverLmax 0.3 --seedSearchStartLmax 12 --alignSJoverhangMin 15 --alignEndsType Local --outFilterMatchNminOverLread 0 --outFilterScoreMinOverLread 0.3 --winAnchorMultimapNmax 50 --alignSJDBoverhangMin 3" + bowtie_path: "/usr/local/bin/" + db_bowtie_path: "/scratch/users/aginolhac/FastQ_Screen_Genomes/" diff --git a/config/samples.tsv b/config/samples.tsv new file mode 100644 index 0000000000000000000000000000000000000000..f7021a01f70d1aba28ca86e27991032a776eabc4 --- /dev/null +++ b/config/samples.tsv @@ -0,0 +1,5 @@ +sample condition +WT_A WT +WT_B WT +KO_A KO +KO_C KO diff --git a/config/units.tsv b/config/units.tsv new file mode 100644 index 0000000000000000000000000000000000000000..bb3706fa63b76df30259f23cd39a11c635bbe747 --- /dev/null +++ b/config/units.tsv @@ -0,0 +1,6 @@ +sample unit fq1 fq2 strandedness +WT_A 1 fastq/BY_A.fastq.gz reverse +WT_A 2 fastq/BY_A-2.fastq.gz reverse +WT_B 1 fastq/BY_B.fastq.gz reverse +KO_A 1 fastq/BY_Gis1_A.fastq.gz reverse +KO_C 1 fastq/BY_Gis1_C.fastq.gz reverse diff --git a/workflow/Snakefile b/workflow/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..8034cf777f8258df2807cdf0f563ff6df5d71087 --- /dev/null +++ b/workflow/Snakefile @@ -0,0 +1,28 @@ + +container: "docker://ginolhac/snake-rna-seq:0.5" + +##### load rules ##### +include: "rules/common.smk" +include: "rules/refs.smk" +include: "rules/trim.smk" +include: "rules/align.smk" +include: "rules/diffexp.smk" +include: "rules/qc.smk" + +rule all: + log: + "logs/all.log" + input: + expand(["results/diffexp/{contrast}.diffexp.tsv", + "results/diffexp/{contrast}.ma-plot.pdf"], + contrast=config["diffexp"]["contrasts"]), + "results/pca.pdf", + "qc/multiqc_report.html", + # not all combinations, just the sample-unit pairs + expand("qc/fastqc/{unit.sample}-{unit.unit}.html", unit=units.itertuples()), + expand("qc/fastq_screen/{unit.sample}-{unit.unit}.fastq_screen.txt", unit=units.itertuples()), + "results/session.txt" + +##### setup report ##### +report: "report/workflow.rst" + diff --git a/workflow/report/diffexp.rst b/workflow/report/diffexp.rst new file mode 100644 index 0000000000000000000000000000000000000000..baf5b0b7476257b3dc5c34ecf3c6cab0d92ae764 --- /dev/null +++ b/workflow/report/diffexp.rst @@ -0,0 +1 @@ +table sorted by adjusted-pvalues (Benjamini-Hochberg) diff --git a/workflow/report/ma.rst b/workflow/report/ma.rst new file mode 100644 index 0000000000000000000000000000000000000000..ceeada2e8d8763cdf3b3bb69ed59630828c1cee7 --- /dev/null +++ b/workflow/report/ma.rst @@ -0,0 +1 @@ +MA-plot obtained after schrinkage of logFC by apeglm. diff --git a/workflow/report/pc.rst b/workflow/report/pc.rst new file mode 100644 index 0000000000000000000000000000000000000000..a091013c591500b1fb1f8dc25b00e1009158a389 --- /dev/null +++ b/workflow/report/pc.rst @@ -0,0 +1 @@ +plot normalized counts for QC, please ensure the contrast directionality is correct diff --git a/workflow/report/pca.rst b/workflow/report/pca.rst new file mode 100644 index 0000000000000000000000000000000000000000..0c456ef7bdd985d9f04f3956907965dbc80c32e4 --- /dev/null +++ b/workflow/report/pca.rst @@ -0,0 +1 @@ +PCA of the 500 most variable genes diff --git a/workflow/report/session.rst b/workflow/report/session.rst new file mode 100644 index 0000000000000000000000000000000000000000..052906010e2781642b28566e4af7639f82789e61 --- /dev/null +++ b/workflow/report/session.rst @@ -0,0 +1 @@ +Versions of tools used diff --git a/workflow/report/volcano.rst b/workflow/report/volcano.rst new file mode 100644 index 0000000000000000000000000000000000000000..ce941a0d7290acdacfd38565f2d3b14998d9999c --- /dev/null +++ b/workflow/report/volcano.rst @@ -0,0 +1,2 @@ +red dots are for adjusted-pval < 0.05 and abs(logFC) >= 1. +gene names are reported for -log10(adjusted-pval) > 10 and abs(logFC) >= 3 diff --git a/workflow/report/workflow.rst b/workflow/report/workflow.rst new file mode 100644 index 0000000000000000000000000000000000000000..e6eeb4730831b443869378eb9c50e5c0043f5383 --- /dev/null +++ b/workflow/report/workflow.rst @@ -0,0 +1,3 @@ +This `workflow template <https://gitlab.lcsb.uni.lu/aurelien.ginolhac/snakemake-rna-seq>` is derived from Johannes Koester `rnaseq-deseq2 <https://github.com/snakemake-workflows/rna-seq-star-deseq2>`_ Snakemake template. +However, reads trimmed by `AdapterRemoval <https://github.com/MikkelSchubert/adapterremoval>`_ and read counting performed by `Rsubread <https://bioconductor.org/packages/release/bioc/html/Rsubread.html>`_. + diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk new file mode 100644 index 0000000000000000000000000000000000000000..de6c9c8047af60223f7df1986f522e5cbb09af8b --- /dev/null +++ b/workflow/rules/align.smk @@ -0,0 +1,40 @@ + +rule star_index: + input: + fasta = "refs/{}.fasta".format(config["ref"]["build"]), + gtf = "refs/{}.gtf".format(config["ref"]["build"]) + output: + directory("refs/{}".format(config["ref"]["build"])) + message: + "STAR indexing {}".format(config["ref"]["build"]) + threads: + 12 + params: + build = config["ref"]["build"], + extra = "" + log: + "logs/star_index_{}.log".format(config["ref"]["build"]) + wrapper: + "0.65.0/bio/star/index" + +rule align_multi: + input: + # this function is called as many samples are provided in output + # technical replicates are merged in STAR + fq1 = get_fq1, + fq2 = get_fq2, # optional, return none if not in units.tsv + index = rules.star_index.output + output: + "star/{sample}/Aligned.sortedByCoord.out.bam", + "star/{sample}/ReadsPerGene.out.tab" + log: + "logs/star/{sample}.log" + params: + # path to STAR reference build index + build = config["ref"]["build"], + index = rules.star_index.output, + # optional parameters + extra = "--quantMode GeneCounts {}".format(config["params"]["star"]) + threads: 12 + wrapper: + "0.65.0/bio/star/align" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk new file mode 100644 index 0000000000000000000000000000000000000000..9a69712bd3c6285aba84c341b2a0ccc4e65ba0ed --- /dev/null +++ b/workflow/rules/common.smk @@ -0,0 +1,123 @@ +import pandas as pd +import numpy as np +from snakemake.utils import validate, min_version +##### set minimum snakemake version ##### +min_version("5.20.1") # for singularity binds +from snakemake.utils import validate, min_version + +##### load config and sample sheets ##### +configfile: "config/config.yaml" +validate(config, schema="../schemas/config.schema.yaml") + +samples = pd.read_table(config["samples"]).set_index("sample", drop=False) +validate(samples, schema="../schemas/samples.schema.yaml") + +units = pd.read_table(config["units"], dtype=str).set_index(["sample", "unit"], drop=False) +units.index = units.index.set_levels([i.astype(str) for i in units.index.levels]) # enforce str in index +validate(units, schema="../schemas/units.schema.yaml") + +# check if contrast condition in samples and config match +conditions = np.unique(samples["condition"].values).tolist() +for key, val in config["diffexp"]["contrasts"].items(): + for cond in val: + assert cond.find('-') == -1, "dashes found in {}, not allowed since later converted to underscores by DESeq2".format(cond) + assert cond in conditions, "condition {} in config.yaml don't match in samples.yaml".format(cond) + assert len(key.split("_vs_")) == 2, "contrasts in config.yaml must in the form XX_vs_YY (provided: {})".format(key) + for cond in key.split("_vs_"): + assert cond in conditions, "conditions in headers must be present in samples.yaml ({} missing)".format(cond) + + +# check if user didn't change the headers and/or add extra columns +assert units.columns.values.flatten().tolist() == ['sample', 'unit', 'fq1', 'fq2', 'strandedness'], "config/units.tsv columns header must be 'sample', 'unit', 'fq1', 'fq2', 'strandedness'" +assert samples.columns.values.flatten().tolist() == ['sample', 'condition'], "config/samples.tsv columns header must be 'sample', 'condition'" + +assert len(np.unique(units["fq1"].values).tolist()) == len(units), "In units.tsv, some fq1 path are duplicated" +# check that sample are all different when not technical replicates +sample_dup=units[units.duplicated(['sample'], keep=False)] +assert len(np.unique(sample_dup["unit"].values).tolist()) == len(sample_dup), "sample {} in units.tsv is duplicated".format(sample_dup["sample"].values.tolist()) +# fq2 should NOT be one of the fq1 +assert not units["fq2"].isin(units["fq1"]).values.any(), "fq2 files cannot be found in fq1" + +# check if sample in units.tsv and samples.tsv are coherent +for usp in np.unique(units["sample"].values).tolist(): + assert usp in np.unique(samples["sample"].values).tolist(), "sample {} in units.tsv is not samples.tsv".format(usp) +for ssp in np.unique(samples["sample"].values).tolist(): + assert ssp in np.unique(units["sample"].values).tolist(), "sample {} in samples.tsv is not units.tsv".format(ssp) +# check for paired-end that FASTQ are different +for usp in np.unique(units["sample"].values).tolist(): + fq2=units.loc[usp, ["fq2"]].values.flatten().tolist() + if not fq2 is None: + assert fq2 != units.loc[usp, ["fq1"]].values.flatten().tolist(), "sample {} in units.tsv have identical fq1 and fq2 files!".format(usp) + +def get_settings(sample, unit): + if config["trimming"]["skip"]: + # no trimming, don't look for setting files + # return an empty list + return [] + else: + if not is_single_end(sample): + # paired-end sample + return expand("trimmed_pe/{sample}-{unit}.settings", sample=sample, unit=unit) + # single end sample + return "trimmed_se/{sample}-{unit}.settings".format(sample=sample, unit=unit) + +def get_raw_fq(wildcards): + # no trimming, use raw reads + return units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna() + +def get_strandness(units): + if "strandedness" in units.columns: + return units["strandedness"].tolist() + else: + strand_list=["none"] + return strand_list*units.shape[0] + +def get_deseq2_threads(wildcards=None): + # https://twitter.com/mikelove/status/918770188568363008 + few_coeffs = False if wildcards is None else len(get_contrast(wildcards)) < 10 + return 1 if len(samples) < 100 or few_coeffs else 6 + +def get_contrast(wildcards): + return config["diffexp"]["contrasts"][wildcards.contrast] + + +def is_single_end(sample): + fq2=pd.isnull(units.loc[(sample), ["fq2"]].values.flatten().tolist()) + if len(fq2) > 1: + assert len(np.unique(fq2)) == 1, "units per sample are expected to of same seq type (se/pe)" + return np.unique(fq2) + + +def get_fastq(wildcards): + fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna() + if len(fastqs) == 2: + return {"fq1": fastqs.fq1, "fq2": fastqs.fq2} + return {"fq1": fastqs.fq1} + + +# return list of technical replicates (units) for fq1 if present +def get_fq1(sample): + if config["trimming"]["skip"]: + # no trimming, use raw reads + return units.loc[(sample), ["fq1"]].values.flatten().tolist() + else: + # yes trimming, use trimmed data + if not is_single_end(sample): + # paired-end sample + return expand("trimmed_pe/{sample}-{unit}.1.fastq.gz", sample=sample, unit=units.loc[(sample), ["unit"]].values.flatten()) + # single end sample, return only 1 file using pure python format + return expand("trimmed_se/{sample}-{unit}.fastq.gz", sample=sample, unit=units.loc[(sample), ["unit"]].values.flatten()) + +# return list of technical replicates (units) for fq1 if present +def get_fq2(sample): + if config["trimming"]["skip"]: + # no trimming, use raw reads + return units.loc[(sample), ["fq2"]].values.flatten().tolist() + else: + # yes trimming, use trimmed data + if not is_single_end(sample): + # paired-end sample + return expand("trimmed_pe/{sample}-{unit}.2.fastq.gz", sample=sample, unit=units.loc[(sample), ["unit"]].values.flatten()) + # single end sample, return None + return [] + diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk new file mode 100644 index 0000000000000000000000000000000000000000..f2182b074c9841241be5522da7f2dd89d4b92820 --- /dev/null +++ b/workflow/rules/diffexp.smk @@ -0,0 +1,64 @@ + +rule feature_count: + input: + bams = expand("star/{sample.sample}/Aligned.sortedByCoord.out.bam", sample=samples.itertuples()) + output: + fc = "counts/fc.rds", + stat = "counts/fc_stats.summary" + threads: + 8 + log: + "logs/featureCounts.log" + params: + annotation = "refs/{}.gtf".format(config["ref"]["build"]), + strandness = get_strandness(units), + # convert array boolean to binary + se = [is_single_end(sample.sample).astype(int) for sample in samples.itertuples()] + script: + "../scripts/featureCounts.R" + +rule deseq2_init: + input: + counts="counts/fc.rds" + output: + "deseq2/all.rds" + params: + samples=config["samples"] + log: + "logs/deseq2/init.log" + threads: get_deseq2_threads() + script: + "../scripts/deseq2-init.R" + + +rule pca: + input: + "deseq2/all.rds" + output: + report("results/pca.pdf", category = "PCA", caption = "../report/pca.rst") + params: + pca_labels=config["pca"]["labels"] + log: + "logs/pca.log" + script: + "../scripts/plot-pca.R" + + +rule deseq2: + input: + "deseq2/all.rds" + output: + table = report("results/diffexp/{contrast}.diffexp.tsv", category = "Differential Expression", caption = "../report/diffexp.rst"), + ma_plot = report("results/diffexp/{contrast}.ma-plot.pdf", category = "Differential Expression", caption = "../report/ma.rst"), + plot_count = report("results/diffexp/{contrast}.plot-count.pdf", category = "Differential Expression", caption = "../report/pc.rst"), + volcano = report("results/diffexp/{contrast}.volcano.pdf", category = "Differential Expression", caption = "../report/volcano.rst") + params: + contrast="{contrast}" + message: + "performing contrast using apeglm logFC shrinkage" + log: + "logs/deseq2/{contrast}.diffexp.log" + threads: get_deseq2_threads() + script: + "../scripts/deseq2.R" + diff --git a/workflow/rules/peaks.smk b/workflow/rules/peaks.smk new file mode 100644 index 0000000000000000000000000000000000000000..80dd6be5cb08f977310a7407ab43fc56d339938f --- /dev/null +++ b/workflow/rules/peaks.smk @@ -0,0 +1,12 @@ +rule callpeaks: + input: + "mapped/{sample}.bam" + output: + "results/callpeaks/{sample}_peaks.bed" + log: + "logs/callpeaks/{sample}_gopeaks.json" + params: + igg = get_igg, + params = callpeaks_params + shell: + "gopeaks -b {input[0]} {params.igg} -o results/callpeaks/{wildcards.sample} {params.params} > {log} 2>&1" diff --git a/workflow/rules/qc.smk b/workflow/rules/qc.smk new file mode 100644 index 0000000000000000000000000000000000000000..3f925fdee1bdec0887d4b8e9cdf35344a9a289ca --- /dev/null +++ b/workflow/rules/qc.smk @@ -0,0 +1,87 @@ + +rule multiqc: + input: + [get_settings(unit.sample, unit.unit) for unit in units.itertuples()], + expand("qc/fastqc/{unit.sample}-{unit.unit}_fastqc.zip", unit=units.itertuples()), + expand("qc/fastq_screen/{unit.sample}-{unit.unit}.fastq_screen.txt", unit=units.itertuples()), + expand("star/{sample.sample}/Aligned.sortedByCoord.out.bam", sample=samples.itertuples()), + "counts/fc_stats.summary" + output: + "qc/multiqc_report.html" + log: + "logs/multiqc.log" + wrapper: + "0.65.0/bio/multiqc" + +rule fastqc: + input: + sample=get_raw_fq + output: + html="qc/fastqc/{sample}-{unit}.html", + zip="qc/fastqc/{sample}-{unit}_fastqc.zip" # the suffix _fastqc.zip is necessary for multiqc to find the file. If not using multiqc, you are free to choose an arbitrary filename + group: "raw_qc" + params: "--quiet" + log: + "logs/fastqc/{sample}-{unit}.log" + wrapper: + "v1.7.0/bio/fastqc" + +rule fastq_screen: + input: + sample=get_raw_fq + output: + txt="qc/fastq_screen/{sample}-{unit}.fastq_screen.txt", + png="qc/fastq_screen/{sample}-{unit}.fastq_screen.png" + group: "raw_qc" + log: + "logs/fastq_screen/{sample}-{unit}.log" + params: + fastq_screen_config = { + 'database': { + 'human': { + 'bowtie2': "{}/Human/Homo_sapiens.GRCh38".format(config["params"]["db_bowtie_path"])}, + 'mouse': { + 'bowtie2': "{}/Mouse/Mus_musculus.GRCm38".format(config["params"]["db_bowtie_path"])}, + 'rat':{ + 'bowtie2': "{}/Rat/Rnor_6.0".format(config["params"]["db_bowtie_path"])}, + 'drosophila':{ + 'bowtie2': "{}/Drosophila/BDGP6".format(config["params"]["db_bowtie_path"])}, + 'worm':{ + 'bowtie2': "{}/Worm/Caenorhabditis_elegans.WBcel235".format(config["params"]["db_bowtie_path"])}, + 'yeast':{ + 'bowtie2': "{}/Yeast/Saccharomyces_cerevisiae.R64-1-1".format(config["params"]["db_bowtie_path"])}, + 'arabidopsis':{ + 'bowtie2': "{}/Arabidopsis/Arabidopsis_thaliana.TAIR10".format(config["params"]["db_bowtie_path"])}, + 'ecoli':{ + 'bowtie2': "{}/E_coli/Ecoli".format(config["params"]["db_bowtie_path"])}, + 'rRNA':{ + 'bowtie2': "{}/rRNA/GRCm38_rRNA".format(config["params"]["db_bowtie_path"])}, + 'MT':{ + 'bowtie2': "{}/Mitochondria/mitochondria".format(config["params"]["db_bowtie_path"])}, + 'PhiX':{ + 'bowtie2': "{}/PhiX/phi_plus_SNPs".format(config["params"]["db_bowtie_path"])}, + 'Lambda':{ + 'bowtie2': "{}/Lambda/Lambda".format(config["params"]["db_bowtie_path"])}, + 'vectors':{ + 'bowtie2': "{}/Vectors/Vectors".format(config["params"]["db_bowtie_path"])}, + 'adapters':{ + 'bowtie2': "{}/Adapters/Contaminants".format(config["params"]["db_bowtie_path"])}, + 'mycoplasma':{ + 'bowtie2': "{}/Mycoplasma/mycoplasma".format(config["params"]["db_bowtie_path"])} + }, + 'aligner_paths': {'bowtie2': "{}/bowtie2".format(config["params"]["bowtie_path"])} + }, + subset=100000, + aligner='bowtie2' + threads: 8 + wrapper: + "0.65.0/bio/fastq_screen" + +rule session: + output: + report("results/session.txt", category = "Versions", caption = "../report/session.rst") + log: + "logs/session.log" + script: + "../scripts/session.R" + diff --git a/workflow/rules/refs.smk b/workflow/rules/refs.smk new file mode 100644 index 0000000000000000000000000000000000000000..4abd5201e8cd9dd6354065611e9ace6d7c6fa335 --- /dev/null +++ b/workflow/rules/refs.smk @@ -0,0 +1,27 @@ +build = config["ref"]["build"] + +rule get_genome: + output: + "refs/{build}.fasta" + params: + species=config["ref"]["species"], + datatype="dna", + build=config["ref"]["build"], + release=config["ref"]["release"] + log: + "logs/genome_{build}.log" + wrapper: + "0.49.0/bio/reference/ensembl-sequence" + +rule get_annotation: + output: + "refs/{build}.gtf" + params: + species=config["ref"]["species"], + build=config["ref"]["build"], + release=config["ref"]["release"], + fmt="gtf" + log: + "logs/annotation_{build}.log" + wrapper: + "0.49.0/bio/reference/ensembl-annotation" diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk new file mode 100644 index 0000000000000000000000000000000000000000..d6d091d5a7446b831a929e81c581ab92264be1ee --- /dev/null +++ b/workflow/rules/trim.smk @@ -0,0 +1,50 @@ + +# adapted from https://github.com/UofABioinformaticsHub/RNAseq_snakemake/blob/master/snakefile.py +rule adapter_removal_pe: + input: + # unpack to convert the returned dict to a list + unpack(get_fastq) + output: + fq1="trimmed_pe/{sample}-{unit}.1.fastq.gz", + fq2="trimmed_pe/{sample}-{unit}.2.fastq.gz", + single="trimmed_pe/{sample}-{unit}.singletons.gz", + discarded="trimmed_pe/{sample}-{unit}.discarded.gz", + settings="trimmed_pe/{sample}-{unit}.settings" + params: + min_len = config["trimming"]["min_length"], + adapter1 = config["trimming"]["adapter1"], + adapter2 = config["trimming"]["adapter2"] + threads: config["trimming"]["threads"] + log: + "logs/adapterremoval/{sample}-{unit}.log" + shell: + "AdapterRemoval --file1 {input.fq1} --file2 {input.fq2} " + "--output1 {output.fq1} --output2 {output.fq2} " + "--singleton {output.single} " + "--settings {output.settings} --discarded {output.discarded} " + "--adapter1 {params.adapter1} --adapter2 {params.adapter2} " + "--threads {threads} --gzip --trimqualities --trimns " + "--minlength {params.min_len}" +# adapted from https://github.com/UofABioinformaticsHub/RNAseq_snakemake/blob/master/snakefile.py +rule adapter_removal_se: + input: + unpack(get_fastq) + output: + fq1="trimmed_se/{sample}-{unit}.fastq.gz", + settings="trimmed_se/{sample}-{unit}.settings", + discarded="trimmed_se/{sample}-{unit}.discarded.gz" + params: + min_len = config["trimming"]["min_length"], + adapter1 = config["trimming"]["adapter1"], + threads: config["trimming"]["threads"] + log: + "logs/adapterremoval/{sample}-{unit}.log" + shell: + "AdapterRemoval --file1 {input.fq1} " + "--output1 {output.fq1} " + "--settings {output.settings} " + "--discarded {output.discarded} " + "--adapter1 {params.adapter1} " + "--threads {threads} " + "--gzip --trimqualities --trimns " + "--minlength {params.min_len}" diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2c6739e019ad0fd6ebd1ea16ecfc3dd247109a3 --- /dev/null +++ b/workflow/schemas/config.schema.yaml @@ -0,0 +1,87 @@ +$schema: "http://json-schema.org/draft-06/schema#" + +description: snakemake configuration file + +type: object + +properties: + samples: + type: string + units: + type: string + trimming: + type: object + properties: + skip: + type: boolean + threads: + type: integer + min_length: + type: integer + adapter1: + type: string + pattern: "^[ACGTN]+$" + adapter2: + type: string + pattern: "^[ACGT]+$" + required: + - skip + - threads + - min_length + - adapter1 + - adapter2 + + ref: + type: object + properties: + species: + type: string + build: + type: string + release: + type: string + required: + - species + - build + - release + + pca: + type: object + properties: + labels: + type: array + items: + type: string + required: + - labels + + diffexp: + type: object + properties: + contrasts: + type: object + required: + - contrasts + + params: + type: object + properties: + star: + type: string + bowtie_path: + type: string + db_bowtie_path: + type: string + required: + - star + - bowtie_path + - db_bowtie_path + +required: + - samples + - units + - trimming + - ref + - pca + - diffexp + - params diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9690ff5d7dfe55307d37387b6f82106d9a298e75 --- /dev/null +++ b/workflow/schemas/samples.schema.yaml @@ -0,0 +1,14 @@ +$schema: "http://json-schema.org/draft-06/schema#" + +description: an entry in the sample sheet +properties: + sample: + type: string + description: sample name/identifier + condition: + type: string + description: sample condition that will be compared during differential expression analysis (e.g. a treatment, a tissue time, a disease) + +required: + - sample + - condition diff --git a/workflow/schemas/units.schema.yaml b/workflow/schemas/units.schema.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac504748791c342f1e9a5587fad0a0cfe3fff9c6 --- /dev/null +++ b/workflow/schemas/units.schema.yaml @@ -0,0 +1,24 @@ +$schema: "http://json-schema.org/draft-06/schema#" +description: row of the units.tsv, representing a sequencing unit, i.e. single-end or paired-end data +type: object +properties: + sample: + type: string + description: sample name/id the unit has been sequenced from + unit: + type: string + description: unit id + fq1: + type: string + description: path to FASTQ file + fq2: + type: string + description: path to second FASTQ file (leave empty in case of single-end) + strandedness: + type: string + description: one of the values 'none', 'yes' or 'reverse' according to protocol strandedness +required: + - sample + - unit + - fq1 + - strandedness diff --git a/workflow/scripts/deseq2-init.R b/workflow/scripts/deseq2-init.R new file mode 100644 index 0000000000000000000000000000000000000000..c9ae9c74ab53ac7d5d7aae643b0778361b188402 --- /dev/null +++ b/workflow/scripts/deseq2-init.R @@ -0,0 +1,43 @@ +log <- file(snakemake@log[[1]], open = "wt") +sink(log) +sink(log, type = "message") + +suppressPackageStartupMessages(library("DESeq2")) +message(paste("log is", log)) +parallel <- FALSE +if (snakemake@threads > 1) { + library("BiocParallel") + # setup parallelization + register(MulticoreParam(snakemake@threads)) + parallel <- TRUE +} + +# colData and countData must have the same sample order, but this is ensured +# by the way we create the count matrix +fc <- readRDS(snakemake@input[["counts"]]) +cts <- fc$counts +coldata <- read.table(snakemake@params[["samples"]], header = TRUE, + row.names = "sample", stringsAsFactors = TRUE) + +message("coldata is ", coldata) + +# DESeq2 check that rownames of coldata and counts column names matches +colnames(cts) <- stringr::str_split_fixed(colnames(cts), pattern = "/", n = 3)[, 2] + +dds <- DESeqDataSetFromMatrix(countData = cts, + colData = coldata, + design = ~condition) +# annotate as in https://support.bioconductor.org/p/62374/ Mike Love +stopifnot(all(rownames(dds) == fc$annotation$GeneID)) +mcols(dds) <- cbind(mcols(dds), gene_name = fc$annotation$gene_name) + +dds <- estimateSizeFactors(dds) +# remove uninformative genes with too few counts across columns +dds <- dds[rowSums(counts(dds, normalized = FALSE)) > 10, ] +# normalization and preprocessing +dds <- DESeq(dds, parallel = parallel) + +message("resultsNames") +resultsNames(dds) + +saveRDS(dds, file = snakemake@output[[1]]) diff --git a/workflow/scripts/deseq2.R b/workflow/scripts/deseq2.R new file mode 100644 index 0000000000000000000000000000000000000000..1b262adbaa7cf4aa8e2e57d09b8695e997da7520 --- /dev/null +++ b/workflow/scripts/deseq2.R @@ -0,0 +1,83 @@ +log <- file(snakemake@log[[1]], open = "wt") +sink(log) +sink(log, type = "message") + +library(tibble) +suppressPackageStartupMessages(library(ggplot2)) +suppressPackageStartupMessages(library(DESeq2)) + +parallel <- FALSE +if (snakemake@threads > 1) { + library("BiocParallel") + # setup parallelization + register(MulticoreParam(snakemake@threads)) + parallel <- TRUE +} + +dds <- readRDS(snakemake@input[[1]]) + +my_coef <- paste0("condition", "_", snakemake@params[["contrast"]]) + +if (stringr::str_detect(my_coef, "_vs_", negate = TRUE)) { + stop(paste("contrast", snakemake@params[["contrast"]], "does not contain _vs_"), + call. = FALSE + ) +} +print(paste("orginal contrast", my_coef)) +print(paste("available contrast", resultsNames(dds))) +# if we don't have the right contrast +if (!my_coef %in% resultsNames(dds)) { + # relevel by the desired reference + my_ref <- stringr::str_split(my_coef, "_vs_", simplify = TRUE)[1, 2] + dds$condition <- relevel(dds$condition, my_ref) + # redo the test as https://support.bioconductor.org/p/123247/ + dds <- nbinomWaldTest(dds, quiet = TRUE) + # now it should work, otherwise fail + stopifnot(my_coef %in% resultsNames(dds)) +} +print(my_coef) +res <- results(dds, name = my_coef, parallel = parallel) +# shrink fold changes for low expressed genes +res <- lfcShrink(dds, coef = my_coef, res = res, type = "apeglm") + +# annotate with gene names +stopifnot(all(row.names(res) == row.names(rowData(dds)))) +res$gene_name <- rowData(dds)$gene_name +# sort by p-value +res <- res[order(res$padj), ] +# TODO explore IHW usage +#http://bioconductor.org/packages/release/bioc/vignettes/IHW/inst/doc/introduction_to_ihw.html + + +# volcano +res_tib <- rownames_to_column(as.data.frame(res), var = "gene_id") +volcano <- ggplot(res_tib, aes(x = log2FoldChange, y = -log10(padj))) + + geom_hex(bins = 90) + + geom_point(data = subset(res_tib, !is.na(padj) & padj < 0.1 & abs(log2FoldChange) >= 1), + colour = "tomato1") + + ggrepel::geom_text_repel(data = subset(res_tib, !is.na(padj) & !is.na(gene_name) & abs(log2FoldChange) >= 3 & -log10(padj) > 10), + colour = "tomato3", aes(label = gene_name)) + + scale_fill_gradient(low = "grey80", high = "grey30") + + theme_minimal(14) + + labs(y = bquote(-log[10] * " ("*italic(padj)*")"), + title = snakemake@params[["contrast"]]) +ggsave(snakemake@output[["volcano"]], volcano) + +# store results +pdf(snakemake@output[["ma_plot"]]) +plotMA(res, ylim = c(-2, 2), main = snakemake@params[["contrast"]]) +dev.off() +write.table(res_tib, + file = snakemake@output[["table"]], + sep = "\t", row.names = FALSE, quote = FALSE +) +# also for QC a top plotCounts to ensure directionality +pc_df <- plotCounts(dds, row.names(res)[1], + intgroup = "condition", + returnData = TRUE +) +pc <- ggplot(pc_df, aes(x = condition, y = count)) + + geom_point(position = position_jitter(width = 0.1, height = 0)) + + labs(title = paste("gene name:", res[1, "gene_name"]), + subtitle = my_coef) +ggsave(snakemake@output[["plot_count"]], pc) diff --git a/workflow/scripts/featureCounts.R b/workflow/scripts/featureCounts.R new file mode 100644 index 0000000000000000000000000000000000000000..41d1e40bf4451a88005574872772ad097ca73aef --- /dev/null +++ b/workflow/scripts/featureCounts.R @@ -0,0 +1,73 @@ +log <- file(snakemake@log[[1]], open = "wt") +sink(log) +sink(log, type = "message") + +suppressPackageStartupMessages(library("Rsubread")) + +bams <- snakemake@input[["bams"]] +annot <- snakemake@params[["annotation"]] +# se should come as 0 and 1 +se <- as.logical(snakemake@params[["se"]]) +# se should be boolean of is each BAM a single-end +stopifnot(is.logical(se)) +stopifnot(length(bams) == length(se)) + +strand <- unique(snakemake@params[["strandness"]]) +if (length(strand) > 1) stop("cannot mix strandeness", call. = FALSE) +if (strand == "reverse") { + strand <- 2L +} else if (strand == "yes") { + strand <- 1L +} else { + strand <- 0L +} + +message("strandeness is ", strand) +data.frame( + single_end = se, + bams = bams +) + +run_fc <- function(files = NULL, PE = NULL) { + featureCounts( + files = files, + annot.ext = annot, + nthreads = snakemake@threads, + minMQS = 30, + isPairedEnd = PE, + isGTFAnnotationFile = TRUE, GTF.featureType = "exon", + GTF.attrType.extra = "gene_name", + GTF.attrType = "gene_id", + strandSpecific = strand + ) # libs are reverse stranded +} + +# we have all single-end +if (sum(se) == length(bams)) { + fc <- run_fc(files = bams[se], PE = FALSE) +} else if (sum(!se) == length(bams)) { + # or all paired-end + fc <- run_fc(files = bams[!se], PE = TRUE) +} else { + # or a mix that need to be merged + message("Deal with a mix of SE/PE") + fc_se <- run_fc(files = bams[se], PE = FALSE) + fc_pe <- run_fc(files = bams[!se], PE = TRUE) + # before merging, check + stopifnot(identical(rownames(fc_pe$counts), + rownames(fc_se$counts))) + stopifnot(identical(fc_pe$annotation$GeneID, + rownames(fc_se$counts))) + fc <- list(counts = cbind(fc_pe$counts, fc_se$counts), + annotation = fc_se$annotation, + targets = c(fc_pe$targets, fc_se$targets), + stat = cbind(Status = fc_pe$stat$Status, + fc_pe$stat[!names(fc_pe$stat) == "Status"], + fc_se$stat[!names(fc_se$stat) == "Status"])) +} + +write.table(fc$stat, + file = snakemake@output[["stat"]], + row.names = FALSE, quote = FALSE, sep = "\t" +) +saveRDS(fc, file = snakemake@output[["fc"]]) diff --git a/workflow/scripts/gtf2bed.py b/workflow/scripts/gtf2bed.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce318dca63eb9e4a83734dd8f850dd10dd2c6dc --- /dev/null +++ b/workflow/scripts/gtf2bed.py @@ -0,0 +1,16 @@ +import gffutils + +db = gffutils.create_db(snakemake.input[0], + dbfn=snakemake.output.db, + force=True, + keep_order=True, + merge_strategy='merge', + sort_attribute_values=True, + disable_infer_genes=True, + disable_infer_transcripts=True) + +with open(snakemake.output.bed, 'w') as outfileobj: + for tx in db.features_of_type('transcript', order_by='start'): + bed = [s.strip() for s in db.bed12(tx).split('\t')] + bed[3] = tx.id + outfileobj.write('{}\n'.format('\t'.join(bed))) diff --git a/workflow/scripts/plot-pca.R b/workflow/scripts/plot-pca.R new file mode 100644 index 0000000000000000000000000000000000000000..35acb9b53184096f8de4aeaf5e344df3e19d57e3 --- /dev/null +++ b/workflow/scripts/plot-pca.R @@ -0,0 +1,37 @@ +log <- file(snakemake@log[[1]], open = "wt") +sink(log) +sink(log, type = "message") + +suppressPackageStartupMessages(library("DESeq2")) +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("cowplot")) + +# load deseq2 data +dds <- readRDS(snakemake@input[[1]]) + +# obtain normalized counts with stabilized variance +# but vst() uses only 1000 genes, must check if we don't have less +if (nrow(dds) < 1000) { + # then stabilize the variance using them all + # https://support.bioconductor.org/p/98634/ + vst_counts_blind <- varianceStabilizingTransformation(dds, blind = TRUE) + vst_counts_design <- varianceStabilizingTransformation(dds, blind = FALSE) +} else { + vst_counts_blind <- vst(dds, blind = TRUE) + vst_counts_design <- vst(dds, blind = FALSE) +} +p1 <- plotPCA(vst_counts_blind, intgroup = snakemake@params[["pca_labels"]]) + + labs(title = "blind to design") +legend <- get_legend( + # create some space to the left of the legend + p1 + theme(legend.box.margin = margin(0, 0, 0, 12)) +) +prow <- plot_grid( + p1 + theme(legend.position = "none"), + plotPCA(vst_counts_design, intgroup = snakemake@params[["pca_labels"]]) + + labs(title = "design aware") + theme(legend.position = "none"), + align = "vh") + +pdf(snakemake@output[[1]], width = 9) +plot_grid(prow, legend, rel_widths = c(2, .4)) +dev.off() diff --git a/workflow/scripts/session.R b/workflow/scripts/session.R new file mode 100644 index 0000000000000000000000000000000000000000..766ae1e6bd731f9d5f84fe37e6c63dceec542a45 --- /dev/null +++ b/workflow/scripts/session.R @@ -0,0 +1,110 @@ +log <- file(snakemake@log[[1]], open = "wt") +sink(log) +sink(log, type = "message") + +# versions +out <- snakemake@output[[1]] + +print(out) + +write("", file = out) +write(paste("STAR", system2("STAR", "--version", stdout = TRUE)), + file = out, append = TRUE +) +write(system2("samtools", "--version", stdout = TRUE)[1], + file = out, append = TRUE +) +write(system2("fastqc", "--version", stdout = TRUE), + file = out, append = TRUE +) +write(system2("fastq_screen", "--version", stdout = TRUE), + file = out, append = TRUE +) +write(system2("AdapterRemoval", "--version", stdout = TRUE, stderr = TRUE), + file = out, append = TRUE +) +write("", file = out, append = TRUE) +write(sessionInfo()$R.version$version.string, file = out, append = TRUE) +write(sessionInfo()$running, file = out, append = TRUE) +write("", file = out, append = TRUE) +pkgs <- c("ggplot2", "DESeq2", "apeglm", "Rsubread") +# from sessioninfo::package_info() +pkg_info <- function(pkgs = NULL, lib.loc = .libPaths()) { + + desc <- lapply(pkgs, function(x) { + suppressWarnings(utils::packageDescription(x, + lib.loc = lib.loc)) + }) + ondiskversion <- vapply(desc, function(x) { + ifelse(is.null(x$Version), NA_character_, x$Version) + }, character(1)) + date <- vapply(desc, function(desc) { + if (!is.null(desc$`Date/Publication`)) { + date <- desc$`Date/Publication` + } + else if (!is.null(desc$Built)) { + built <- strsplit(desc$Built, "; ")[[1]] + date <- built[3] + } + else { + date <- NA_character_ + } + as.character(as.Date(strptime(date, "%Y-%m-%d"))) + }, character(1)) + source <- vapply(desc, function(desc) { + if (is.null(desc)) { + NA_character_ + } + else if (!is.null(desc$GithubSHA1)) { + str <- paste0("Github (", desc$GithubUsername, "/", desc$GithubRepo, + "@", substr(desc$GithubSHA1, 1, 7), ")") + } + else if (!is.null(desc$RemoteType) && desc$RemoteType != "cran") { + remote_type <- desc$RemoteType + if (!is.null(desc$RemoteUsername) && (!is.null(desc$RemoteRepo))) { + user_repo <- paste0(desc$RemoteUsername, "/", desc$RemoteRepo) + } + else { + user_repo <- NULL + } + if (!is.null(desc$RemoteSha)) { + sha <- paste0("@", substr(desc$RemoteSha, 1, 7)) + } + else { + sha <- NULL + } + if (!is.null(user_repo) || !is.null(sha)) { + user_repo_and_sha <- paste0(" (", user_repo, sha, + ")") + } + else { + user_repo_and_sha <- NULL + } + str <- paste0(remote_type, user_repo_and_sha) + } + else if (!is.null(desc$Repository)) { + repo <- desc$Repository + if (!is.null(desc$Built)) { + built <- strsplit(desc$Built, "; ")[[1]] + ver <- sub("$R ", "", built[1]) + repo <- paste0(repo, " (", ver, ")") + } + repo + } + else if (!is.null(desc$biocViews) && desc$biocViews != "") { + "Bioconductor" + } + else { + "local" + } + }, character(1)) + # format result + data.frame(package = pkgs, + version = ondiskversion, + date = date, + source = source) +} +suppressWarnings(write.table(pkg_info(pkgs), + file = out, quote = FALSE, row.names = FALSE, + append = TRUE, sep = "\t" +)) diff --git a/workflow/slurm_profile/config.yaml b/workflow/slurm_profile/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..54508a6aaea6451b37d042363b7f5a6ea93714df --- /dev/null +++ b/workflow/slurm_profile/config.yaml @@ -0,0 +1,7 @@ +use-singularity: true +#singularity-prefix: ../00container +singularity-args: '-B /scratch/users/aginolhac/:/scratch/users/aginolhac' +max-jobs-per-second: 1 +latency-wait: 120 +keep-going: true +cluster: 'sbatch -c {threads} --mail-user={cluster.mail-user} --mail-type={cluster.mail-type} --partition={cluster.partition} --qos={cluster.qos} --time={cluster.time} {cluster.extra}' diff --git a/workflow/slurm_profile/iris.yaml b/workflow/slurm_profile/iris.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5bd34947324f2d29a1c8be0c065ba80267d69508 --- /dev/null +++ b/workflow/slurm_profile/iris.yaml @@ -0,0 +1,35 @@ +__default__: + partition: batch + qos: qos-besteffort + cpus: "{threads}" + mail-user: aurelien.ginolhac@uni.lu + mail-type: end,fail + #output: "output/slurm/{rule}/{wildcards}.o" + #error: "output/slurm/{rule}/{wildcards}.e" + time: 0-0:10:00 + extra: "" +qc_raw: + time: 0-0:15:00 + cpus: 2 +adapter_removal_se: + time: 0-2:00:00 + qos: qos-batch +adapter_removal_pe: + time: 0-2:00:00 + qos: qos-batch +align: + cpus: 24 + qos: qos-batch + time: 0-0:40:00 +star_index: + cpus: 16 + qos: qos-batch + time: 0-1:30:00 +get_genome: + cpus: 1 + qos: qos-batch + time: 0-1:00:00 +get_annotation: + cpus: 1 + qos: qos-batch + time: 0-1:00:00