Verified Commit f8084486 authored by Aurélien Ginolhac's avatar Aurélien Ginolhac 🚴
Browse files

trimming with adapter removal

parent 7ac84a08
sample group batch_effect control antibody sample group batch_effect control antibody
A TNFa batch1 D p65
B TNFa batch2 D p65
C E2TNFa batch1 E p65
D TNFa batch1
E E2TNFa batch1
sample unit fq1 fq2 sra_accession platform sample unit fq1 fq2 sra_accession platform
A 1 data/single_end_test_data/A-1_chr21.fastq.gz ILLUMINA
B 1 data/single_end_test_data/B-1_chr21.fastq.gz ILLUMINA
C 1 data/single_end_test_data/C-1_chr21.fastq.gz ILLUMINA
D 1 data/single_end_test_data/D-1_chr21.fastq.gz ILLUMINA
E 1 data/single_end_test_data/E-1_chr21.fastq.gz ILLUMINA
...@@ -81,19 +81,11 @@ def get_individual_fastq(wildcards): ...@@ -81,19 +81,11 @@ def get_individual_fastq(wildcards):
else: else:
return units.loc[ (wildcards.sample, wildcards.unit), "fq2" ] return units.loc[ (wildcards.sample, wildcards.unit), "fq2" ]
def get_fastqs(wildcards): def get_fastq(wildcards):
"""Get raw FASTQ files from unit sheet.""" fastqs = units.loc[(wildcards.sample, wildcards.unit), ["fq1", "fq2"]].dropna()
if is_sra_se(wildcards.sample, wildcards.unit): if len(fastqs) == 2:
return expand("resources/ref/sra-se-reads/{accession}.fastq", return {"fq1": fastqs.fq1, "fq2": fastqs.fq2}
accession=units.loc[ (wildcards.sample, wildcards.unit), "sra_accession" ]) return {"fq1": fastqs.fq1}
elif is_sra_pe(wildcards.sample, wildcards.unit):
return expand(["resources/ref/sra-pe-reads/{accession}.1.fastq", "ref/sra-pe-reads/{accession}.2.fastq"],
accession=units.loc[ (wildcards.sample, wildcards.unit), "sra_accession" ])
elif is_single_end(wildcards.sample, wildcards.unit):
return units.loc[ (wildcards.sample, wildcards.unit), "fq1" ]
else:
u = units.loc[ (wildcards.sample, wildcards.unit), ["fq1", "fq2"] ].dropna()
return [ f"{u.fq1}", f"{u.fq2}" ]
def is_control(sample): def is_control(sample):
control = samples.loc[sample]["control"] control = samples.loc[sample]["control"]
...@@ -151,8 +143,8 @@ def get_samples_of_antibody(antibody): ...@@ -151,8 +143,8 @@ def get_samples_of_antibody(antibody):
def get_map_reads_input(wildcards): def get_map_reads_input(wildcards):
if is_single_end(wildcards.sample, wildcards.unit): if is_single_end(wildcards.sample, wildcards.unit):
return "results/trimmed/{sample}-{unit}.fastq.gz" return "results/trimmed_se/{sample}-{unit}.fastq.gz"
return ["results/trimmed/{sample}-{unit}.1.fastq.gz", "results/trimmed/{sample}-{unit}.2.fastq.gz"] return ["results/trimmed_pe/{sample}-{unit}.1.fastq.gz", "results/trimmed_pe/{sample}-{unit}.2.fastq.gz"]
def get_read_group(wildcards): def get_read_group(wildcards):
"""Denote sample name and platform in read group.""" """Denote sample name and platform in read group."""
...@@ -346,8 +338,8 @@ def all_input(wildcards): ...@@ -346,8 +338,8 @@ def all_input(wildcards):
if is_single_end(sample, unit): if is_single_end(sample, unit):
wanted_input.extend(expand( wanted_input.extend(expand(
[ [
"results/trimmed/{sample}-{unit}.fastq.gz", "results/trimmed_se/{sample}-{unit}.fastq.gz"#,
"results/trimmed/{sample}-{unit}.se.qc.txt" #"results/trimmed_se/{sample}-{unit}.settings"
], ],
sample = sample, sample = sample,
unit = unit unit = unit
...@@ -357,9 +349,9 @@ def all_input(wildcards): ...@@ -357,9 +349,9 @@ def all_input(wildcards):
wanted_input.extend( wanted_input.extend(
expand ( expand (
[ [
"results/trimmed/{sample}-{unit}.1.fastq.gz", "results/trimmed_pe/{sample}-{unit}.R1.fastq.gz",
"results/trimmed/{sample}-{unit}.2.fastq.gz", "results/trimmed_pe/{sample}-{unit}.R2.fastq.gz",
"results/trimmed/{sample}-{unit}.pe.qc.txt" "results/trimmed_pe/{sample}-{unit}.settings"
], ],
sample = sample, sample = sample,
unit = unit unit = unit
......
...@@ -20,7 +20,7 @@ rule bamtools_filter_json: ...@@ -20,7 +20,7 @@ rule bamtools_filter_json:
temp("results/bamtools_filtered/{sample}.bam") temp("results/bamtools_filtered/{sample}.bam")
params: params:
# filters mismatches in all reads and filters pe-reads within a size range given in json-file # filters mismatches in all reads and filters pe-reads within a size range given in json-file
json="../config/{}_bamtools_filtering_rules.json".format("se" if config["single_end"] else "pe") json="config/{}_bamtools_filtering_rules.json".format("se" if config["single_end"] else "pe")
log: log:
"logs/filtered/{sample}.log" "logs/filtered/{sample}.log"
wrapper: wrapper:
...@@ -54,7 +54,7 @@ rule orphan_remove: ...@@ -54,7 +54,7 @@ rule orphan_remove:
conda: conda:
"../envs/pysam.yaml" "../envs/pysam.yaml"
shell: shell:
" ../workflow/scripts/rm_orphan_pe_bam.py {input} {output.bam} {params} 2> {log}" " workflow/scripts/rm_orphan_pe_bam.py {input} {output.bam} {params} 2> {log}"
rule samtools_sort_pe: rule samtools_sort_pe:
input: input:
......
...@@ -206,7 +206,7 @@ rule phantompeakqualtools: ...@@ -206,7 +206,7 @@ rule phantompeakqualtools:
rule phantompeak_correlation: rule phantompeak_correlation:
input: input:
data="results/phantompeakqualtools/{sample}.phantompeak.Rdata", data="results/phantompeakqualtools/{sample}.phantompeak.Rdata",
header="../workflow/header/spp_corr_header.txt" header="workflow/header/spp_corr_header.txt"
output: output:
"results/phantompeakqualtools/{sample}.spp_correlation_mqc.tsv" "results/phantompeakqualtools/{sample}.spp_correlation_mqc.tsv"
log: log:
...@@ -220,8 +220,8 @@ rule phantompeak_multiqc: ...@@ -220,8 +220,8 @@ rule phantompeak_multiqc:
# NSC (Normalized strand cross-correlation) and RSC (relative strand cross-correlation) metrics use cross-correlation # NSC (Normalized strand cross-correlation) and RSC (relative strand cross-correlation) metrics use cross-correlation
input: input:
data="results/phantompeakqualtools/{sample}.phantompeak.spp.out", data="results/phantompeakqualtools/{sample}.phantompeak.spp.out",
nsc_header="../workflow/header/nsc_header.txt", nsc_header="workflow/header/nsc_header.txt",
rsc_header="../workflow/header/rsc_header.txt" rsc_header="workflow/header/rsc_header.txt"
output: output:
nsc="results/phantompeakqualtools/{sample}.spp_nsc_mqc.tsv", nsc="results/phantompeakqualtools/{sample}.spp_nsc_mqc.tsv",
rsc="results/phantompeakqualtools/{sample}.spp_rsc_mqc.tsv" rsc="results/phantompeakqualtools/{sample}.spp_rsc_mqc.tsv"
......
# adapted from https://github.com/UofABioinformaticsHub/RNAseq_snakemake/blob/master/snakefile.py
# Paired-end adapter trimming with AdapterRemoval.
# NOTE(review): outputs use the ".1/.2.fastq.gz" naming expected by
# get_map_reads_input; confirm all_input does not expect ".R1/.R2" instead.
rule trimming_pe:
    input:
        # unpack() converts the dict returned by get_fastq ({"fq1": ..., "fq2": ...})
        # into named inputs, so the shell can reference {input.fq1}/{input.fq2}
        unpack(get_fastq)
    output:
        fastq1="results/trimmed_pe/{sample}-{unit}.1.fastq.gz",
        fastq2="results/trimmed_pe/{sample}-{unit}.2.fastq.gz",
        single="results/trimmed_pe/{sample}-{unit}.singletons.gz",
        discarded="results/trimmed_pe/{sample}-{unit}.discarded.gz",
        settings="results/trimmed_pe/{sample}-{unit}.settings"
    params:
        # adapter: full AdapterRemoval adapter options string from the config
        adapter = config["trimming"]["pe"],
        # others: any extra AdapterRemoval flags (quality trimming, min length, ...)
        others = config["trimming"]["others"]
    container:
        "docker://ginolhac/snake-rna-seq:0.4"
    threads: config["trimming"]["threads"]
    log:
        "logs/trimming/{sample}-{unit}.pe.log"
    shell:
        # fixed: shell previously referenced {output.fq1}/{output.fq2}, which do not
        # exist (outputs are named fastq1/fastq2) and would fail at job submission;
        # also redirect stderr to the declared log file (as other rules do)
        "AdapterRemoval --file1 {input.fq1} --file2 {input.fq2} "
        "--output1 {output.fastq1} --output2 {output.fastq2} "
        "--singleton {output.single} "
        "--settings {output.settings} --discarded {output.discarded} "
        "{params.adapter} "
        "--threads {threads} {params.others} 2> {log}"
# adapted from https://github.com/UofABioinformaticsHub/RNAseq_snakemake/blob/master/snakefile.py
# Single-end adapter trimming with AdapterRemoval.
rule trimming_se:
    input:
        # unpack() converts the dict returned by get_fastq ({"fq1": ...})
        # into a named input, so the shell can reference {input.fq1}
        unpack(get_fastq)
    output:
        fastq="results/trimmed_se/{sample}-{unit}.fastq.gz",
        settings="results/trimmed_se/{sample}-{unit}.settings",
        discarded="results/trimmed_se/{sample}-{unit}.discarded.gz"
    params:
        # adapter: full AdapterRemoval adapter options string from the config
        adapter = config["trimming"]["se"],
        # others: any extra AdapterRemoval flags (quality trimming, min length, ...)
        others = config["trimming"]["others"]
    container:
        "docker://ginolhac/snake-rna-seq:0.4"
    threads: config["trimming"]["threads"]
    log:
        "logs/trimming/{sample}-{unit}.se.log"
    shell:
        # fixed: redirect stderr to the declared log file, which was otherwise
        # never written to (consistent with the workflow's other rules)
        "AdapterRemoval --file1 {input.fq1} "
        "--output1 {output.fastq} "
        "--settings {output.settings} --discarded {output.discarded} "
        "{params.adapter} "
        "--threads {threads} {params.others} 2> {log}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment