Unverified Commit 4607b5fc authored by Antonie Vietor's avatar Antonie Vietor Committed by GitHub
Browse files

Filtering (#3)

* trimming with cutadapt added

* additions to cutadapt according to View #1, BWA added as draft

* adding cutadapt comments back, bwa mem will be moved to a new branch

* mapping, duplicate marking and merging

* changes according to review #2 added; some minor additional changes to the handling of cutadapt log files for multiqc input in common.smk were necessary

* code formatting with black

* changes according view #2 added and replaced directory name mapped with merged in bwa_mem output, because its bam files are only temporary and otherwise the empty mapped directory would remain after the workflow-run

* rule for samtools view added

* rule samtools_view added, rules for samtools_index, flagstat, idxstats and samtools_stats were created as drafts and are therefore still commented out

* adding filtering rules, rule for removing orphan reads, adding rule for samtools sorting and statistics for multiqc

* Higher parallelization of the stats rules by outsourcing general rules for this purpose in stats.smk and utils.smk. Statistics for multiqc input of BAM files after 'bwa_mem' and 'merge_duplicates' added.

* Changes according to View #3.

* refactoring _dedup

* Log-path refactoring according to view #3. In multiqc-report the hovering of data points for samtools stats, flagstat and idxstats works for Chromium browser, but not for Firefox. Additional plots can be generated from the multiqc_data statistics, as described here: https://github.com/ewels/MultiQC/issues/512.
parent bb5a88a1
{
"filters" : [
{
"id" : "paired_end",
"isPaired" : "true"
},
{
"id" : "mismatch",
"tag" : "NM:<=4"
},
{
"id" : "min_size",
"insertSize" : ">=-2000"
},
{
"id" : "max_size",
"insertSize" : "<=2000"
}
],
"rule" : " (paired_end & mismatch & min_size & max_size) | (!paired_end & mismatch) "
}
......@@ -5,6 +5,9 @@ include: "rules/common.smk"
include: "rules/qc.smk"
include: "rules/cutadapt.smk"
include: "rules/mapping.smk"
include: "rules/filtering.smk"
include: "rules/stats.smk"
include: "rules/utils.smk"
def all_input(wildcards):
......@@ -38,15 +41,16 @@ def all_input(wildcards):
)
)
# mapping and merging bam-files
wanted_input.extend(
expand (
[
"results/merged/dedup/{sample}.bam"
],
sample = sample
# mapping, merging and filtering bam-files
for sample in samples.index:
wanted_input.extend(
expand (
[
"results/orphan_rm_sorted/{sample}.bam"
],
sample = sample
)
)
)
return wanted_input
......
# conda environments needed for the workflow should be stored here.
channels:
- bioconda
dependencies:
- pysam ==0.15.4
......@@ -67,7 +67,10 @@ def get_multiqc_input(wildcards):
expand (
[
"results/qc/fastqc/{sample}.{unit}.{reads}_fastqc.zip",
"results/qc/fastqc/{sample}.{unit}.{reads}.html"
"results/qc/fastqc/{sample}.{unit}.{reads}.html",
"results/mapped/{sample}-{unit}.mapped.flagstat",
"results/mapped/{sample}-{unit}.mapped.idxstats",
"results/mapped/{sample}-{unit}.mapped.stats.txt"
],
sample = sample,
unit = unit,
......@@ -78,7 +81,16 @@ def get_multiqc_input(wildcards):
multiqc_input.extend(
expand (
[
"results/merged/dedup/{sample}.metrics.txt"
"results/picard_dedup/{sample}.metrics.txt",
"results/picard_dedup/{sample}.picard_dedup.flagstat",
"results/picard_dedup/{sample}.picard_dedup.idxstats",
"results/picard_dedup/{sample}.picard_dedup.stats.txt",
"results/filtered/{sample}.filtered.flagstat",
"results/filtered/{sample}.filtered.idxstats",
"results/filtered/{sample}.filtered.stats.txt",
"results/orphan_rm_sorted/{sample}.orphan_rm_sorted.idxstats",
"results/orphan_rm_sorted/{sample}.orphan_rm_sorted.flagstat",
"results/orphan_rm_sorted/{sample}.orphan_rm_sorted.stats.txt"
],
sample = sample
)
......
......@@ -12,7 +12,7 @@ rule cutadapt_pe:
log:
"logs/cutadapt/{sample}-{unit}.pe.log"
wrapper:
"0.52.0/bio/cutadapt/pe"
"0.60.0/bio/cutadapt/pe"
rule cutadapt_se:
input:
......@@ -29,4 +29,4 @@ rule cutadapt_se:
log:
"logs/cutadapt/{sample}-{unit}.se.log"
wrapper:
"0.52.0/bio/cutadapt/se"
"0.60.0/bio/cutadapt/se"
# First filtering pass: restrict the duplicate-marked BAM to mapped,
# properly paired reads using `samtools view` flag filters.
rule samtools_view:
    input:
        "results/picard_dedup/{sample}.bam"
    output:
        # pipe(): the BAM is streamed straight into the consuming rule
        # (bamtools_filter_json) without being written to disk;
        # temp() marks the path for cleanup.
        pipe(temp("results/sam-view/{sample}.bam"))
    params:
        # -b: emit BAM; -F 0x004: drop unmapped reads;
        # -G 0x009: drop paired reads whose mate is unmapped;
        # -f 0x001: keep only reads flagged as paired
        "-b -F 0x004 -G 0x009 -f 0x001"
        # if duplicates should be removed in this filtering, add "-F 0x0400" to the params
        # if for each read, you only want to retain a single (best) mapping, add "-q 1" to params
        # if you would like to restrict analysis to certain regions (e.g. excluding other "blacklisted" regions),
        # please provide a respective bed file via "-L path/to/regions.bed"
    wrapper:
        "0.60.0/bio/samtools/view"
# Second filtering pass: apply the bamtools filter set defined in the
# project JSON config (mismatch limit for all reads, insert-size window
# for paired-end reads) to the stream produced by samtools_view.
rule bamtools_filter_json:
    input:
        "results/sam-view/{sample}.bam"
    output:
        temp("results/filtered/{sample}.bam")
    params:
        # filters mismatches in all reads and filters pe-reads within a size range given in json-file
        # NOTE(review): path is relative to the rules directory — confirm it
        # resolves correctly from the workflow's working directory.
        json="../config/bamtools_filtering_rules.json"
    log:
        "logs/filtered/{sample}.log"
    wrapper:
        "0.60.0/bio/bamtools/filter_json"
#TODO for later: customize and substitute rm_orphan_pe_bam.py with some existing tool
# Drop singleton reads (read present without its mate) from the filtered
# BAM via the bundled helper script; also writes a small QC log.
rule orphan_remove:
    input:
        "results/filtered/{sample}.bam"
    output:
        bam=temp("results/orphan_rm/{sample}.bam"),
        # qc file name must match what rm_orphan_pe_bam.py derives from
        # the output BAM basename ("<name>_bampe_rm_orphan.log")
        qc="results/orphan_rm/{sample}_bampe_rm_orphan.log"
    params:
        # keep only same-chromosome pairs in forward/reverse orientation
        "--only_fr_pairs"
    conda:
        "../envs/pysam.yaml"
    shell:
        # NOTE(review): invokes the script directly, relying on its shebang
        # and executable bit, and on a path relative to the working
        # directory — consider `python {workflow.basedir}/...` instead.
        " ../workflow/scripts/rm_orphan_pe_bam.py {input} {output.bam} {params} "
# Coordinate-sort the orphan-removed BAM; this is the final per-sample
# BAM requested by all_input in the Snakefile.
rule samtools_sort:
    input:
        "results/orphan_rm/{sample}.bam"
    output:
        "results/orphan_rm_sorted/{sample}.bam"
    params:
        ""  # optional extra args for samtools sort
    threads: # Samtools takes additional threads through its option -@
        8
    wrapper:
        "0.60.0/bio/samtools/sort"
......@@ -8,7 +8,7 @@ rule bwa_index:
params:
algorithm="bwtsw"
wrapper:
"0.55.1/bio/bwa/index"
"0.60.0/bio/bwa/index"
rule bwa_mem:
input:
......@@ -25,7 +25,7 @@ rule bwa_mem:
sort_order="coordinate",
threads: 8
wrapper:
"0.52.0/bio/bwa/mem"
"0.60.0/bio/bwa/mem"
rule merge_bams:
input:
......@@ -40,17 +40,17 @@ rule merge_bams:
params:
"VALIDATION_STRINGENCY=LENIENT"
wrapper:
"0.55.1/bio/picard/mergesamfiles"
"0.60.0/bio/picard/mergesamfiles"
rule mark_merged_duplicates:
input:
"results/merged/{sample}.bam"
output:
bam="results/merged/dedup/{sample}.bam",
metrics="results/merged/dedup/{sample}.metrics.txt"
bam=temp("results/picard_dedup/{sample}.bam"),
metrics="results/picard_dedup/{sample}.metrics.txt"
log:
"logs/picard/dedup_merged/{sample}.log"
"logs/picard/picard_dedup/{sample}.log"
params:
"REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=LENIENT"
wrapper:
"0.55.1/bio/picard/markduplicates"
"0.60.0/bio/picard/markduplicates"
......@@ -7,7 +7,7 @@ rule fastqc:
log:
"logs/fastqc/{sample}.{unit}.{read}.log"
wrapper:
"0.51.2/bio/fastqc"
"0.60.0/bio/fastqc"
rule multiqc:
input:
......@@ -17,4 +17,4 @@ rule multiqc:
log:
"logs/multiqc.log"
wrapper:
"0.51.3/bio/multiqc"
"0.60.0/bio/multiqc"
# Generic `samtools flagstat` rule: the {step} wildcard (constrained to
# contain no '.' or '/') lets one rule serve every pipeline stage
# (mapped, picard_dedup, filtered, orphan_rm_sorted, ...).
rule samtools_flagstat:
    input:
        "results/{step}/{samples_units}.bam"
    output:
        "results/{step,[^./]+}/{samples_units}.{step}.flagstat"
    # NOTE(review): no log: directive here, unlike the idxstats/stats
    # rules — consider adding one for consistency.
    wrapper:
        "0.60.0/bio/samtools/flagstat"
# Generic `samtools idxstats` rule; requires the BAM index, which is
# produced on demand by the samtools_index rule below.
rule samtools_idxstats:
    input:
        bam = "results/{step}/{samples_units}.bam",
        idx = "results/{step}/{samples_units}.bam.bai"
    output:
        "results/{step,[^./]+}/{samples_units}.{step}.idxstats"
    log:
        "logs/samtools-idxstats/{step}/{samples_units}.{step}.log"
    wrapper:
        "0.60.0/bio/samtools/idxstats"
# Generic `samtools stats` rule producing the *.stats.txt files that
# feed the multiqc report.
rule samtools_stats:
    input:
        "results/{step}/{samples_units}.bam"
    output:
        "results/{step,[^./]+}/{samples_units}.{step}.stats.txt"
    params:
        ""  # optional extra args for samtools stats
    log:
        "logs/samtools-stats/{step}/{samples_units}.{step}.log"
    wrapper:
        "0.60.0/bio/samtools/stats"
# Generic BAM indexing rule; any rule that needs a .bam.bai (e.g.
# samtools_idxstats) triggers this for the matching stage.
rule samtools_index:
    input:
        "results/{step}/{samples_units}.bam"
    output:
        "results/{step}/{samples_units}.bam.bai"
    params:
        "" # optional params string
    wrapper:
        "0.60.0/bio/samtools/index"
# Any Python script in the scripts folder will be able to import from this module.
#!/usr/bin/env python
# source: https://github.com/nf-core/chipseq/blob/master/bin/bampe_rm_orphan.py
# MIT License
#
# Copyright (c) Philip Ewels
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
###############################################################################
#!# own changes and adjustments for snakemake-workflow chipseq are marked with "#!# AVI: " in comment
###############################################################################
###############################################################################
## Created on February 1st 2017 to remove singletons from paired-end BAM file
###############################################################################
###############################################################################
import os
import pysam
import argparse
############################################
############################################
## PARSE ARGUMENTS
############################################
############################################
# Module-level CLI definition: runs immediately when the script executes.
Description = 'Remove singleton reads from paired-end BAM file i.e if read1 is present in BAM file without read 2 and vice versa.'
Epilog = """Example usage: bampe_rm_orphan.py <BAM_INPUT_FILE> <BAM_OUTPUT_FILE>"""
argParser = argparse.ArgumentParser(description=Description, epilog=Epilog)
## REQUIRED PARAMETERS
argParser.add_argument('BAM_INPUT_FILE', help="Input BAM file sorted by name.")
argParser.add_argument('BAM_OUTPUT_FILE', help="Output BAM file sorted by name.")
## OPTIONAL PARAMETERS
# --only_fr_pairs: keep only same-chromosome pairs in FR orientation
argParser.add_argument('-fr', '--only_fr_pairs', dest="ONLY_FR_PAIRS", help="Only keeps pairs that are in FR orientation on same chromosome.",action='store_true')
args = argParser.parse_args()
############################################
############################################
## HELPER FUNCTIONS
############################################
############################################
def makedir(path):
    """Create directory *path* (including parents) if it does not exist.

    An empty path is a no-op, so an output file in the current working
    directory does not trigger a ``makedirs('')`` error.
    """
    if len(path) != 0:
        #!# AVI: changed because of race conditions if directory exists, original code: os.makedirs(path)
        # exist_ok=True already tolerates a pre-existing directory, so the
        # old `except OSError` / errno.EEXIST fallback was removed: it
        # referenced the never-imported `errno` module and would itself
        # have raised NameError had it ever fired.
        os.makedirs(path, exist_ok=True)
############################################
############################################
## MAIN FUNCTION
############################################
############################################
def bampe_rm_orphan(BAMIn,BAMOut,onlyFRPairs=False):
    """Remove singleton (orphan) reads from a name-sorted paired-end BAM.

    Walks *BAMIn* (must be sorted by read name so mates are adjacent),
    writes surviving pairs to *BAMOut*, and writes a summary log file
    ``<BAMOut basename>_bampe_rm_orphan.log`` next to the output.

    With ``onlyFRPairs=True`` only pairs on the same chromosome in
    forward/reverse orientation (read1 upstream of read2 or vice versa)
    are kept; other same-name pairs are counted as improper and dropped.
    """
    ## SETUP DIRECTORY/FILE STRUCTURE
    OutDir = os.path.dirname(BAMOut)
    makedir(OutDir)

    ## COUNT VARIABLES
    totalReads = 0; totalOutputPairs = 0; totalSingletons = 0; totalImproperPairs = 0

    ## ITERATE THROUGH BAM FILE
    EOF = 0
    SAMFin = pysam.AlignmentFile(BAMIn,"rb") #!# AVI: changed to new API from pysam.Samfile
    SAMFout = pysam.AlignmentFile(BAMOut, "wb",header=SAMFin.header) #!# AVI: changed to new API from pysam.Samfile
    currRead = next(SAMFin) #!# AVI: adapted for the use of the iterator, original code: currRead = SAMFin.next()
    for read in SAMFin.fetch(until_eof=True): #!# AVI: added .fetch() to explicitly use new API
        totalReads += 1
        if currRead.qname == read.qname:
            pair1 = currRead; pair2 = read

            ## FILTER FOR READS ON SAME CHROMOSOME IN FR ORIENTATION
            if onlyFRPairs:
                if pair1.tid == pair2.tid:

                    ## READ1 FORWARD AND READ2 REVERSE STRAND
                    if not pair1.is_reverse and pair2.is_reverse:
                        if pair1.reference_start <= pair2.reference_start:
                            totalOutputPairs += 1
                            SAMFout.write(pair1)
                            SAMFout.write(pair2)
                        else:
                            totalImproperPairs += 1

                    ## READ1 REVERSE AND READ2 FORWARD STRAND
                    elif pair1.is_reverse and not pair2.is_reverse:
                        if pair2.reference_start <= pair1.reference_start:
                            totalOutputPairs += 1
                            SAMFout.write(pair1)
                            SAMFout.write(pair2)
                        else:
                            totalImproperPairs += 1
                    else:
                        totalImproperPairs += 1
                else:
                    totalImproperPairs += 1
            else:
                totalOutputPairs += 1
                SAMFout.write(pair1)
                SAMFout.write(pair2)

            ## RESET COUNTER
            # Advance past the second read of the pair so the next loop
            # iteration compares two fresh reads.
            try:
                totalReads += 1
                currRead = next(SAMFin) #!# AVI: adapted for the use of the iterator, original code: currRead = SAMFin.next()
            except StopIteration:
                #!# AVI: bugfix — was `except:` with the bare expression
                #!# `StopIteration` as a no-op statement, which silently
                #!# swallowed every exception type instead of only the
                #!# iterator-exhaustion signal.
                EOF = 1

        ## READS WHERE ONLY ONE OF A PAIR IS IN FILE
        else:
            totalSingletons += 1
            pair1 = currRead
            currRead = read

    # The last buffered read never found a mate: count it as a singleton,
    # unless the file ended exactly on a completed pair (EOF flag set).
    if not EOF:
        totalReads += 1
        totalSingletons += 1
        pair1 = currRead

    ## CLOSE ALL FILE HANDLES
    SAMFin.close()
    SAMFout.close()

    ## WRITE SUMMARY LOG (consumed as the qc: output of rule orphan_remove)
    LogFile = os.path.join(OutDir,'%s_bampe_rm_orphan.log' % (os.path.basename(BAMOut[:-4])))
    SamLogFile = open(LogFile,'w')
    SamLogFile.write('\n##############################\n')
    SamLogFile.write('FILES/DIRECTORIES')
    SamLogFile.write('\n##############################\n\n')
    SamLogFile.write('Input File: ' + BAMIn + '\n')
    SamLogFile.write('Output File: ' + BAMOut + '\n')
    SamLogFile.write('\n##############################\n')
    SamLogFile.write('OVERALL COUNTS')
    SamLogFile.write('\n##############################\n\n')
    SamLogFile.write('Total Input Reads = ' + str(totalReads) + '\n')
    SamLogFile.write('Total Output Pairs = ' + str(totalOutputPairs) + '\n')
    SamLogFile.write('Total Singletons Excluded = ' + str(totalSingletons) + '\n')
    SamLogFile.write('Total Improper Pairs Excluded = ' + str(totalImproperPairs) + '\n')
    SamLogFile.write('\n##############################\n')
    SamLogFile.close()
############################################
############################################
## RUN FUNCTION
############################################
############################################
# Script entry point: run immediately with the CLI arguments parsed above.
bampe_rm_orphan(BAMIn=args.BAM_INPUT_FILE,BAMOut=args.BAM_OUTPUT_FILE,onlyFRPairs=args.ONLY_FR_PAIRS)
############################################
############################################
############################################
############################################
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment