diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index 4b40d92ad4ca994e2d6929f4aeb665f815fd545b..fcc94c9d46297849faf0ec6cbbfb3cb793b19985 100755 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -73,10 +73,9 @@ void BedCoverage::CollectCoverageBed() { bedVector bedBlocks; splitBedIntoBlocks(a, lineNum, bedBlocks); - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) - _bedB->countHits(*bedItr, _forceStrand); + // use countSplitHits to avoid over-counting each split chunk + // as distinct read coverage. + _bedB->countSplitHits(bedBlocks, _forceStrand); } a = nullBed; } @@ -127,12 +126,9 @@ void BedCoverage::CollectCoverageBam(string bamFile) { // since we are counting coverage, we do want to split blocks when a // deletion (D) CIGAR op is encountered (hence the true for the last parm) getBamBlocks(bam, refs, bedBlocks, true); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - _bedB->countHits(*bedItr, _forceStrand); - } + // use countSplitHits to avoid over-counting each split chunk + // as distinct read coverage. + _bedB->countSplitHits(bedBlocks, _forceStrand); } } } diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 091d605516912dd81f671cd4316eda65ceb73d5b..44401427eddb558fc0f8f97de59413c658f401bc 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -100,7 +100,10 @@ void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) { vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); for (; bedItr != bedEnd; ++bedItr) { - AddCoverage(bedItr->start, bedItr->end); + // the end - 1 must be done because BamAncillary::getBamBlocks + // returns ends uncorrected for the genomeCoverageBed data structure. + // ugly, but necessary. + AddCoverage(bedItr->start, bedItr->end - 1); } } diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 0472eef1959ab6e2dd4d126683e48bb0572aad7d..a1611354bb657533ecccc97e05bf5b13cb48b573 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -293,6 +293,7 @@ void BedIntersect::IntersectBam(string bamFile) { else { bool overlapFoundForBlock; bedVector bedBlocks; // vec to store the discrete BED "blocks" from a + // we don't want to split on "D" ops, hence the "false" getBamBlocks(bam, refs, bedBlocks, false); vector<BED>::const_iterator bedItr = bedBlocks.begin(); diff --git a/src/utils/BamTools/BamAncillary.cpp b/src/utils/BamTools/BamAncillary.cpp index 3633d4c970c9ed669de106e576b3e06c149c169c..0cdc80a3da51071fae3d9c8f9b7809cb1a0d02ed 100755 --- a/src/utils/BamTools/BamAncillary.cpp +++ b/src/utils/BamTools/BamAncillary.cpp @@ -19,7 +19,7 @@ using namespace std; namespace BamTools { void getBamBlocks(const BamAlignment &bam, const RefVector &refs, - vector<BED> &blocks, bool includeDeletions) { + vector<BED> &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; @@ -32,38 +32,30 @@ namespace BamTools { vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { - switch (cigItr->Type) { - case 'M': + if (cigItr->Type == 'M') { currPosition += cigItr->Length; blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); - break; - - case 'S': - case 'P': - case 'H': - case 'I': - // Insertion relative to the reference genome. - // Don't advance the current position, since no new nucleotides are covered. - break; - - case 'D': - if (includeDeletions == true) - currPosition += cigItr->Length; - else { - blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); + blockStart = currPosition; + } + else if (cigItr->Type == 'D') { + if (breakOnDeletionOps == false) currPosition += cigItr->Length; - blockStart = currPosition; - } - case 'N': + else { + currPosition += cigItr->Length; + blockStart = currPosition; + } + } + else if (cigItr->Type == 'N') { currPosition += cigItr->Length; - blockStart = currPosition; - break; - - default: - cerr << "Input error: invalid CIGAR type (" << cigItr->Type + blockStart = currPosition; } + else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { + // do nothing + } + else { + cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); - } + } } } } \ No newline at end of file diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 2662079f08daa6c4dd42fc06978f72e4cdc5fd2d..4ffe3e70d9249a8ac38ca78b779daccdc6bed4e5 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -370,6 +370,62 @@ void BedFile::countHits(const BED &a, bool forceStrand) { } +void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool forceStrand) { + + // set to track the distinct B features that had coverage. + // we'll update the counts of coverage for these features by one + // at the end of this function to avoid over-counting. + set< vector<BEDCOV>::iterator > validHits; + + vector<BED>::const_iterator blockItr = bedBlocks.begin(); + vector<BED>::const_iterator blockEnd = bedBlocks.end(); + for (; blockItr != blockEnd; ++blockItr) { + + BIN startBin, endBin; + startBin = (blockItr->start >> _binFirstShift); + endBin = ((blockItr->end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (BINLEVEL i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOV>::iterator bedItr = bedCovMap[blockItr->chrom][j].begin(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + // skip the hit if not on the same strand (and we care) + if (forceStrand && (blockItr->strand != bedItr->strand)) { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, blockItr->start, blockItr->end) > 0) { + bedItr->depthMap[blockItr->start+1].starts++; + bedItr->depthMap[blockItr->end].ends++; + validHits.insert(bedItr); + if (blockItr->start < bedItr->minOverlapStart) + bedItr->minOverlapStart = blockItr->start; + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } + } + // incrment the count of overlap by one for each B feature that overlapped + // the current passed hit. This is necessary to prevent over-counting for + // each "split"" of a single read. + set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin(); + set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end(); + for (; validHitsItr != validHitsEnd; ++validHitsItr) + (*validHitsItr)->count++; +} + + void BedFile::setGff (bool gff) { if (gff == true) this->_isGff = true; else this->_isGff = false; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 10e6a5897b12dfa62c76a801be6511d9e9aae55d..c685b860f6493855243fe5f14074b56cc0537a7f 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -15,6 +15,7 @@ #include "gzstream.h" #include <vector> #include <map> +#include <set> #include <string> #include <iostream> #include <fstream> @@ -295,6 +296,13 @@ public: // that the feature overlaps void countHits(const BED &a, bool forceStrand); + // same as above, but has special logic that processes a set of + // BED "blocks" from a single entry so as to avoid over-counting + // each "block" of a single BAM/BED12 as distinct coverage. That is, + // if one read has four block, we only want to count the coverage as + // coming from one read, notfour. + void countSplitHits(const vector<BED> &bedBlock, bool forceStrand); + // the bedfile with which this instance is associated string bedFile; unsigned int bedType; // 3-6, 12 for BED