From cee8949d76c05d64bad4561a09ad2befb4a80272 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Thu, 26 Jan 2012 13:10:46 -0500 Subject: [PATCH] refactored intersect; better -bed support for split BAM alignments --- src/intersectBed/Makefile | 4 +- src/intersectBed/intersectBed.cpp | 164 +++++++----------- src/intersectBed/intersectBed.h | 8 +- src/utils/BamTools-Ancillary/BamAncillary.cpp | 39 +++++ src/utils/BamTools-Ancillary/BamAncillary.h | 3 + .../BlockedIntervals/BlockedIntervals.cpp | 2 +- src/utils/BlockedIntervals/BlockedIntervals.h | 2 +- src/utils/bedFile/bedFile.h | 3 - 8 files changed, 117 insertions(+), 108 deletions(-) diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile index c96a433e..1eef2c0a 100644 --- a/src/intersectBed/Makefile +++ b/src/intersectBed/Makefile @@ -12,6 +12,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ -I$(UTILITIES_DIR)/chromsweep \ -I$(UTILITIES_DIR)/version/ @@ -20,7 +21,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ # ---------------------------------- SOURCES= intersectMain.cpp intersectBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o gzstream.o fileType.o chromsweep.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o BamAncillary.o gzstream.o fileType.o chromsweep.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= intersectBed @@ -38,6 +39,7 @@ $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/ diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 42093c35..39ed3e31 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -16,7 +16,6 @@ Helper functions ************************************/ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { - bool hitsFound = false; if (_printable == true) { vector<BED>::const_iterator h = hits.begin(); @@ -96,17 +95,8 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { } -bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) { - - // find the hits that overlap with the full span of the blocked BED - _bedB->allHits(a.chrom, a.start, a.end, a.strand, - hits, _sameStrand, _diffStrand, - _overlapFraction, _reciprocal); - - // break a into discrete blocks, as we need to - // measure overlap with the individual blocks, not the full span. - bedVector a_blocks; - GetBedBlocks(a, a_blocks); +bool BedIntersect::FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, + const vector<BED> &hits, bool a_is_bam) { int a_footprint = GetTotalBlockLength(a_blocks); // container to store the set of raw hits // that actually overlap the A blocks @@ -159,7 +149,11 @@ bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) { } } } - return processHits(a, valid_hits); + if (!a_is_bam) { + return processHits(a, valid_hits); + } + else + return !valid_hits.empty(); } @@ -242,8 +236,19 @@ void BedIntersect::IntersectBed() { if (_obeySplits == false) FindOverlaps(a, hits); // split the BED12 into blocks and look for overlaps in each discrete block - else - FindBlockedOverlaps(a, hits); + else { + // find the hits that overlap with the full span of the blocked BED + _bedB->allHits(a.chrom, a.start, a.end, a.strand, + hits, _sameStrand, _diffStrand, + _overlapFraction, _reciprocal); + // break a into discrete blocks, as we need to + // measure overlap with the individual blocks, not the full span. + bedVector a_blocks; + GetBedBlocks(a, a_blocks); + // find the overlaps between the block in A and B + // last parm is false as a is not a BAM entry + FindBlockedOverlaps(a, a_blocks, hits, false); + } hits.clear(); } } @@ -261,8 +266,11 @@ void BedIntersect::IntersectBed() { while (sweep.Next(hit_set)) { if (_obeySplits == false) processHits(hit_set.first, hit_set.second); - else - FindBlockedOverlaps(hit_set.first, hit_set.second); + else { + bedVector a_blocks; + GetBedBlocks(hit_set.first, a_blocks); + FindBlockedOverlaps(hit_set.first, a_blocks, hit_set.second, false); + } } } } @@ -279,19 +287,15 @@ void BedIntersect::IntersectBam(string bamFile) { // using BAM output. if (_bamOutput == false) { _bedA = new BedFile(_bedAFile); - _bedA->bedType = 6; + _bedA->bedType = 12; } - // open the BAM file BamReader reader; BamWriter writer; - reader.Open(bamFile); - // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); - // open a BAM output to stdout if we are writing BAM if (_bamOutput == true) { // set compression mode @@ -301,93 +305,55 @@ void BedIntersect::IntersectBam(string bamFile) { // open our BAM writer writer.Open("stdout", bamHeader, refs); } - vector<BED> hits; // reserve some space hits.reserve(100); - - BamAlignment bam; // get each set of alignments for each pair. while (reader.GetNextAlignment(bam)) { - if (bam.IsMapped()) { - BED a; - a.chrom = refs.at(bam.RefID).RefName; - a.start = bam.Position; - a.end = bam.GetEndPosition(false, false); - - // build the name field from the BAM alignment. - a.name = bam.Name; - if (bam.IsFirstMate()) a.name += "/1"; - if (bam.IsSecondMate()) a.name += "/2"; - - a.score = ToString(bam.MapQuality); - - a.strand = "+"; - if (bam.IsReverseStrand()) a.strand = "-"; - - if (_bamOutput == true) { - bool overlapsFound = false; - // treat the BAM alignment as a single "block" - if (_obeySplits == false) { - overlapsFound = _bedB->anyHits(a.chrom, a.start, a.end, - a.strand, _sameStrand, _diffStrand, - _overlapFraction, _reciprocal); - } - // split the BAM alignment into discrete blocks and - // look for overlaps only within each block. - else { - bool overlapFoundForBlock; - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - // we don't want to split on "D" ops, hence the "false" - GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - overlapFoundForBlock = _bedB->anyHits(bedItr->chrom, bedItr->start, bedItr->end, - bedItr->strand, _sameStrand, _diffStrand, - _overlapFraction, _reciprocal); - if (overlapFoundForBlock == true) - overlapsFound = true; - } - } - if (overlapsFound == true) { - if (_noHit == false) - writer.SaveAlignment(bam); - } - else { - if (_noHit == true) { - writer.SaveAlignment(bam); - } - } - } - else { - // treat the BAM alignment as a single BED "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - } - // split the BAM alignment into discrete BED blocks and - // look for overlaps only within each block. - else { - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); - hits.clear(); - } - } - } - } // BAM IsMapped() is false - else if (_noHit == true) { + if ((!bam.IsMapped()) && (_noHit == true)) writer.SaveAlignment(bam); + else if (!bam.IsMapped()) + continue; + + // break alignment into discrete blocks, + bedVector bed_blocks; + string chrom = refs.at(bam.RefID).RefName; + GetBamBlocks(bam, chrom, bed_blocks); + // create a basic BED entry from the BAM alignment + BED bed; + MakeBedFromBam(bam, chrom, bed_blocks, bed); + bool overlapsFound = false; + if ((_bamOutput == true) && (_obeySplits == false)) + { + overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, + bed.strand, _sameStrand, _diffStrand, + _overlapFraction, _reciprocal); + } + else if ( ((_bamOutput == true) && (_obeySplits == true)) || + ((_bamOutput == false) && (_obeySplits == true)) ) + { + // find the hits that overlap with the full span of the blocked BED + _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand, + hits, _sameStrand, _diffStrand, + _overlapFraction, _reciprocal); + // find the overlaps between the block in A and B + overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput); + } + else if ((_bamOutput == false) && (_obeySplits == false)) + { + FindOverlaps(bed, hits); + } + // save the BAM alignment if overlap reqs. were met + if (_bamOutput == true) { + if ((overlapsFound == true) && (_noHit == false)) + writer.SaveAlignment(bam); + else if ((overlapsFound == false) && (_noHit == true)) + writer.SaveAlignment(bam); } + hits.clear(); } // close the relevant BAM files. diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 221233cd..84c7ebb1 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -18,6 +18,7 @@ #include "api/BamWriter.h" #include "api/BamAux.h" #include "BlockedIntervals.h" +#include "BamAncillary.h" using namespace BamTools; @@ -87,11 +88,12 @@ private: bool processHits(const BED &a, const vector<BED> &hits); bool FindOverlaps(const BED &a, vector<BED> &hits); - - bool FindBlockedOverlaps(const BED &a, vector<BED> &hits); + + bool FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, + const vector<BED> &hits, bool a_is_bam); void ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e); - + void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); }; diff --git a/src/utils/BamTools-Ancillary/BamAncillary.cpp b/src/utils/BamTools-Ancillary/BamAncillary.cpp index 09891e96..e59a6f2d 100644 --- a/src/utils/BamTools-Ancillary/BamAncillary.cpp +++ b/src/utils/BamTools-Ancillary/BamAncillary.cpp @@ -73,4 +73,43 @@ namespace BamTools { blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) ); } } + + void MakeBedFromBam(const BamAlignment &bam, const string &chrom, + const bedVector &blocks, BED &bed) + { + bed.chrom = chrom; + bed.start = bam.Position; + bed.end = bam.GetEndPosition(false, false); + // build the name field from the BAM alignment. + bed.name = bam.Name; + if (bam.IsFirstMate()) bed.name += "/1"; + else if (bam.IsSecondMate()) bed.name += "/2"; + bed.score = ToString(bam.MapQuality); + bed.strand = "+"; + if (bam.IsReverseStrand()) bed.strand = "-"; + + bed.fields.push_back(bed.chrom); + bed.fields.push_back(ToString(bed.start)); + bed.fields.push_back(ToString(bed.end)); + bed.fields.push_back(bed.name); + bed.fields.push_back(bed.score); + bed.fields.push_back(bed.strand); + bed.fields.push_back(ToString(bam.Position)); + bed.fields.push_back(ToString(bed.end)); + bed.fields.push_back("0,0,0"); + bed.fields.push_back(ToString(blocks.size())); + + ostringstream block_lengths, block_starts; + for (size_t i = 0; i < blocks.size(); ++i) { + block_lengths << blocks[i].end - blocks[i].start << ","; + block_starts << blocks[i].start - bam.Position << ","; + } + bed.fields.push_back(block_lengths.str()); + bed.fields.push_back(block_starts.str()); + + // identify which indices relate to the "other" + // (i,e. non-BED6) fields + for (size_t i = 5; i <= 11; ++i) + bed.other_idxs.push_back(i); + } } diff --git a/src/utils/BamTools-Ancillary/BamAncillary.h b/src/utils/BamTools-Ancillary/BamAncillary.h index ed544e25..07cc0678 100644 --- a/src/utils/BamTools-Ancillary/BamAncillary.h +++ b/src/utils/BamTools-Ancillary/BamAncillary.h @@ -16,4 +16,7 @@ namespace BamTools { void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool includeDeletions = true); + + void MakeBedFromBam(const BamAlignment &bam, const string &chrom, + const bedVector &blocks, BED &bed); } diff --git a/src/utils/BlockedIntervals/BlockedIntervals.cpp b/src/utils/BlockedIntervals/BlockedIntervals.cpp index da4c5341..eead4ca7 100644 --- a/src/utils/BlockedIntervals/BlockedIntervals.cpp +++ b/src/utils/BlockedIntervals/BlockedIntervals.cpp @@ -14,7 +14,7 @@ void GetBamBlocks(const BamAlignment &bam, const string &chrom, - vector<BED> &bedBlocks, + bedVector &bedBlocks, bool breakOnDeletionOps) { vector<int> starts; diff --git a/src/utils/BlockedIntervals/BlockedIntervals.h b/src/utils/BlockedIntervals/BlockedIntervals.h index 60556e6e..5a3a5c2c 100644 --- a/src/utils/BlockedIntervals/BlockedIntervals.h +++ b/src/utils/BlockedIntervals/BlockedIntervals.h @@ -22,7 +22,7 @@ using namespace BamTools; */ void GetBamBlocks(const BamAlignment &bam, const string &chrom, - vector<BED> &bedBlocks, + bedVector &bedBlocks, bool breakOnDeletionOps = false); /* break a BED12 record into discrete BED6 blocks. */ diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 2a6016b8..d2b323fd 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -798,7 +798,6 @@ public: */ template <typename T> inline void reportBedTab(const T &bed) { - // if it is azeroLength feature, we need to // correct the start and end coords to what they were // in the original file @@ -888,7 +887,6 @@ public: start++; end--; } - //BED if (_isGff == false && _isVcf == false) { if (this->bedType == 3) { @@ -960,7 +958,6 @@ public: */ template <typename T> inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { - // if it is azeroLength feature, we need to // correct the start and end coords to what they were // in the original file -- GitLab