From cee8949d76c05d64bad4561a09ad2befb4a80272 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Thu, 26 Jan 2012 13:10:46 -0500
Subject: [PATCH] refactored intersect; better -bed support for split BAM
 alignments

---
 src/intersectBed/Makefile                     |   4 +-
 src/intersectBed/intersectBed.cpp             | 164 +++++++-----------
 src/intersectBed/intersectBed.h               |   8 +-
 src/utils/BamTools-Ancillary/BamAncillary.cpp |  39 +++++
 src/utils/BamTools-Ancillary/BamAncillary.h   |   3 +
 .../BlockedIntervals/BlockedIntervals.cpp     |   2 +-
 src/utils/BlockedIntervals/BlockedIntervals.h |   2 +-
 src/utils/bedFile/bedFile.h                   |   3 -
 8 files changed, 117 insertions(+), 108 deletions(-)

diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile
index c96a433e..1eef2c0a 100644
--- a/src/intersectBed/Makefile
+++ b/src/intersectBed/Makefile
@@ -12,6 +12,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
            -I$(UTILITIES_DIR)/fileType/ \
            -I$(UTILITIES_DIR)/BamTools/include \
            -I$(UTILITIES_DIR)/BlockedIntervals \
+           -I$(UTILITIES_DIR)/BamTools-Ancillary \
            -I$(UTILITIES_DIR)/chromsweep \
            -I$(UTILITIES_DIR)/version/
 
@@ -20,7 +21,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
 # ----------------------------------
 SOURCES= intersectMain.cpp intersectBed.cpp
 OBJECTS= $(SOURCES:.cpp=.o)
-_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o gzstream.o fileType.o chromsweep.o
+_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o BamAncillary.o gzstream.o fileType.o chromsweep.o
 EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
 BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
 PROGRAM= intersectBed
@@ -38,6 +39,7 @@ $(EXT_OBJECTS):
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/
+	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
 	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/
diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp
index 42093c35..39ed3e31 100644
--- a/src/intersectBed/intersectBed.cpp
+++ b/src/intersectBed/intersectBed.cpp
@@ -16,7 +16,6 @@
 Helper functions
 ************************************/
 bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
-
     bool hitsFound   = false;
     if (_printable == true) {
         vector<BED>::const_iterator h       = hits.begin();
@@ -96,17 +95,8 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
 }
 
 
-bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) {
-    
-    // find the hits that overlap with the full span of the blocked BED
-    _bedB->allHits(a.chrom, a.start, a.end, a.strand,
-                   hits, _sameStrand, _diffStrand,
-                   _overlapFraction, _reciprocal);
-    
-    // break a into discrete blocks, as we need to 
-    // measure overlap with the individual blocks, not the full span.
-    bedVector a_blocks; 
-    GetBedBlocks(a, a_blocks);
+bool BedIntersect::FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, 
+                                       const vector<BED> &hits, bool a_is_bam) {
     int a_footprint = GetTotalBlockLength(a_blocks);
     // container to store the set of raw hits 
     // that actually overlap the A blocks
@@ -159,7 +149,11 @@ bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) {
             }
         }
     }
-    return processHits(a, valid_hits);
+    if (!a_is_bam) {
+        return processHits(a, valid_hits);
+    }
+    else
+        return !valid_hits.empty();
 }
 
 
@@ -242,8 +236,19 @@ void BedIntersect::IntersectBed() {
                 if (_obeySplits == false)
                     FindOverlaps(a, hits);
                 // split the BED12 into blocks and look for overlaps in each discrete block
-                else
-                    FindBlockedOverlaps(a, hits);
+                else {
+                    // find the hits that overlap with the full span of the blocked BED
+                    _bedB->allHits(a.chrom, a.start, a.end, a.strand,
+                                   hits, _sameStrand, _diffStrand,
+                                   _overlapFraction, _reciprocal);
+                    // break a into discrete blocks, as we need to 
+                    // measure overlap with the individual blocks, not the full span.
+                    bedVector a_blocks; 
+                    GetBedBlocks(a, a_blocks);
+                    // find the overlaps between the block in A and B 
+                    // last parm is false as a is not a BAM entry
+                    FindBlockedOverlaps(a, a_blocks, hits, false);
+                }
                 hits.clear();
             }
         }
@@ -261,8 +266,11 @@ void BedIntersect::IntersectBed() {
         while (sweep.Next(hit_set)) {
             if (_obeySplits == false)
                 processHits(hit_set.first, hit_set.second);
-            else
-                FindBlockedOverlaps(hit_set.first, hit_set.second);
+            else {
+                bedVector a_blocks; 
+                GetBedBlocks(hit_set.first, a_blocks);
+                FindBlockedOverlaps(hit_set.first, a_blocks, hit_set.second, false);
+            }
         }
     }
 }
@@ -279,19 +287,15 @@ void BedIntersect::IntersectBam(string bamFile) {
     // using BAM output.
     if (_bamOutput == false) {
         _bedA = new BedFile(_bedAFile);
-        _bedA->bedType = 6;
+        _bedA->bedType = 12;
     }
-
     // open the BAM file
     BamReader reader;
     BamWriter writer;
-    
     reader.Open(bamFile);
-
     // get header & reference information
     string bamHeader  = reader.GetHeaderText();
     RefVector refs    = reader.GetReferenceData();
-
     // open a BAM output to stdout if we are writing BAM
     if (_bamOutput == true) {
         // set compression mode
@@ -301,93 +305,55 @@ void BedIntersect::IntersectBam(string bamFile) {
         // open our BAM writer
         writer.Open("stdout", bamHeader, refs);
     }
-
     vector<BED> hits;
     // reserve some space
     hits.reserve(100);
-
-    
     BamAlignment bam;    
     // get each set of alignments for each pair.
     while (reader.GetNextAlignment(bam)) {
 
-        if (bam.IsMapped()) {
-            BED a;
-            a.chrom = refs.at(bam.RefID).RefName;
-            a.start = bam.Position;
-            a.end   = bam.GetEndPosition(false, false);
-
-            // build the name field from the BAM alignment.
-            a.name = bam.Name;
-            if (bam.IsFirstMate()) a.name += "/1";
-            if (bam.IsSecondMate()) a.name += "/2";
-
-            a.score  = ToString(bam.MapQuality);
-
-            a.strand = "+";
-            if (bam.IsReverseStrand()) a.strand = "-";
-
-            if (_bamOutput == true) {
-                bool overlapsFound = false;
-                // treat the BAM alignment as a single "block"
-                if (_obeySplits == false) {
-                    overlapsFound = _bedB->anyHits(a.chrom, a.start, a.end, 
-                                                   a.strand, _sameStrand, _diffStrand,
-                                                   _overlapFraction, _reciprocal);
-                }
-                // split the BAM alignment into discrete blocks and
-                // look for overlaps only within each block.
-                else {
-                    bool overlapFoundForBlock;
-                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
-                    // we don't want to split on "D" ops, hence the "false"
-                    GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks);
-
-                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
-                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
-                    for (; bedItr != bedEnd; ++bedItr) {
-                        overlapFoundForBlock = _bedB->anyHits(bedItr->chrom, bedItr->start, bedItr->end, 
-                                                              bedItr->strand, _sameStrand, _diffStrand,
-                                                              _overlapFraction, _reciprocal);
-                        if (overlapFoundForBlock == true)
-                            overlapsFound = true;
-                    }
-                }
-                if (overlapsFound == true) {
-                    if (_noHit == false)
-                        writer.SaveAlignment(bam);
-                }
-                else {
-                    if (_noHit == true) {
-                        writer.SaveAlignment(bam);
-                    }
-                }
-            }
-            else {
-                // treat the BAM alignment as a single BED "block"
-                if (_obeySplits == false) {
-                    FindOverlaps(a, hits);
-                    hits.clear();
-                }
-                // split the BAM alignment into discrete BED blocks and
-                // look for overlaps only within each block.
-                else {
-                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
-                    GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks);
-
-                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
-                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
-                    for (; bedItr != bedEnd; ++bedItr) {
-                        FindOverlaps(*bedItr, hits);
-                        hits.clear();
-                    }
-                }
-            }
-        }
         // BAM IsMapped() is false
-        else if (_noHit == true) {
+        if ((!bam.IsMapped()) && (_noHit == true))
             writer.SaveAlignment(bam);
+        else if (!bam.IsMapped())
+            continue;
+
+        // break alignment into discrete blocks,
+        bedVector bed_blocks;
+        string chrom = refs.at(bam.RefID).RefName;
+        GetBamBlocks(bam, chrom, bed_blocks);
+        // create a basic BED entry from the BAM alignment
+        BED bed;
+        MakeBedFromBam(bam, chrom, bed_blocks, bed);
+        bool overlapsFound = false;
+        if ((_bamOutput == true) && (_obeySplits == false)) 
+        {
+            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, 
+                                           bed.strand, _sameStrand, _diffStrand,
+                                           _overlapFraction, _reciprocal);
+        }
+        else if ( ((_bamOutput == true)  && (_obeySplits == true)) ||
+                  ((_bamOutput == false) && (_obeySplits == true)) )
+        {
+            // find the hits that overlap with the full span of the blocked BED
+            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
+                           hits, _sameStrand, _diffStrand,
+                           _overlapFraction, _reciprocal);
+            // find the overlaps between the block in A and B
+            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
+        }
+        else if ((_bamOutput == false) && (_obeySplits == false))
+        {
+            FindOverlaps(bed, hits);
+        }
+        // save the BAM alignment if overlap reqs. were met
+        if (_bamOutput == true) {
+            if ((overlapsFound == true) && (_noHit == false))
+                writer.SaveAlignment(bam);
+            else if ((overlapsFound == false) && (_noHit == true))
+                writer.SaveAlignment(bam);
         }
+        hits.clear();
     }
 
     // close the relevant BAM files.
diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h
index 221233cd..84c7ebb1 100644
--- a/src/intersectBed/intersectBed.h
+++ b/src/intersectBed/intersectBed.h
@@ -18,6 +18,7 @@
 #include "api/BamWriter.h"
 #include "api/BamAux.h"
 #include "BlockedIntervals.h"
+#include "BamAncillary.h"
 using namespace BamTools;
 
 
@@ -87,11 +88,12 @@ private:
     bool processHits(const BED &a, const vector<BED> &hits);
 
     bool FindOverlaps(const BED &a, vector<BED> &hits);
-    
-    bool FindBlockedOverlaps(const BED &a, vector<BED> &hits);
+
+    bool FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks, 
+        const vector<BED> &hits, bool a_is_bam);
 
     void ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e);
-    
+
     void ReportOverlapSummary(const BED &a, const int &numOverlapsFound);
 
 };
diff --git a/src/utils/BamTools-Ancillary/BamAncillary.cpp b/src/utils/BamTools-Ancillary/BamAncillary.cpp
index 09891e96..e59a6f2d 100644
--- a/src/utils/BamTools-Ancillary/BamAncillary.cpp
+++ b/src/utils/BamTools-Ancillary/BamAncillary.cpp
@@ -73,4 +73,43 @@ namespace BamTools {
             blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) );
         }
     }
+    
+    void MakeBedFromBam(const BamAlignment &bam, const string &chrom,
+                        const bedVector &blocks, BED &bed) 
+    {
+        bed.chrom = chrom;
+        bed.start = bam.Position;
+        bed.end   = bam.GetEndPosition(false, false);
+        // build the name field from the BAM alignment.
+        bed.name = bam.Name;
+        if (bam.IsFirstMate()) bed.name += "/1";
+        else if (bam.IsSecondMate()) bed.name += "/2";
+        bed.score  = ToString(bam.MapQuality);
+        bed.strand = "+";
+        if (bam.IsReverseStrand()) bed.strand = "-";
+        
+        bed.fields.push_back(bed.chrom);
+        bed.fields.push_back(ToString(bed.start));
+        bed.fields.push_back(ToString(bed.end));
+        bed.fields.push_back(bed.name);
+        bed.fields.push_back(bed.score);
+        bed.fields.push_back(bed.strand);
+        bed.fields.push_back(ToString(bam.Position));
+        bed.fields.push_back(ToString(bed.end));
+        bed.fields.push_back("0,0,0");
+        bed.fields.push_back(ToString(blocks.size()));
+        
+        ostringstream block_lengths, block_starts;
+        for (size_t i = 0; i < blocks.size(); ++i) {
+            block_lengths << blocks[i].end   - blocks[i].start << ",";
+            block_starts  << blocks[i].start - bam.Position << ",";
+        }
+        bed.fields.push_back(block_lengths.str());
+        bed.fields.push_back(block_starts.str());
+        
+        // identify which indices relate to the "other" 
+        // (i,e. non-BED6) fields
+        for (size_t i = 5; i <= 11; ++i)
+            bed.other_idxs.push_back(i);
+    }
 }
diff --git a/src/utils/BamTools-Ancillary/BamAncillary.h b/src/utils/BamTools-Ancillary/BamAncillary.h
index ed544e25..07cc0678 100644
--- a/src/utils/BamTools-Ancillary/BamAncillary.h
+++ b/src/utils/BamTools-Ancillary/BamAncillary.h
@@ -16,4 +16,7 @@
 namespace BamTools {
     void getBamBlocks(const BamAlignment &bam, const RefVector &refs,
                         vector<BED> &blocks, bool includeDeletions = true);
+
+    void MakeBedFromBam(const BamAlignment &bam, const string &chrom,
+        const bedVector &blocks, BED &bed);
 }
diff --git a/src/utils/BlockedIntervals/BlockedIntervals.cpp b/src/utils/BlockedIntervals/BlockedIntervals.cpp
index da4c5341..eead4ca7 100644
--- a/src/utils/BlockedIntervals/BlockedIntervals.cpp
+++ b/src/utils/BlockedIntervals/BlockedIntervals.cpp
@@ -14,7 +14,7 @@
 
 void GetBamBlocks(const BamAlignment &bam,
                   const string &chrom,
-                  vector<BED> &bedBlocks,
+                  bedVector &bedBlocks,
                   bool breakOnDeletionOps)
 {
     vector<int> starts;
diff --git a/src/utils/BlockedIntervals/BlockedIntervals.h b/src/utils/BlockedIntervals/BlockedIntervals.h
index 60556e6e..5a3a5c2c 100644
--- a/src/utils/BlockedIntervals/BlockedIntervals.h
+++ b/src/utils/BlockedIntervals/BlockedIntervals.h
@@ -22,7 +22,7 @@ using namespace BamTools;
 */
 void GetBamBlocks(const BamAlignment &bam,
                   const string &chrom,
-                  vector<BED> &bedBlocks,
+                  bedVector &bedBlocks,
                   bool breakOnDeletionOps = false);
 
 /* break a BED12 record into discrete BED6 blocks. */
diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h
index 2a6016b8..d2b323fd 100644
--- a/src/utils/bedFile/bedFile.h
+++ b/src/utils/bedFile/bedFile.h
@@ -798,7 +798,6 @@ public:
     */
     template <typename T>
     inline void reportBedTab(const T &bed) {
-        
         // if it is azeroLength feature, we need to
         // correct the start and end coords to what they were
         // in the original file
@@ -888,7 +887,6 @@ public:
                 start++;
             end--;
         }
-        
         //BED
         if (_isGff == false && _isVcf == false) {
             if (this->bedType == 3) {
@@ -960,7 +958,6 @@ public:
     */
     template <typename T>
     inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
-        
         // if it is azeroLength feature, we need to
         // correct the start and end coords to what they were
         // in the original file
-- 
GitLab