From 025efbd175e213e99bc85754a578474b89fa3225 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Mon, 7 Nov 2011 19:54:58 -0500
Subject: [PATCH] Added r. layer's cluster algorithm -cluster

---
 src/multiIntersectBed/intervalItem.h          |  17 +-
 src/multiIntersectBed/multiIntersectBed.cpp   | 191 ++++++++++--------
 src/multiIntersectBed/multiIntersectBed.h     |  16 +-
 .../multiIntersectBedMain.cpp                 |  11 +-
 4 files changed, 135 insertions(+), 100 deletions(-)

diff --git a/src/multiIntersectBed/intervalItem.h b/src/multiIntersectBed/intervalItem.h
index a63e0ce4..54b15d6d 100644
--- a/src/multiIntersectBed/intervalItem.h
+++ b/src/multiIntersectBed/intervalItem.h
@@ -39,7 +39,7 @@ public:
        coord_type(START),
        coord(0)
     {}
-    
+
     IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) :
         source_index(_index),
         coord_type(_type),
@@ -54,7 +54,20 @@ public:
 
     bool operator< ( const IntervalItem& other ) const
     {
-        return this->coord > other.coord;
+        if (this->coord > other.coord) {
+            return true;
+        }
+        else if (this->coord < other.coord) {
+            return false;
+        }
+        else {
+            if (this->coord_type == END) {
+                return false;
+            }
+            else {
+                return true;
+            }
+        }
     }
 };
 
diff --git a/src/multiIntersectBed/multiIntersectBed.cpp b/src/multiIntersectBed/multiIntersectBed.cpp
index e4eb5e10..8058af56 100644
--- a/src/multiIntersectBed/multiIntersectBed.cpp
+++ b/src/multiIntersectBed/multiIntersectBed.cpp
@@ -58,84 +58,6 @@ MultiIntersectBed::~MultiIntersectBed() {
 }
 
 
-void MultiIntersectBed::MultiIntersect() {
-    OpenFiles();
-
-    // Add the first interval from each file
-    for(size_t i = 0;i < input_files.size(); ++i)
-        LoadNextItem(i);
-
-    // Chromosome loop - once per chromosome
-    do {
-        // Find the first chromosome to use
-        current_chrom = DetermineNextChrom();
-
-        // Populate the queue with initial values from all files
-        // (if they belong to the correct chromosome)
-        for(size_t i = 0; i < input_files.size(); ++i)
-            AddInterval(i);
-
-        CHRPOS current_start = ConsumeNextCoordinate();
-
-        // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
-        if (print_empty_regions && current_start > 0)
-            PrintEmptyCoverage(0,current_start);
-
-        // Intervals loop - until all intervals (of current chromosome) from all files are used.
-        do {
-            CHRPOS current_end = queue.top().coord;
-            PrintCoverage(current_start, current_end);
-            current_start = ConsumeNextCoordinate();
-        } while (!queue.empty());
-
-        // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
-            // print a dummy empty coverage
-        if (print_empty_regions) {
-            CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
-            if (current_start < chrom_size)
-                PrintEmptyCoverage(current_start, chrom_size);
-        }
-
-    } while (!AllFilesDone());
-}
-
-
-CHRPOS MultiIntersectBed::ConsumeNextCoordinate() {
-    assert(!queue.empty());
-
-    CHRPOS new_position = queue.top().coord;
-    do {
-        IntervalItem item = queue.top();
-        UpdateInformation(item);
-        queue.pop();
-    } while (!queue.empty() && queue.top().coord == new_position);
-
-    return new_position;
-}
-
-
-void MultiIntersectBed::UpdateInformation(const IntervalItem &item) {
-    // Update the depth coverage for this file
-
-    // Which coordinate is it - start or end?
-    switch (item.coord_type)
-    {
-    case START:
-        current_depth[item.source_index] = 1;
-        current_non_zero_inputs++;
-        break;
-    case END:
-        //Read the next interval from this file
-        AddInterval(item.source_index);
-        current_depth[item.source_index] = 0;
-        current_non_zero_inputs--;
-        break;
-    default:
-        assert(0);
-    }
-}
-
-
 void MultiIntersectBed::AddInterval(int index) {
     assert(static_cast<unsigned int>(index) < input_files.size());
 
@@ -234,12 +156,6 @@ void MultiIntersectBed::LoadNextItem(int index) {
     BedFile *file = input_files[index];
     BED merged_bed;
     int lineNum = 0;
-    //
-    // TO DO: Do the mergeing on the fly.  How best to do this?
-    // 
-    // IDEA: Implement a Merge class with GetNextMerge element.
-    //
-
     while (file->GetNextMergedBed(merged_bed, lineNum))
     {
         current_item[index] = merged_bed;
@@ -291,3 +207,110 @@ void MultiIntersectBed::CloseFiles() {
     }
     input_files.clear();
 }
+
+
+void MultiIntersectBed::MultiIntersect() {
+    OpenFiles();
+
+    // Add the first interval from each file
+    for(size_t i = 0;i < input_files.size(); ++i)
+        LoadNextItem(i);
+
+    // Chromosome loop - once per chromosome
+    IntervalItem curr_point;
+    do {
+        // Find the first chromosome to use
+        current_chrom = DetermineNextChrom();
+
+        // Populate the queue with initial values from all files
+        // (if they belong to the correct chromosome)
+        for(size_t i = 0; i < input_files.size(); ++i)
+            AddInterval(i);
+
+        CHRPOS prev_start    = 0;
+        do {
+            // get the current point in the queue
+            curr_point    = queue.top();
+            // if we have moved, report the interval
+            if (curr_point.coord > prev_start) {
+                PrintCoverage(prev_start, curr_point.coord);
+            }
+            // update depending on the type of interval we encountered.
+            if (curr_point.coord_type == START) {
+                current_depth[curr_point.source_index] = 1;
+                current_non_zero_inputs++;
+            }
+            else {
+                //Read the next interval from this file
+                AddInterval(curr_point.source_index);
+                current_depth[curr_point.source_index] = 0;
+                current_non_zero_inputs--;
+            }
+            // reset for the next point
+            prev_start = curr_point.coord;
+            queue.pop();
+        } while (!queue.empty());
+
+        // want empty regions, and the last coordinate is not the last coordinate of the chromosome
+        if (print_empty_regions) {
+            CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
+            if (curr_point.coord < chrom_size)
+                PrintEmptyCoverage(curr_point.coord, chrom_size);
+        }
+    } while (!AllFilesDone());
+}
+
+
+void MultiIntersectBed::Cluster() {
+    OpenFiles();
+
+    // Add the first interval from each file
+    for(size_t i = 0;i < input_files.size(); ++i)
+        LoadNextItem(i);
+
+    // Chromosome loop - once per chromosome
+    IntervalItem curr_point;
+    short last_direction = 0;
+    do {
+        // Find the first chromosome to use
+        current_chrom = DetermineNextChrom();
+
+        // Populate the queue with initial values from all files
+        // (if they belong to the correct chromosome)
+        for(size_t i = 0; i < input_files.size(); ++i)
+            AddInterval(i);
+
+        CHRPOS prev_start = 0;
+        do {
+            // get the current point in the queue
+            curr_point = queue.top();
+            if (curr_point.coord_type == START) {
+                current_depth[curr_point.source_index] = 1;
+                current_non_zero_inputs++;
+                // reset for the next point
+                prev_start = curr_point.coord;
+                last_direction = 1;
+            }
+            else {
+                if (last_direction == 1) {
+                    PrintCoverage(prev_start, curr_point.coord);
+                }
+                //Read the next interval from this file
+                AddInterval(curr_point.source_index);
+                current_depth[curr_point.source_index] = 0;
+                current_non_zero_inputs--;
+                last_direction = -1;
+            }
+            queue.pop();
+        } while (!queue.empty());
+
+        // want empty regions, and the last coordinate is not the last coordinate of the chromosome
+        if (print_empty_regions) {
+            CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
+            if (curr_point.coord < chrom_size)
+                PrintEmptyCoverage(curr_point.coord, chrom_size);
+        }
+    } while (!AllFilesDone());
+}
+
+
diff --git a/src/multiIntersectBed/multiIntersectBed.h b/src/multiIntersectBed/multiIntersectBed.h
index 1e22954e..b8497f84 100644
--- a/src/multiIntersectBed/multiIntersectBed.h
+++ b/src/multiIntersectBed/multiIntersectBed.h
@@ -56,6 +56,9 @@ public:
 
     // Combines all interval files
     void MultiIntersect();
+    
+    // 
+    void Cluster();
 
     // Print the header line: chrom/start/end + name of each bedgraph file.
     void PrintHeader();
@@ -95,19 +98,6 @@ private:
     */
     bool        AllFilesDone();
 
-    /*
-       Extract the next coordinate from the queue, and updates the current coverage information.
-       If multiple interval share the same coordinate values, all of them are handled.
-       If an END coordinate is consumed, the next interval (from the corresponding file) is read.
-     */
-    CHRPOS ConsumeNextCoordinate();
-
-    /*
-       Updates the coverage information based on the given item.
-       Item can be a START coordinate or an END coordiante.
-     */
-    void UpdateInformation(const IntervalItem &item);
-
     /*
        prints chrom/start/end and the current depth coverage values of all the files.
      */
diff --git a/src/multiIntersectBed/multiIntersectBedMain.cpp b/src/multiIntersectBed/multiIntersectBedMain.cpp
index 5248315e..5b4b627b 100644
--- a/src/multiIntersectBed/multiIntersectBedMain.cpp
+++ b/src/multiIntersectBed/multiIntersectBedMain.cpp
@@ -49,6 +49,7 @@ int main(int argc, char* argv[])
     bool haveFiller        = true;
     bool printHeader       = false;
     bool printEmptyRegions = false;
+    bool cluster           = false;
     bool showHelp          = false;
     string genomeFile;
     string basePath;
@@ -127,6 +128,9 @@ int main(int argc, char* argv[])
         else if(PARAMETER_CHECK("-empty", 6, parameterLength)) {
             printEmptyRegions = true;
         }
+        else if(PARAMETER_CHECK("-cluster", 8, parameterLength)) {
+            cluster = true;
+        }
         else if(PARAMETER_CHECK("-examples", 9, parameterLength)) {
             ShowHelp();
             ShowExamples();
@@ -155,7 +159,10 @@ int main(int argc, char* argv[])
     MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue);
     if (printHeader)
         mbi.PrintHeader();
-    mbi.MultiIntersect();
+    if (!cluster)
+        mbi.MultiIntersect();
+    else
+        mbi.Cluster();
 }
 
 void ShowHelp(void) {
@@ -173,6 +180,8 @@ void ShowHelp(void) {
 
     cerr << "Options: " << endl;
 
+    cerr << "\t-cluster\t\t"     << "Invoke Ryan's algorithm." << endl << endl;
+
     cerr << "\t-header\t\t"     << "Print a header line." << endl;
     cerr                        << "\t\t\t(chrom/start/end + names of each file)." << endl << endl;
 
-- 
GitLab