From 8ed770c9eddcad6f237e27a0d304e099d5d9f1f2 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Mon, 7 Feb 2011 10:53:06 -0500
Subject: [PATCH] Added -score option to mergeBed and allowed duplicate names
 for -nms option.

---
 src/mergeBed/mergeBed.cpp  | 197 ++++++++++++++++++++++++++++++-------
 src/mergeBed/mergeBed.h    |  23 +++--
 src/mergeBed/mergeMain.cpp |  53 +++++++---
 3 files changed, 214 insertions(+), 59 deletions(-)

diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp
index d0cbb7cb..a4c2ca09 100644
--- a/src/mergeBed/mergeBed.cpp
+++ b/src/mergeBed/mergeBed.cpp
@@ -14,33 +14,119 @@
 
 
 
-void ReportMergedNames(const map<string, bool> &names) {
-    unsigned int n = 0;
+void BedMerge::ReportMergedNames(const vector<string> &names) {
     printf("\t");
-    map<string, bool>::const_iterator nameItr = names.begin();
-    map<string, bool>::const_iterator nameEnd = names.end();
+    vector<string>::const_iterator nameItr = names.begin();
+    vector<string>::const_iterator nameEnd = names.end();
     for (; nameItr != nameEnd; ++nameItr) {
-        if (n < (names.size() - 1)) {
-            cout << nameItr->first << ";";
+        if (nameItr < (nameEnd - 1))
+            cout << *nameItr << ";";
+        else
+            cout << *nameItr;
+    }
+}
+
+void BedMerge::ReportMergedScores(const vector<string> &scores) {
+    printf("\t");
+    
+    // convert the scores to floats
+    vector<float> data;
+    for (size_t i = 0 ; i < scores.size() ; i++) {
+        data.push_back(atof(scores[i].c_str()));
+    }    
+
+    if (_scoreOp == "sum") {
+        printf("%.3f", accumulate(data.begin(), data.end(), 0.0));
+    }
+    else if (_scoreOp == "min") {
+        printf("%.3f", *min_element( data.begin(), data.end() ));
+    }
+    else if (_scoreOp == "max") {
+        printf("%.3f", *max_element( data.begin(), data.end() ));
+    }
+    else if (_scoreOp == "mean") {
+        double total = accumulate(data.begin(), data.end(), 0.0);
+        double mean = total / data.size();
+        printf("%.3f", mean);
+    }
+    else if (_scoreOp == "median") {
+        double median = 0.0;
+        sort(data.begin(), data.end());
+        int totalLines = data.size();
+        if ((totalLines % 2) > 0) {
+            long mid;
+            mid = totalLines / 2;
+            median = data[mid];
         }
         else {
-            cout << nameItr->first;
+            long midLow, midHigh;
+            midLow = (totalLines / 2) - 1;
+            midHigh = (totalLines / 2);
+            median = (data[midLow] + data[midHigh]) / 2.0;
+        }
+        printf("%.3f", median);
+    }
+    else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) {
+         // compute the frequency of each unique value
+         map<string, int> freqs;
+         vector<string>::const_iterator dIt  = scores.begin();
+         vector<string>::const_iterator dEnd = scores.end();
+         for (; dIt != dEnd; ++dIt) {
+             freqs[*dIt]++;
+         }
+
+         // grab the mode and the anti mode
+         string mode, antiMode;
+         int    count = 0;
+         int minCount = INT_MAX;
+         for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
+             if (iter->second > count) {
+                 mode = iter->first;
+                 count = iter->second;
+             }
+             if (iter->second < minCount) {
+                 antiMode = iter->first;
+                 minCount = iter->second;
+             }
+         }
+         // report
+         if (_scoreOp == "mode") {
+             printf("%s", mode.c_str());
+         }
+         else if (_scoreOp == "antimode") {
+             printf("%s", antiMode.c_str());
+         }
+     }
+     else if (_scoreOp == "collapse") {    
+        vector<string>::const_iterator scoreItr = scores.begin();
+        vector<string>::const_iterator scoreEnd = scores.end();
+        for (; scoreItr != scoreEnd; ++scoreItr) {
+            if (scoreItr < (scoreEnd - 1))
+                cout << *scoreItr << ";";
+            else
+                cout << *scoreItr;
         }
-        n++;
     }
 }
 
 // ===============
 // = Constructor =
 // ===============
-BedMerge::BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames) {
-
-    _bedFile = bedFile;
-    _numEntries = numEntries;
-    _maxDistance = maxDistance;
-    _forceStrand = forceStrand;
-    _reportNames = reportNames;
-
+BedMerge::BedMerge(string &bedFile, 
+                   bool numEntries, 
+                   int  maxDistance, 
+                   bool forceStrand, 
+                   bool reportNames, 
+                   bool reportScores,
+                   const string &scoreOp) :
+    _bedFile(bedFile),
+    _numEntries(numEntries),
+    _forceStrand(forceStrand),
+    _reportNames(reportNames),
+    _reportScores(reportScores),
+    _scoreOp(scoreOp),
+    _maxDistance(maxDistance)
+{
     _bed = new BedFile(bedFile);
 
     if (_forceStrand == false)
@@ -60,40 +146,73 @@ BedMerge::~BedMerge(void) {
 // ===============================================
 // Convenience method for reporting merged blocks
 // ================================================
-void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) {
+void BedMerge::Report(string chrom, int start, int end, 
+                      const vector<string> &names, const vector<string> &scores, int mergeCount) 
+{
     if (_bed->isZeroBased == false) {start++;}
     
     printf("%s\t%d\t%d", chrom.c_str(), start, end);
-    if (_numEntries == false && _reportNames == false) {
+    // just the merged intervals
+    if (_numEntries == false && _reportNames == false && _reportScores == false) {
         printf("\n");
     }
-    else if (_numEntries) {
+    // merged intervals and counts    
+    else if (_numEntries == true && _reportNames == false && _reportScores == false) {
         printf("\t%d\n", mergeCount);
     }
-    else if (_reportNames) {
+    // merged intervals and names        
+    else if (_numEntries == false && _reportNames == true && _reportScores == false) {
         ReportMergedNames(names);
         printf("\n");
     }
+    // merged intervals and scores        
+    else if (_numEntries == false && _reportNames == false && _reportScores == true) {
+        ReportMergedScores(scores);
+        printf("\n");
+    }
+    // merged intervals, names, and scores        
+    else if (_numEntries == false && _reportNames == true && _reportScores == true) {
+        ReportMergedNames(names);
+        ReportMergedScores(scores);
+        printf("\n");
+    }
 }
 
 
 // =========================================================
 // Convenience method for reporting merged blocks by strand
 // =========================================================
-void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) {
+void BedMerge::ReportStranded(string chrom, int start, int end, 
+                              const vector<string> &names, const vector<string> &scores,
+                              int mergeCount, string strand) 
+{
     if (_bed->isZeroBased == false) {start++;}
     
     printf("%s\t%d\t%d", chrom.c_str(), start, end);
-    if (_numEntries == false && _reportNames == false) {
+    // just the merged intervals
+    if (_numEntries == false && _reportNames == false && _reportScores == false) {
         printf("\t%s\n", strand.c_str());
     }
-    else if (_numEntries) {
+    // merged intervals and counts    
+    else if (_numEntries == true && _reportNames == false && _reportScores == false) {
         printf("\t%d\t%s\n", mergeCount, strand.c_str());
     }
-    else if (_reportNames) {
+    // merged intervals and names        
+    else if (_numEntries == false && _reportNames == true && _reportScores == false) {
         ReportMergedNames(names);
         printf("\t%s\n", strand.c_str());
     }
+    // merged intervals and scores        
+    else if (_numEntries == false && _reportNames == false && _reportScores == true) {
+        ReportMergedScores(scores);
+        printf("\t%s\n", strand.c_str());
+    }
+    // merged intervals, names, and scores        
+    else if (_numEntries == false && _reportNames == true && _reportScores == true) {
+        ReportMergedNames(names);
+        ReportMergedScores(scores);
+        printf("\t%s\n", strand.c_str());
+    }
 }
 
 
@@ -115,8 +234,9 @@ void BedMerge::MergeBed() {
         string chrom        = m->first;
         vector<BED> bedList = m->second;
         int mergeCount = 1;
-        map<string, bool> names;
-
+        vector<string> names;
+        vector<string> scores;
+        
         // merge overlapping features for this chromosome.
         int start = -1;
         int end   = -1;
@@ -126,24 +246,27 @@ void BedMerge::MergeBed() {
             // new block, no overlap
             if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
                 if (start >= 0) {
-                    Report(chrom, start, end, names, mergeCount);
+                    Report(chrom, start, end, names, scores, mergeCount);
                     // reset
                     mergeCount = 1;
                     names.clear();
+                    scores.clear();
                 }
                 start = bedItr->start;
                 end   = bedItr->end;
-                names[bedItr->name] = true;
+                names.push_back(bedItr->name);
+                scores.push_back(bedItr->score);
             }
             // same block, overlaps
             else {
                 if ((int) bedItr-> end > end) end = bedItr->end;
                 mergeCount++;
-                names[bedItr->name] = true;
+                names.push_back(bedItr->name);
+                scores.push_back(bedItr->score);
             }
         }
         if (start >= 0) {
-            Report(chrom, start, end, names, mergeCount);
+            Report(chrom, start, end, names, scores, mergeCount);
         }
     }
 }
@@ -177,7 +300,8 @@ void BedMerge::MergeBedStranded() {
 
             int mergeCount = 1;
             int numOnStrand = 0;
-            map<string, bool> names;
+            vector<string> names;
+            vector<string> scores;
 
             // merge overlapping features for this chromosome.
             int start = -1;
@@ -193,23 +317,26 @@ void BedMerge::MergeBedStranded() {
                 
                 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
                     if (start >= 0) {
-                        ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
+                        ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
                         // reset
                         mergeCount = 1;
                         names.clear();
+                        scores.clear();
                     }
                     start = bedItr->start;
                     end   = bedItr->end;
-                    names[bedItr->name] = true;
+                    names.push_back(bedItr->name);
+                    scores.push_back(bedItr->score);
                 }
                 else {
                     if ((int) bedItr-> end > end) end = bedItr->end;
                     mergeCount++;
-                    names[bedItr->name] = true;
+                    names.push_back(bedItr->name);
+                    scores.push_back(bedItr->score);
                 }
             }
             if (start >= 0) {
-                ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
+                ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
             }
         }
     }
diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h
index d0cb59f9..0d1ed46a 100644
--- a/src/mergeBed/mergeBed.h
+++ b/src/mergeBed/mergeBed.h
@@ -12,6 +12,7 @@
 #include "bedFile.h"
 #include <vector>
 #include <algorithm>
+#include <numeric>
 #include <iostream>
 #include <fstream>
 #include <limits.h>
@@ -19,7 +20,6 @@
 
 using namespace std;
 
-void ReportMergedNames(const map<string, bool> &names);
 
 //************************************************
 // Class methods and elements
@@ -29,7 +29,9 @@ class BedMerge {
 public:
 
   // constructor
-  BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames);
+  BedMerge(string &bedFile, bool numEntries, 
+           int maxDistance, bool forceStrand, 
+           bool reportNames, bool reportScores, const string &scoreOp);
 
   // destructor
   ~BedMerge(void);
@@ -40,13 +42,18 @@ public:
 private:
 
     string _bedFile;
-    bool _numEntries;
-    bool _forceStrand;
-    bool _reportNames;
-    int _maxDistance;
+    bool   _numEntries;
+    bool   _forceStrand;
+    bool   _reportNames;
+    bool   _reportScores;
+    string _scoreOp;
+    int    _maxDistance;
     // instance of a bed file class.
     BedFile *_bed;
 
-    void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount);
-    void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand);
+    void Report(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount);
+    void ReportStranded(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount, string strand);
+    void ReportMergedNames(const vector<string> &names);
+    void ReportMergedScores(const vector<string> &scores);
+    
 };
diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp
index c756e38a..fc435843 100644
--- a/src/mergeBed/mergeMain.cpp
+++ b/src/mergeBed/mergeMain.cpp
@@ -32,6 +32,7 @@ int main(int argc, char* argv[]) {
     // input files
     string bedFile  = "stdin";
     int maxDistance = 0;
+    string scoreOp  = "";
 
     // input arguments
     bool haveBed         = true;
@@ -39,6 +40,7 @@ int main(int argc, char* argv[]) {
     bool haveMaxDistance = false;
     bool forceStrand     = false;
     bool reportNames     = false;
+    bool reportScores    = false;
 
     for(int i = 1; i < argc; i++) {
         int parameterLength = (int)strlen(argv[i]);
@@ -78,6 +80,13 @@ int main(int argc, char* argv[]) {
         else if (PARAMETER_CHECK("-nms", 4, parameterLength)) {
             reportNames = true;
         }
+        else if (PARAMETER_CHECK("-scores", 7, parameterLength)) {
+            reportScores = true;
+            if ((i+1) < argc) {
+                scoreOp      = argv[i + 1];
+                i++;
+            }
+        }
         else {
             cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
             showHelp = true;
@@ -93,9 +102,15 @@ int main(int argc, char* argv[]) {
         cerr << endl << "*****" << endl << "*****ERROR: Request either -n OR -nms, not both." << endl << "*****" << endl;
         showHelp = true;
     }
+    if ((reportScores == true) && (scoreOp != "sum")  && (scoreOp != "max")    && (scoreOp != "min") && (scoreOp != "mean") &&
+        (scoreOp != "mode") && (scoreOp != "median") && (scoreOp != "antimode") && (scoreOp != "collapse")) 
+    {
+        cerr << endl << "*****" << endl << "*****ERROR: Invalid scoreOp selection \"" << scoreOp << endl << "\"  *****" << endl;
+        showHelp = true;
+    }
 
     if (!showHelp) {
-        BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames);
+        BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames, reportScores, scoreOp);
         delete bm;
         return 0;
     }
@@ -115,21 +130,27 @@ void ShowHelp(void) {
     cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl;
 
     cerr << "Options: " << endl;
-    cerr << "\t-s\t"            << "Force strandedness.  That is, only merge features" << endl;
-    cerr                        << "\t\tthat are the same strand." << endl;
-    cerr                        << "\t\t- By default, merging is done without respect to strand." << endl << endl;
-
-    cerr << "\t-n\t"            << "Report the number of BED entries that were merged." << endl;
-    cerr                        << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl;
-
-
-    cerr << "\t-d\t"            << "Maximum distance between features allowed for features" << endl;
-    cerr                        << "\t\tto be merged." << endl;
-    cerr                        << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
-    cerr                        << "\t\t- (INTEGER)" << endl << endl;
-
-    cerr << "\t-nms\t"          << "Report the names of the merged features separated by semicolons." << endl << endl;
-
+    cerr << "\t-s\t"                     << "Force strandedness.  That is, only merge features" << endl;
+    cerr                                 << "\t\tthat are the same strand." << endl;
+    cerr                                 << "\t\t- By default, merging is done without respect to strand." << endl << endl;
+
+    cerr << "\t-n\t"                     << "Report the number of BED entries that were merged." << endl;
+    cerr                                 << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl;
+
+
+    cerr << "\t-d\t"                     << "Maximum distance between features allowed for features" << endl;
+    cerr                                 << "\t\tto be merged." << endl;
+    cerr                                 << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
+    cerr                                 << "\t\t- (INTEGER)" << endl << endl;
+
+    cerr << "\t-nms\t"                   << "Report the names of the merged features separated by semicolons." << endl << endl;
+    
+    cerr << "\t-scores [STRING]\t"       << "Report the scores of the merged features. Specify one of " << endl;
+    cerr                                 << "\t\tthe following options for reporting scores:" << endl;
+    cerr                                 << "\t\t\t    sum, min, max," << endl;
+    cerr                                 << "\t\t\t    mean, median, mode, antimode," << endl;
+    cerr                                 << "\t\t\t    collapse (i.e., print a semicolon-separated list)," << endl << endl;
+    
 
     // end the program here
     exit(1);
-- 
GitLab