From 65e377d29c35579a13f39f478aa44d9f3c3c8b9e Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Tue, 29 Jan 2013 08:16:03 -0500
Subject: [PATCH] [ENH] add -delim option to bedtools merge

---
 docs/content/tools/genomecov.rst  |  2 +-
 docs/content/tools/merge.rst      | 39 ++++++++++++++++++++++++++-----
 src/mergeBed/mergeBed.cpp         | 14 +++++++----
 src/mergeBed/mergeBed.h           | 16 ++++++++++---
 src/mergeBed/mergeMain.cpp        | 36 ++++++++++++++++++++++++----
 src/utils/VectorOps/VectorOps.cpp |  4 ++--
 src/utils/VectorOps/VectorOps.h   |  4 ++--
 test/merge/test-merge.sh          | 28 ++++++++++++++++++----
 8 files changed, 114 insertions(+), 29 deletions(-)

diff --git a/docs/content/tools/genomecov.rst b/docs/content/tools/genomecov.rst
index c8e86566..e0c73a6e 100755
--- a/docs/content/tools/genomecov.rst
+++ b/docs/content/tools/genomecov.rst
@@ -35,7 +35,7 @@ Usage and option summary
 **(or)**:
 ::
   
-  genomeCovergaBed [OPTIONS] [-i|-ibam] -g (iff. -i)
+  genomeCoverageBed [OPTIONS] [-i|-ibam] -g (iff. -i)
 
 
 
diff --git a/docs/content/tools/merge.rst b/docs/content/tools/merge.rst
index 1b9eb767..c8c17a4a 100755
--- a/docs/content/tools/merge.rst
+++ b/docs/content/tools/merge.rst
@@ -45,12 +45,15 @@ Option                           Description
 **-s**				             Force strandedness. That is, only merge features that are the same strand. *By default, this is disabled*.
 **-n**					         Report the number of BED entries that were merged. *1 is reported if no merging occurred*.
 **-d**                           Maximum distance between features allowed for features to be merged. *Default is 0. That is, overlapping and/or book-ended features are merged*.
-**-nms**                         Report the names of the merged features separated by semicolons.
+**-nms**                         Report the names of the merged features separated by commas. Change the delimiter with ``-delim``.
 **-scores**	                     | Report the scores of the merged features. Specify one of 
-		                         | the following options for reporting scores:
-		                         | sum, min, max,
-		                         | mean, median, mode, antimode,
-		                         | collapse (i.e., print a semicolon-separated list)
+                                 | the following options for reporting scores:
+                                 | sum, min, max,
+                                 | mean, median, mode, antimode,
+                                 | collapse (i.e., print a comma-separated list; change the delimiter with ``-delim``)
+**-delim**                       | Specify a custom delimiter for the ``-nms`` and ``-scores collapse`` options.
+                                 | Example: ``-delim "|"``
+                                 | ``Default: ","``
 ===========================      ===============================================================================================================================================================================================================
 
 
@@ -156,7 +159,7 @@ merged features.
   chr1  250  500  A3
  
   $ bedtools merge -i A.bed -nms
-  chr1  100  500  A1;A2;A3
+  chr1  100  500  A1,A2,A3
   
 
 ==========================================================================
@@ -183,5 +186,29 @@ how the scores from each overlapping interval should be reported.
   chr1  100  500  1,2,3
   
   
+==========================================================================
+``-delim`` Change the delimiter for ``-nms`` and ``-scores collapse``
+==========================================================================
+One can override the use of a comma as the delimiter for the ``-nms`` and
+``-scores collapse`` options via the use of the ``-delim`` option.
+
+.. code-block:: bash
+
+  $ cat A.bed
+  chr1  100  200  A1
+  chr1  150  300  A2
+  chr1  250  500  A3
+
+Compare:
+ 
+.. code-block:: bash
+
+  $ bedtools merge -i A.bed -nms
+  chr1  100  500  A1,A2,A3
   
+to:
+
+.. code-block:: bash
 
+  $ bedtools merge -i A.bed -nms -delim "|"
+  chr1  100  500  A1|A2|A3
diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp
index 0c1bb27d..e556e44b 100644
--- a/src/mergeBed/mergeBed.cpp
+++ b/src/mergeBed/mergeBed.cpp
@@ -21,7 +21,7 @@ void BedMerge::ReportMergedNames(const vector<string> &names) {
         vector<string>::const_iterator nameEnd = names.end();
         for (; nameItr != nameEnd; ++nameItr) {
             if (nameItr < (nameEnd - 1))
-                cout << *nameItr << ";";
+                cout << *nameItr << _delimiter;
             else
                 cout << *nameItr;
         }
@@ -29,7 +29,9 @@ void BedMerge::ReportMergedNames(const vector<string> &names) {
     else {
         cerr << endl 
              << "*****" << endl 
-             << "*****ERROR: No names found to report for the -names option. Exiting." << endl 
+             << "*****ERROR: "
+             << "No names found to report for the -names option. Exiting." 
+             << endl 
              << "*****" << endl;
         exit(1);
     }
@@ -58,7 +60,7 @@ void BedMerge::ReportMergedScores(const vector<string> &scores) {
         else if (_scoreOp == "antimode")
             buffer << setprecision (PRECISION) << vo.GetAntiMode();
         else if (_scoreOp == "collapse")
-            buffer << setprecision (PRECISION) << vo.GetCollapse();
+            buffer << setprecision (PRECISION) << vo.GetCollapse(_delimiter);
         cout << "\t" << buffer.str();
     }
     else {        
@@ -79,14 +81,16 @@ BedMerge::BedMerge(string &bedFile,
                    bool forceStrand, 
                    bool reportNames, 
                    bool reportScores,
-                   const string &scoreOp) :
+                   const string &scoreOp,
+                   const string &delimiter) :
     _bedFile(bedFile),
     _numEntries(numEntries),
     _forceStrand(forceStrand),
     _reportNames(reportNames),
     _reportScores(reportScores),
     _scoreOp(scoreOp),
-    _maxDistance(maxDistance)
+    _maxDistance(maxDistance),
+    _delimiter(delimiter)
 {
     _bed = new BedFile(bedFile);
 
diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h
index 52eb7cb1..d9b0c143 100644
--- a/src/mergeBed/mergeBed.h
+++ b/src/mergeBed/mergeBed.h
@@ -34,7 +34,8 @@ public:
   // constructor
   BedMerge(string &bedFile, bool numEntries, 
            int maxDistance, bool forceStrand, 
-           bool reportNames, bool reportScores, const string &scoreOp);
+           bool reportNames, bool reportScores, 
+           const string &scoreOp, const string &delimiter);
 
   // destructor
   ~BedMerge(void);
@@ -51,11 +52,20 @@ private:
     bool   _reportScores;
     string _scoreOp;
     int    _maxDistance;
+    string _delimiter;
     // instance of a bed file class.
     BedFile *_bed;
 
-    void Report(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount);
-    void ReportStranded(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount, string strand);
+    void Report(string chrom, int start, int end, 
+                const vector<string> &names, 
+                const vector<string> &scores, 
+                int mergeCount);
+
+    void ReportStranded(string chrom, int start, int end, 
+                        const vector<string> &names, 
+                        const vector<string> &scores, 
+                        int mergeCount, 
+                        string strand);
     void ReportMergedNames(const vector<string> &names);
     void ReportMergedScores(const vector<string> &scores);
     
diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp
index 2f1a164c..28b869af 100644
--- a/src/mergeBed/mergeMain.cpp
+++ b/src/mergeBed/mergeMain.cpp
@@ -41,6 +41,7 @@ int merge_main(int argc, char* argv[]) {
     bool forceStrand     = false;
     bool reportNames     = false;
     bool reportScores    = false;
+    string delimiter     = ",";
 
     for(int i = 1; i < argc; i++) {
         int parameterLength = (int)strlen(argv[i]);
@@ -87,6 +88,12 @@ int merge_main(int argc, char* argv[]) {
                 i++;
             }
         }
+        else if (PARAMETER_CHECK("-delim", 6, parameterLength)) {
+            if ((i+1) < argc) {
+                delimiter      = argv[i + 1];
+                i++;
+            }
+        }
         else {
             cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
             showHelp = true;
@@ -98,15 +105,28 @@ int merge_main(int argc, char* argv[]) {
         cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
         showHelp = true;
     }
-    if ((reportScores == true) && (scoreOp != "sum")  && (scoreOp != "max")    && (scoreOp != "min") && (scoreOp != "mean") &&
-        (scoreOp != "mode") && (scoreOp != "median") && (scoreOp != "antimode") && (scoreOp != "collapse")) 
+    if ((reportScores == true) && (scoreOp != "sum")  
+         && (scoreOp != "max")  && (scoreOp != "min") 
+         && (scoreOp != "mean") && (scoreOp != "mode") 
+         && (scoreOp != "median") && (scoreOp != "antimode") 
+         && (scoreOp != "collapse")) 
     {
-        cerr << endl << "*****" << endl << "*****ERROR: Invalid scoreOp selection \"" << scoreOp << endl << "\"  *****" << endl;
+        cerr << endl 
+             << "*****" 
+             << endl 
+             << "*****ERROR: Invalid scoreOp selection \"" 
+             << scoreOp 
+             << endl 
+             << "\"  *****" 
+             << endl;
         showHelp = true;
     }
 
     if (!showHelp) {
-        BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames, reportScores, scoreOp);
+        BedMerge *bm = new BedMerge(bedFile, numEntries, 
+                                    maxDistance, forceStrand, 
+                                    reportNames, reportScores, 
+                                    scoreOp, delimiter);
         delete bm;
     }
     else {
@@ -137,7 +157,8 @@ void merge_help(void) {
     cerr                                 << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
     cerr                                 << "\t\t- (INTEGER)" << endl << endl;
 
-    cerr << "\t-nms\t"                   << "Report the names of the merged features separated by semicolons." << endl << endl;
+    cerr << "\t-nms\t"                   << "Report the names of the merged features separated by commas." << endl;
+    cerr                                 << "\t\tChange delim. with -delim." << endl << endl;
     
     cerr << "\t-scores\t"                << "Report the scores of the merged features. Specify one of " << endl;
     cerr                                 << "\t\tthe following options for reporting scores:" << endl;
@@ -146,6 +167,11 @@ void merge_help(void) {
     cerr                                 << "\t\t  collapse (i.e., print a semicolon-separated list)," << endl;
     cerr                                 << "\t\t- (INTEGER)" << endl << endl;
     
+    cerr << "\t-delim\t"                 << "Specify a custom delimiter for the -nms and -scores concat options" << endl;
+    cerr                                 << "\t\t- Example: -delim \"|\"" << endl;
+    cerr                                 << "\t\t- Default: \",\"." << endl << endl;
+    
+    
     cerr << "Notes: " << endl;
     cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
     cerr << "\t    will in BED format with zero-based starts" << endl << endl;
diff --git a/src/utils/VectorOps/VectorOps.cpp b/src/utils/VectorOps/VectorOps.cpp
index 53c2c374..97c5d629 100644
--- a/src/utils/VectorOps/VectorOps.cpp
+++ b/src/utils/VectorOps/VectorOps.cpp
@@ -208,12 +208,12 @@ uint32_t VectorOps::GetCountDistinct(void)
     return _vecs.size();
 }
 
-string VectorOps::GetCollapse(void)
+string VectorOps::GetCollapse(string delimiter)
 {
     ostringstream collapse;
     for( size_t i = 0; i < _vecs.size(); i++ ) {
         if (i>0)
-            collapse << ",";
+            collapse << delimiter;
         collapse << _vecs[i];
     }
     return collapse.str();
diff --git a/src/utils/VectorOps/VectorOps.h b/src/utils/VectorOps/VectorOps.h
index 3f4b53c9..5e979fa9 100644
--- a/src/utils/VectorOps/VectorOps.h
+++ b/src/utils/VectorOps/VectorOps.h
@@ -63,8 +63,8 @@ public:
     uint32_t GetCount(void);          
     // return a the count of _unique_ elements in the vector
     uint32_t GetCountDistinct(void);  
-    // return a comma-separated list of elements
-    string GetCollapse(void);         
+    // return a delimiter-separated list of elements
+    string GetCollapse(string delimiter = ",");
     // return a concatenation of all elements in the vector
     string GetConcat(void);           
     // return a comma-separated list of the _unique_ elements
diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh
index b616be0a..ee7f6290 100644
--- a/test/merge/test-merge.sh
+++ b/test/merge/test-merge.sh
@@ -87,7 +87,7 @@ rm obs exp
 echo "    merge.t5...\c"
 echo \
 "chr1	10	20	a1
-chr1	30	100	a2;a3;a4" > exp
+chr1	30	100	a2,a3,a4" > exp
 $BT merge -i a.names.bed -nms > obs
 check obs exp
 rm obs exp
@@ -98,10 +98,10 @@ rm obs exp
 echo "    merge.t6...\c"
 echo \
 "chr1	10	20	a1	1
-chr1	30	100	a2;a3;a4	9
+chr1	30	100	a2,a3,a4	9
 chr2	10	20	a1	5
 chr2	30	40	a2	6
-chr2	42	100	a3;a4	15" > exp
+chr2	42	100	a3,a4	15" > exp
 $BT merge -i a.full.bed -nms -scores sum> obs
 check obs exp
 rm obs exp
@@ -126,10 +126,10 @@ rm obs exp
 echo "    merge.t8...\c"
 echo \
 "chr1	10	20	a1	1	1
-chr1	30	100	a2;a3;a4	9	3
+chr1	30	100	a2,a3,a4	9	3
 chr2	10	20	a1	5	1
 chr2	30	40	a2	6	1
-chr2	42	100	a3;a4	15	2" > exp
+chr2	42	100	a3,a4	15	2" > exp
 $BT merge -i a.full.bed -nms -n -scores sum> obs
 check obs exp
 rm obs exp
@@ -150,3 +150,21 @@ chr2	45	100	a4	8	-	1" > exp
 $BT merge -i a.full.bed -s -nms -n -scores sum> obs
 check obs exp
 rm obs exp
+
+###########################################################
+# Test #10
+#  Test the use of a custom delimiter for -nms
+#  
+# cat a.names.bed
+# chr1	10	20	a1
+# chr1	30	40	a2
+# chr1	40	50	a3
+# chr1	45	100	a4
+###########################################################
+echo "    merge.t10...\c"
+echo \
+"chr1	10	20	a1
+chr1	30	100	a2|a3|a4" > exp
+$BT merge -i a.names.bed -nms -delim "|" > obs
+check obs exp
+rm obs exp
-- 
GitLab