From 65e377d29c35579a13f39f478aa44d9f3c3c8b9e Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Tue, 29 Jan 2013 08:16:03 -0500 Subject: [PATCH] [EHN] add -delim option to bedtools merge --- docs/content/tools/genomecov.rst | 2 +- docs/content/tools/merge.rst | 39 ++++++++++++++++++++++++++----- src/mergeBed/mergeBed.cpp | 14 +++++++---- src/mergeBed/mergeBed.h | 16 ++++++++++--- src/mergeBed/mergeMain.cpp | 36 ++++++++++++++++++++++++---- src/utils/VectorOps/VectorOps.cpp | 4 ++-- src/utils/VectorOps/VectorOps.h | 4 ++-- test/merge/test-merge.sh | 28 ++++++++++++++++++---- 8 files changed, 114 insertions(+), 29 deletions(-) diff --git a/docs/content/tools/genomecov.rst b/docs/content/tools/genomecov.rst index c8e86566..e0c73a6e 100755 --- a/docs/content/tools/genomecov.rst +++ b/docs/content/tools/genomecov.rst @@ -35,7 +35,7 @@ Usage and option summary **(or)**: :: - genomeCovergaBed [OPTIONS] [-i|-ibam] -g (iff. -i) + genomeCoverageBed [OPTIONS] [-i|-ibam] -g (iff. -i) diff --git a/docs/content/tools/merge.rst b/docs/content/tools/merge.rst index 1b9eb767..c8c17a4a 100755 --- a/docs/content/tools/merge.rst +++ b/docs/content/tools/merge.rst @@ -45,12 +45,15 @@ Option Description **-s** Force strandedness. That is, only merge features that are the same strand. *By default, this is disabled*. **-n** Report the number of BED entries that were merged. *1 is reported if no merging occurred*. **-d** Maximum distance between features allowed for features to be merged. *Default is 0. That is, overlapping and/or book-ended features are merged*. -**-nms** Report the names of the merged features separated by semicolons. +**-nms** Report the names of the merged features separated by commas. Change delimiter with ``-delim`` **-scores** | Report the scores of the merged features. 
Specify one of - | the following options for reporting scores: - | sum, min, max, - | mean, median, mode, antimode, - | collapse (i.e., print a semicolon-separated list) + | the following options for reporting scores: + | sum, min, max, + | mean, median, mode, antimode, + | collapse (i.e., print a comma-separated list) +**-delim** | Specify a custom delimiter for the -nms and -scores collapse options + | Example: ``-delim "|"`` + | ``Default: ","`` =========================== =============================================================================================================================================================================================================== @@ -156,7 +159,7 @@ merged features. chr1 250 500 A3 $ bedtools merge -i A.bed -nms - chr1 100 500 A1;A2;A3 + chr1 100 500 A1,A2,A3 ========================================================================== @@ -183,5 +186,29 @@ how the scores from each overlapping interval should be reported. chr1 100 500 1,2,3 +========================================================================== +``-delim`` Change the delimiter for ``-nms`` and ``-scores collapse`` +========================================================================== +One can override the use of a comma as the delimiter for the ``-nms`` and +``-scores collapse`` options via the use of the ``-delim`` option. + +.. code-block:: bash + + $ cat A.bed + chr1 100 200 A1 + chr1 150 300 A2 + chr1 250 500 A3 + +Compare: + +.. code-block:: bash + + $ bedtools merge -i A.bed -nms + chr1 100 500 A1,A2,A3 +to: + +.. 
code-block:: bash + $ bedtools merge -i A.bed -nms -delim "|" + chr1 100 500 A1|A2|A3 diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 0c1bb27d..e556e44b 100644 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -21,7 +21,7 @@ void BedMerge::ReportMergedNames(const vector<string> &names) { vector<string>::const_iterator nameEnd = names.end(); for (; nameItr != nameEnd; ++nameItr) { if (nameItr < (nameEnd - 1)) - cout << *nameItr << ";"; + cout << *nameItr << _delimiter; else cout << *nameItr; } @@ -29,7 +29,9 @@ void BedMerge::ReportMergedNames(const vector<string> &names) { else { cerr << endl << "*****" << endl - << "*****ERROR: No names found to report for the -names option. Exiting." << endl + << "*****ERROR: " + << "No names found to report for the -names option. Exiting." + << endl << "*****" << endl; exit(1); } @@ -58,7 +60,7 @@ void BedMerge::ReportMergedScores(const vector<string> &scores) { else if (_scoreOp == "antimode") buffer << setprecision (PRECISION) << vo.GetAntiMode(); else if (_scoreOp == "collapse") - buffer << setprecision (PRECISION) << vo.GetCollapse(); + buffer << setprecision (PRECISION) << vo.GetCollapse(_delimiter); cout << "\t" << buffer.str(); } else { @@ -79,14 +81,16 @@ BedMerge::BedMerge(string &bedFile, bool forceStrand, bool reportNames, bool reportScores, - const string &scoreOp) : + const string &scoreOp, + const string &delimiter) : _bedFile(bedFile), _numEntries(numEntries), _forceStrand(forceStrand), _reportNames(reportNames), _reportScores(reportScores), _scoreOp(scoreOp), - _maxDistance(maxDistance) + _maxDistance(maxDistance), + _delimiter(delimiter) { _bed = new BedFile(bedFile); diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h index 52eb7cb1..d9b0c143 100644 --- a/src/mergeBed/mergeBed.h +++ b/src/mergeBed/mergeBed.h @@ -34,7 +34,8 @@ public: // constructor BedMerge(string &bedFile, bool numEntries, int maxDistance, bool forceStrand, - bool reportNames, bool 
reportScores, const string &scoreOp); + bool reportNames, bool reportScores, + const string &scoreOp, const string &delimiter); // destructor ~BedMerge(void); @@ -51,11 +52,20 @@ private: bool _reportScores; string _scoreOp; int _maxDistance; + string _delimiter; // instance of a bed file class. BedFile *_bed; - void Report(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount); - void ReportStranded(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount, string strand); + void Report(string chrom, int start, int end, + const vector<string> &names, + const vector<string> &scores, + int mergeCount); + + void ReportStranded(string chrom, int start, int end, + const vector<string> &names, + const vector<string> &scores, + int mergeCount, + string strand); void ReportMergedNames(const vector<string> &names); void ReportMergedScores(const vector<string> &scores); diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp index 2f1a164c..28b869af 100644 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeBed/mergeMain.cpp @@ -41,6 +41,7 @@ int merge_main(int argc, char* argv[]) { bool forceStrand = false; bool reportNames = false; bool reportScores = false; + string delimiter = ","; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); @@ -87,6 +88,12 @@ int merge_main(int argc, char* argv[]) { i++; } } + else if (PARAMETER_CHECK("-delim", 6, parameterLength)) { + if ((i+1) < argc) { + delimiter = argv[i + 1]; + i++; + } + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -98,15 +105,28 @@ int merge_main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. 
" << endl << "*****" << endl; showHelp = true; } - if ((reportScores == true) && (scoreOp != "sum") && (scoreOp != "max") && (scoreOp != "min") && (scoreOp != "mean") && - (scoreOp != "mode") && (scoreOp != "median") && (scoreOp != "antimode") && (scoreOp != "collapse")) + if ((reportScores == true) && (scoreOp != "sum") + && (scoreOp != "max") && (scoreOp != "min") + && (scoreOp != "mean") && (scoreOp != "mode") + && (scoreOp != "median") && (scoreOp != "antimode") + && (scoreOp != "collapse")) { - cerr << endl << "*****" << endl << "*****ERROR: Invalid scoreOp selection \"" << scoreOp << endl << "\" *****" << endl; + cerr << endl + << "*****" + << endl + << "*****ERROR: Invalid scoreOp selection \"" + << scoreOp + << endl + << "\" *****" + << endl; showHelp = true; } if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames, reportScores, scoreOp); + BedMerge *bm = new BedMerge(bedFile, numEntries, + maxDistance, forceStrand, + reportNames, reportScores, + scoreOp, delimiter); delete bm; } else { @@ -137,7 +157,8 @@ void merge_help(void) { cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; cerr << "\t\t- (INTEGER)" << endl << endl; - cerr << "\t-nms\t" << "Report the names of the merged features separated by semicolons." << endl << endl; + cerr << "\t-nms\t" << "Report the names of the merged features separated by commas." << endl; + cerr << "\t\tChange delim. with -delim." << endl << endl; cerr << "\t-scores\t" << "Report the scores of the merged features. 
Specify one of " << endl; cerr << "\t\tthe following options for reporting scores:" << endl; @@ -146,6 +167,11 @@ void merge_help(void) { cerr << "\t\t collapse (i.e., print a semicolon-separated list)," << endl; cerr << "\t\t- (INTEGER)" << endl << endl; + cerr << "\t-delim\t" << "Specify a custom delimiter for the -nms and -scores concat options" << endl; + cerr << "\t\t- Example: -delim \"|\"" << endl; + cerr << "\t\t- Default: \",\"." << endl << endl; + + cerr << "Notes: " << endl; cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; cerr << "\t will in BED format with zero-based starts" << endl << endl; diff --git a/src/utils/VectorOps/VectorOps.cpp b/src/utils/VectorOps/VectorOps.cpp index 53c2c374..97c5d629 100644 --- a/src/utils/VectorOps/VectorOps.cpp +++ b/src/utils/VectorOps/VectorOps.cpp @@ -208,12 +208,12 @@ uint32_t VectorOps::GetCountDistinct(void) return _vecs.size(); } -string VectorOps::GetCollapse(void) +string VectorOps::GetCollapse(string delimiter) { ostringstream collapse; for( size_t i = 0; i < _vecs.size(); i++ ) { if (i>0) - collapse << ","; + collapse << delimiter; collapse << _vecs[i]; } return collapse.str(); diff --git a/src/utils/VectorOps/VectorOps.h b/src/utils/VectorOps/VectorOps.h index 3f4b53c9..5e979fa9 100644 --- a/src/utils/VectorOps/VectorOps.h +++ b/src/utils/VectorOps/VectorOps.h @@ -63,8 +63,8 @@ public: uint32_t GetCount(void); // return a the count of _unique_ elements in the vector uint32_t GetCountDistinct(void); - // return a comma-separated list of elements - string GetCollapse(void); + // return a delimiter-separated list of elements + string GetCollapse(string delimiter = ","); // return a concatenation of all elements in the vector string GetConcat(void); // return a comma-separated list of the _unique_ elements diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh index b616be0a..ee7f6290 100644 --- a/test/merge/test-merge.sh +++ b/test/merge/test-merge.sh @@ -87,7 +87,7 @@ 
rm obs exp echo " merge.t5...\c" echo \ "chr1 10 20 a1 -chr1 30 100 a2;a3;a4" > exp +chr1 30 100 a2,a3,a4" > exp $BT merge -i a.names.bed -nms > obs check obs exp rm obs exp @@ -98,10 +98,10 @@ rm obs exp echo " merge.t6...\c" echo \ "chr1 10 20 a1 1 -chr1 30 100 a2;a3;a4 9 +chr1 30 100 a2,a3,a4 9 chr2 10 20 a1 5 chr2 30 40 a2 6 -chr2 42 100 a3;a4 15" > exp +chr2 42 100 a3,a4 15" > exp $BT merge -i a.full.bed -nms -scores sum> obs check obs exp rm obs exp @@ -126,10 +126,10 @@ rm obs exp echo " merge.t8...\c" echo \ "chr1 10 20 a1 1 1 -chr1 30 100 a2;a3;a4 9 3 +chr1 30 100 a2,a3,a4 9 3 chr2 10 20 a1 5 1 chr2 30 40 a2 6 1 -chr2 42 100 a3;a4 15 2" > exp +chr2 42 100 a3,a4 15 2" > exp $BT merge -i a.full.bed -nms -n -scores sum> obs check obs exp rm obs exp @@ -150,3 +150,21 @@ chr2 45 100 a4 8 - 1" > exp $BT merge -i a.full.bed -s -nms -n -scores sum> obs check obs exp rm obs exp + +########################################################### +# Test #10 +# Test the use of a custom delimiter for -nms +# +# cat a.names.bed +# chr1 10 20 a1 +# chr1 30 40 a2 +# chr1 40 50 a3 +# chr1 45 100 a4 +########################################################### +echo " merge.t10...\c" +echo \ +"chr1 10 20 a1 +chr1 30 100 a2|a3|a4" > exp +$BT merge -i a.names.bed -nms -delim "|" > obs +check obs exp +rm obs exp -- GitLab