From 8ed770c9eddcad6f237e27a0d304e099d5d9f1f2 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 7 Feb 2011 10:53:06 -0500 Subject: [PATCH] Added -score option to mergeBed and allowed duplicate names for -nms option. --- src/mergeBed/mergeBed.cpp | 197 ++++++++++++++++++++++++++++++------- src/mergeBed/mergeBed.h | 23 +++-- src/mergeBed/mergeMain.cpp | 53 +++++++--- 3 files changed, 214 insertions(+), 59 deletions(-) diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index d0cbb7cb..a4c2ca09 100644 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -14,33 +14,119 @@ -void ReportMergedNames(const map<string, bool> &names) { - unsigned int n = 0; +void BedMerge::ReportMergedNames(const vector<string> &names) { printf("\t"); - map<string, bool>::const_iterator nameItr = names.begin(); - map<string, bool>::const_iterator nameEnd = names.end(); + vector<string>::const_iterator nameItr = names.begin(); + vector<string>::const_iterator nameEnd = names.end(); for (; nameItr != nameEnd; ++nameItr) { - if (n < (names.size() - 1)) { - cout << nameItr->first << ";"; + if (nameItr < (nameEnd - 1)) + cout << *nameItr << ";"; + else + cout << *nameItr; + } +} + +void BedMerge::ReportMergedScores(const vector<string> &scores) { + printf("\t"); + + // convert the scores to floats + vector<float> data; + for (size_t i = 0 ; i < scores.size() ; i++) { + data.push_back(atof(scores[i].c_str())); + } + + if (_scoreOp == "sum") { + printf("%.3f", accumulate(data.begin(), data.end(), 0.0)); + } + else if (_scoreOp == "min") { + printf("%.3f", *min_element( data.begin(), data.end() )); + } + else if (_scoreOp == "max") { + printf("%.3f", *max_element( data.begin(), data.end() )); + } + else if (_scoreOp == "mean") { + double total = accumulate(data.begin(), data.end(), 0.0); + double mean = total / data.size(); + printf("%.3f", mean); + } + else if (_scoreOp == "median") { + double median = 0.0; + sort(data.begin(), data.end()); + int totalLines = data.size(); + if ((totalLines % 2) > 0) { + long mid; + mid = totalLines / 2; + median = data[mid]; } else { - cout << nameItr->first; + long midLow, midHigh; + midLow = (totalLines / 2) - 1; + midHigh = (totalLines / 2); + median = (data[midLow] + data[midHigh]) / 2.0; + } + printf("%.3f", median); + } + else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) { + // compute the frequency of each unique value + map<string, int> freqs; + vector<string>::const_iterator dIt = scores.begin(); + vector<string>::const_iterator dEnd = scores.end(); + for (; dIt != dEnd; ++dIt) { + freqs[*dIt]++; + } + + // grab the mode and the anti mode + string mode, antiMode; + int count = 0; + int minCount = INT_MAX; + for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { + if (iter->second > count) { + mode = iter->first; + count = iter->second; + } + if (iter->second < minCount) { + antiMode = iter->first; + minCount = iter->second; + } + } + // report + if (_scoreOp == "mode") { + printf("%s", mode.c_str()); + } + else if (_scoreOp == "antimode") { + printf("%s", antiMode.c_str()); + } + } + else if (_scoreOp == "collapse") { + vector<string>::const_iterator scoreItr = scores.begin(); + vector<string>::const_iterator scoreEnd = scores.end(); + for (; scoreItr != scoreEnd; ++scoreItr) { + if (scoreItr < (scoreEnd - 1)) + cout << *scoreItr << ";"; + else + cout << *scoreItr; } - n++; } } // =============== // = Constructor = // =============== -BedMerge::BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames) { - - _bedFile = bedFile; - _numEntries = numEntries; - _maxDistance = maxDistance; - _forceStrand = forceStrand; - _reportNames = reportNames; - +BedMerge::BedMerge(string &bedFile, + bool numEntries, + int maxDistance, + bool forceStrand, + bool reportNames, + bool reportScores, + const string &scoreOp) : + _bedFile(bedFile), + _numEntries(numEntries), + _forceStrand(forceStrand), + _reportNames(reportNames), + _reportScores(reportScores), + _scoreOp(scoreOp), + _maxDistance(maxDistance) +{ _bed = new BedFile(bedFile); if (_forceStrand == false) @@ -60,40 +146,73 @@ BedMerge::~BedMerge(void) { // =============================================== // Convenience method for reporting merged blocks // ================================================ -void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) { +void BedMerge::Report(string chrom, int start, int end, + const vector<string> &names, const vector<string> &scores, int mergeCount) +{ if (_bed->isZeroBased == false) {start++;} printf("%s\t%d\t%d", chrom.c_str(), start, end); - if (_numEntries == false && _reportNames == false) { + // just the merged intervals + if (_numEntries == false && _reportNames == false && _reportScores == false) { printf("\n"); } - else if (_numEntries) { + // merged intervals and counts + else if (_numEntries == true && _reportNames == false && _reportScores == false) { printf("\t%d\n", mergeCount); } - else if (_reportNames) { + // merged intervals and names + else if (_numEntries == false && _reportNames == true && _reportScores == false) { ReportMergedNames(names); printf("\n"); } + // merged intervals and scores + else if (_numEntries == false && _reportNames == false && _reportScores == true) { + ReportMergedScores(scores); + printf("\n"); + } + // merged intervals, names, and scores + else if (_numEntries == false && _reportNames == true && _reportScores == true) { + ReportMergedNames(names); + ReportMergedScores(scores); + printf("\n"); + } } // ========================================================= // Convenience method for reporting merged blocks by strand // ========================================================= -void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) { +void BedMerge::ReportStranded(string chrom, int start, int end, + const vector<string> &names, const vector<string> &scores, + int mergeCount, string strand) +{ if (_bed->isZeroBased == false) {start++;} printf("%s\t%d\t%d", chrom.c_str(), start, end); - if (_numEntries == false && _reportNames == false) { + // just the merged intervals + if (_numEntries == false && _reportNames == false && _reportScores == false) { printf("\t%s\n", strand.c_str()); } - else if (_numEntries) { + // merged intervals and counts + else if (_numEntries == true && _reportNames == false && _reportScores == false) { printf("\t%d\t%s\n", mergeCount, strand.c_str()); } - else if (_reportNames) { + // merged intervals and names + else if (_numEntries == false && _reportNames == true && _reportScores == false) { ReportMergedNames(names); printf("\t%s\n", strand.c_str()); } + // merged intervals and scores + else if (_numEntries == false && _reportNames == false && _reportScores == true) { + ReportMergedScores(scores); + printf("\t%s\n", strand.c_str()); + } + // merged intervals, names, and scores + else if (_numEntries == false && _reportNames == true && _reportScores == true) { + ReportMergedNames(names); + ReportMergedScores(scores); + printf("\t%s\n", strand.c_str()); + } } @@ -115,8 +234,9 @@ void BedMerge::MergeBed() { string chrom = m->first; vector<BED> bedList = m->second; int mergeCount = 1; - map<string, bool> names; - + vector<string> names; + vector<string> scores; + // merge overlapping features for this chromosome. int start = -1; int end = -1; @@ -126,24 +246,27 @@ void BedMerge::MergeBed() { // new block, no overlap if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { if (start >= 0) { - Report(chrom, start, end, names, mergeCount); + Report(chrom, start, end, names, scores, mergeCount); // reset mergeCount = 1; names.clear(); + scores.clear(); } start = bedItr->start; end = bedItr->end; - names[bedItr->name] = true; + names.push_back(bedItr->name); + scores.push_back(bedItr->score); } // same block, overlaps else { if ((int) bedItr-> end > end) end = bedItr->end; mergeCount++; - names[bedItr->name] = true; + names.push_back(bedItr->name); + scores.push_back(bedItr->score); } } if (start >= 0) { - Report(chrom, start, end, names, mergeCount); + Report(chrom, start, end, names, scores, mergeCount); } } } @@ -177,7 +300,8 @@ void BedMerge::MergeBedStranded() { int mergeCount = 1; int numOnStrand = 0; - map<string, bool> names; + vector<string> names; + vector<string> scores; // merge overlapping features for this chromosome. int start = -1; @@ -193,23 +317,26 @@ void BedMerge::MergeBedStranded() { if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { if (start >= 0) { - ReportStranded(chrom, start, end, names, mergeCount, strands[s]); + ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); // reset mergeCount = 1; names.clear(); + scores.clear(); } start = bedItr->start; end = bedItr->end; - names[bedItr->name] = true; + names.push_back(bedItr->name); + scores.push_back(bedItr->score); } else { if ((int) bedItr-> end > end) end = bedItr->end; mergeCount++; - names[bedItr->name] = true; + names.push_back(bedItr->name); + scores.push_back(bedItr->score); } } if (start >= 0) { - ReportStranded(chrom, start, end, names, mergeCount, strands[s]); + ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); } } } diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h index d0cb59f9..0d1ed46a 100644 --- a/src/mergeBed/mergeBed.h +++ b/src/mergeBed/mergeBed.h @@ -12,6 +12,7 @@ #include "bedFile.h" #include <vector> #include <algorithm> +#include <numeric> #include <iostream> #include <fstream> #include <limits.h> @@ -19,7 +20,6 @@ using namespace std; -void ReportMergedNames(const map<string, bool> &names); //************************************************ // Class methods and elements @@ -29,7 +29,9 @@ class BedMerge { public: // constructor - BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand, bool &reportNames); + BedMerge(string &bedFile, bool numEntries, + int maxDistance, bool forceStrand, + bool reportNames, bool reportScores, const string &scoreOp); // destructor ~BedMerge(void); @@ -40,13 +42,18 @@ public: private: string _bedFile; - bool _numEntries; - bool _forceStrand; - bool _reportNames; - int _maxDistance; + bool _numEntries; + bool _forceStrand; + bool _reportNames; + bool _reportScores; + string _scoreOp; + int _maxDistance; // instance of a bed file class. BedFile *_bed; - void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount); - void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand); + void Report(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount); + void ReportStranded(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount, string strand); + void ReportMergedNames(const vector<string> &names); + void ReportMergedScores(const vector<string> &scores); + }; diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp index c756e38a..fc435843 100644 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeBed/mergeMain.cpp @@ -32,6 +32,7 @@ int main(int argc, char* argv[]) { // input files string bedFile = "stdin"; int maxDistance = 0; + string scoreOp = ""; // input arguments bool haveBed = true; @@ -39,6 +40,7 @@ int main(int argc, char* argv[]) { bool haveMaxDistance = false; bool forceStrand = false; bool reportNames = false; + bool reportScores = false; for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); @@ -78,6 +80,13 @@ int main(int argc, char* argv[]) { else if (PARAMETER_CHECK("-nms", 4, parameterLength)) { reportNames = true; } + else if (PARAMETER_CHECK("-scores", 7, parameterLength)) { + reportScores = true; + if ((i+1) < argc) { + scoreOp = argv[i + 1]; + i++; + } + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -93,9 +102,15 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Request either -n OR -nms, not both." << endl << "*****" << endl; showHelp = true; } + if ((reportScores == true) && (scoreOp != "sum") && (scoreOp != "max") && (scoreOp != "min") && (scoreOp != "mean") && + (scoreOp != "mode") && (scoreOp != "median") && (scoreOp != "antimode") && (scoreOp != "collapse")) + { + cerr << endl << "*****" << endl << "*****ERROR: Invalid scoreOp selection \"" << scoreOp << endl << "\" *****" << endl; + showHelp = true; + } if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames); + BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand, reportNames, reportScores, scoreOp); delete bm; return 0; } @@ -115,21 +130,27 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; cerr << "Options: " << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; - cerr << "\t\tthat are the same strand." << endl; - cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; - - cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; - cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; - - - cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; - cerr << "\t\tto be merged." << endl; - cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-nms\t" << "Report the names of the merged features separated by semicolons." << endl << endl; - + cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; + cerr << "\t\tthat are the same strand." << endl; + cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; + + cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; + cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; + + + cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; + cerr << "\t\tto be merged." << endl; + cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-nms\t" << "Report the names of the merged features separated by semicolons." << endl << endl; + + cerr << "\t-scores [STRING]\t" << "Report the scores of the merged features. Specify one of " << endl; + cerr << "\t\tthe following options for reporting scores:" << endl; + cerr << "\t\t\t sum, min, max," << endl; + cerr << "\t\t\t mean, median, mode, antimode," << endl; + cerr << "\t\t\t collapse (i.e., print a semicolon-separated list)," << endl << endl; + // end the program here exit(1); -- GitLab