diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 448e25d1510ed3f5c70a850a45178a07ee9c1d5a..dda672ecbbb52ae2fd57266746607d8594b1b6e8 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -10,7 +10,7 @@ // Acknowledgments: Much of the code herein is taken from Jim Kent's // BED processing code. I am grateful for his elegant // genome binning algorithm and therefore use it extensively. - + #include "bedFile.h" @@ -26,20 +26,20 @@ static int binOffsets[] = {512+64+8+1, 64+8+1, 8+1, 1, 0}; void Tokenize(const string& str, vector<string>& tokens) { - // Skip delimiters at beginning. - string::size_type lastPos = str.find_first_not_of("\t", 0); - // Find first "non-delimiter". - string::size_type pos = str.find_first_of("\t", lastPos); - - while (string::npos != pos || string::npos != lastPos) - { - // Found a token, add it to the vector. - tokens.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of("\t", pos); - // Find next "non-delimiter" - pos = str.find_first_of("\t", lastPos); - } + // Skip delimiters at beginning. + string::size_type lastPos = str.find_first_not_of("\t", 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of("\t", lastPos); + + while (string::npos != pos || string::npos != lastPos) + { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of("\t", pos); + // Find next "non-delimiter" + pos = str.find_first_of("\t", lastPos); + } } int overlaps(const int aS, const int aE, const int bS, const int bE) { @@ -96,7 +96,10 @@ bool byChromThenStart(BED const & a, BED const & b){ //************************************************ static int getBin(int start, int end) -/* Given start,end in chromosome coordinates assign it +/* + NOTE: Taken ~verbatim from kent source. + + Given start,end in chromosome coordinates assign it * a bin. There's a bin for each 128k segment, for each * 1M segment, for each 8M segment, for each 64M segment, * and for each chromosome (which is assumed to be less than @@ -120,8 +123,12 @@ static int getBin(int start, int end) void BedFile::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const int start, const int end, vector<BED> &hits) -/* Return a list of all items in binKeeper that intersect range. +/* + NOTE: Taken ~verbatim from kent source. + Return a list of all items in binKeeper that intersect range. + * Free this list with slFreeList. */ + { int startBin, endBin; int i,j; @@ -136,7 +143,6 @@ void BedFile::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const in { for (vector<BED>::iterator el = bk[j].begin(); el != bk[j].end(); ++el) { { - //if (leftOf(end, el->start)) {break;} if (overlaps(el->start, el->end, start, end) > 0) { hits.push_back(*el); @@ -167,7 +173,6 @@ void BedFile::countHits(map<int, vector<BED>, std::less<int> > &bk, const int st { for (vector<BED>::iterator el = bk[j].begin(); el != bk[j].end(); ++el) { { - //if (leftOf(end, el->start)) {break;} if (overlaps(el->start, el->end, start, end) > 0) { el->count++; @@ -194,10 +199,10 @@ BedFile::~BedFile(void) { bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const int &lineNum) { - + if ((lineNum == 1) && (lineVector.size() >= 3)) { this->bedType = lineVector.size(); - + if (this->bedType == 3) { bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()); @@ -230,7 +235,7 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in } } else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - + if (this->bedType == 3) { bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()); @@ -261,7 +266,7 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in bed.strand = lineVector[5]; return true; } - + if (bed.start > bed.end) { cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than End. Ignoring it and moving on." << endl; return false; @@ -271,12 +276,16 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in return false; } } + else if (lineVector.size() == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } else if (lineVector.size() != this->bedType) { - cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting" << endl; + cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; exit(1); } else if (lineVector.size() < 3) { - cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required. Exiting" << endl; + cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required. Exiting..." << endl; exit(1); } } @@ -297,7 +306,7 @@ void BedFile::loadBedFileIntoMap() { //while (bed >> bedEntry.chrom >> bedEntry.start >> bedEntry.end) { while (getline(bed, bedLine)) { - + vector<string> bedFields; Tokenize(bedLine,bedFields); @@ -314,7 +323,7 @@ void BedFile::loadBedFileIntoMap() { void BedFile::loadBedFileIntoMapNoBin() { // Are we dealing with a BED file or a BED passed via stdin? - + // Case 1: Proper BED File. if ( (this->bedFile != "") && (this->bedFile != "stdin") ) { @@ -330,7 +339,7 @@ void BedFile::loadBedFileIntoMapNoBin() { int lineNum = 0; while (getline(bed, bedLine)) { - + vector<string> bedFields; Tokenize(bedLine,bedFields); @@ -347,7 +356,7 @@ void BedFile::loadBedFileIntoMapNoBin() { string bedLine; BED bedEntry; int lineNum = 0; - + while (getline(cin, bedLine)) { vector<string> bedFields; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index bfc192f5cbc07fa2a678c1b472043f32587df165..285b4c47594cd2346c197cb6ffe0a61146058ce8 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -17,7 +17,7 @@ using namespace std; //************************************************* struct BED { - + // UCSC BED fields string chrom; int start; @@ -25,7 +25,7 @@ struct BED { string name; unsigned short score; string strand; - + // Additional fields unsigned int count; // count of number of intervals // that overlap this feature @@ -83,29 +83,29 @@ public: // Methods bool parseBedLine (BED &, const vector<string> &, const int &); - void loadBedFileIntoMap(); + void loadBedFileIntoMap(); void loadBedFileIntoMapNoBin(); - + void binKeeperFind(map<int, vector<BED>, - std::less<int> > &, const int, - const int, vector<BED> &); - - void countHits(map<int, vector<BED>, std::less<int> > &, const int, const int); - + std::less<int> > &, const int, + const int, vector<BED> &); + + void countHits(map<int, vector<BED>, std::less<int> > &, const int, const int); + // a vector of the BED entries in the BED file. vector<BED> bedVector; masterBedMap bedMap; masterBedMapNoBin bedMapNoBin; - + map<string, int> minPosMap; map<string, int> maxPosMap; - + // the bedfile with which this instance is associated string bedFile; short bedType; - -private: +private: + // none }; #endif /* BEDFILE_H */