diff --git a/._Makefile.uZs1Vt b/._Makefile.uZs1Vt deleted file mode 100755 index 24eac48d99a877e496a2e799c98d61e4cd399501..0000000000000000000000000000000000000000 Binary files a/._Makefile.uZs1Vt and /dev/null differ diff --git a/._README.89gPvU b/._README.89gPvU deleted file mode 100755 index 92241a843c2ab0477beec047980afd7fc1a0d2ca..0000000000000000000000000000000000000000 Binary files a/._README.89gPvU and /dev/null differ diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index 178ff94ff2af7f1867e05ca0256fd42f5ee05b7f..77726692f4daea2c7b306847031c7f9cb1d0b829 100755 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -53,7 +53,7 @@ void BedCoverage::CollectCoverageBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); + _bedB->loadBedCovFileIntoMap(); int lineNum = 0; // current input line number BED a, nullBed; @@ -61,14 +61,12 @@ void BedCoverage::CollectCoverageBed() { _bedA->Open(); // process each entry in A - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { // count a as a hit with all the relevant features in B _bedB->countHits(a, _forceStrand); a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); @@ -81,7 +79,7 @@ void BedCoverage::CollectCoverageBam(string bamFile) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - _bedB->loadBedFileIntoMap(); + _bedB->loadBedCovFileIntoMap(); // open the BAM file BamReader reader; @@ -103,7 +101,7 @@ void BedCoverage::CollectCoverageBam(string bamFile) { a.chrom = refs.at(bam.RefID).RefName; a.start = bam.Position; a.end = bam.GetEndPosition(false); - a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + a.strand = '+'; if (bam.IsReverseStrand()) a.strand = '-'; _bedB->countHits(a, _forceStrand); } } @@ -119,16 +117,16 @@ void BedCoverage::ReportCoverage() { map<unsigned int, unsigned int> allDepthHist; unsigned int totalLength = 0; - masterBedMap::const_iterator chromItr = _bedB->bedMap.begin(); - masterBedMap::const_iterator chromEnd = _bedB->bedMap.end(); + masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin(); + masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end(); for (; chromItr != chromEnd; ++chromItr) { - binsToBeds::const_iterator binItr = chromItr->second.begin(); - binsToBeds::const_iterator binEnd = chromItr->second.end(); + binsToBedCovs::const_iterator binItr = chromItr->second.begin(); + binsToBedCovs::const_iterator binEnd = chromItr->second.end(); for (; binItr != binEnd; ++binItr) { - vector<BED>::const_iterator bedItr = binItr->second.begin(); - vector<BED>::const_iterator bedEnd = binItr->second.end(); + vector<BEDCOV>::const_iterator bedItr = binItr->second.begin(); + vector<BEDCOV>::const_iterator bedEnd = binItr->second.end(); for (; bedItr != bedEnd; ++bedItr) { int zeroDepthCount = 0; @@ -140,7 +138,7 @@ void BedCoverage::ReportCoverage() { map<unsigned int, unsigned int> depthHist; map<unsigned int, DEPTH>::const_iterator depthItr; - for (int pos = start+1; pos <= bedItr->end; pos++) { + for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) { depthItr = bedItr->depthMap.find(pos); @@ -163,7 +161,7 @@ void BedCoverage::ReportCoverage() { } // Report the coverage for the current interval. - int length = bedItr->end - bedItr->start; + CHRPOS length = bedItr->end - bedItr->start; totalLength += length; int nonZeroBases = (length - zeroDepthCount); diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 1e51cb3fb5996ab41da84336809f9317fd6c7d8e..7c803089583daeb28680176a07641546faa5556e 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -70,7 +70,7 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { // grab _all_ of the features in B that overlap with a. _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); - + // how many overlaps are there b/w a and B? int numOverlaps = 0; @@ -80,7 +80,7 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { printable = false; // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator h = hits.begin(); vector<BED>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start, h->start); @@ -119,7 +119,7 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const int &s, const int &e) { + const CHRPOS &s, const CHRPOS &e) { // simple intersection only if (_writeA == false && _writeB == false && _writeOverlap == false) { _bedA->reportBedRangeNewLine(a,s,e); @@ -198,17 +198,14 @@ void BedIntersect::IntersectBed() { // open the "A" file, process each BED entry and searh for overlaps. _bedA->Open(); - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindOverlaps(a, hits); hits.clear(); a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); - } @@ -255,7 +252,7 @@ void BedIntersect::IntersectBam(string bamFile) { if (bam.IsSecondMate()) a.name += "/2"; a.score = ToString(bam.MapQuality); - a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + a.strand = '+'; if (bam.IsReverseStrand()) a.strand = '-'; if (_bamOutput == true) { overlapsFound = FindOneOrMoreOverlap(a); diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 9b4a05e41af32e27fc9595578c57d6fa11e04d9e..6d2779190f62b267ebe89d8a6f0a0b21d9343f98 100755 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -80,7 +80,7 @@ private: bool FindOneOrMoreOverlap(const BED &a); void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, - const int &s, const int &e); + const CHRPOS &s, const CHRPOS &e); void ReportOverlapSummary(const BED &a, const int &numOverlapsFound); }; diff --git a/src/utils/bedFile/Makefile b/src/utils/bedFile/Makefile index 31bee672313953d2f656ccf18817072178d5e6f9..0ab6683fd4c172d7a3d84a711aded81faaa925ef 100755 --- a/src/utils/bedFile/Makefile +++ b/src/utils/bedFile/Makefile @@ -1,5 +1,5 @@ CXX = g++ -c -CXXFLAGS = -O2 -Wall +CXXFLAGS = -O2 -Wall LDFLAGS = OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 38e2ce9def4a3321c9fe93858af219792b87687a..c99dc78f08c27974a4aa00864b1e3fb90f49ff0e 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -28,8 +28,8 @@ bool sortByStart(const BED &a, const BED &b) { bool sortBySizeAsc(const BED &a, const BED &b) { - unsigned int aLen = a.end - a.start; - unsigned int bLen = b.end - b.start; + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; if (aLen < bLen) return true; else return false; @@ -37,8 +37,8 @@ bool sortBySizeAsc(const BED &a, const BED &b) { bool sortBySizeDesc(const BED &a, const BED &b) { - unsigned int aLen = a.end - a.start; - unsigned int bLen = b.end - b.start; + CHRPOS aLen = a.end - a.start; + CHRPOS bLen = b.end - b.start; if (aLen > bLen) return true; else return false; @@ -69,32 +69,25 @@ bool byChromThenStart(BED const & a, BED const & b) { /* - NOTE: Taken ~verbatim from kent source. - - Given start,end in chromosome coordinates assign it - * a bin. There's a bin for each 128k segment, for each - * 1M segment, for each 8M segment, for each 64M segment, - * and for each chromosome (which is assumed to be less than - * 512M.) A range goes into the smallest bin it will fit in. + NOTE: Tweaked from kent source. */ -int getBin(int start, int end) { - int startBin = start; - int endBin = end-1; - startBin >>= _binFirstShift; - endBin >>= _binFirstShift; - - for (int i = 0; i < 6; ++i) { - if (startBin == endBin) { - return binOffsetsExtended[i] + startBin; - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } - cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; - return 0; +int getBin(CHRPOS start, CHRPOS end) { + --end; + start >>= _binFirstShift; + end >>= _binFirstShift; + + for (int i = 0; i < _binLevels; ++i) { + if (start == end) return _binOffsetsExtended[i] + start; + start >>= _binNextShift; + end >>= _binNextShift; + } + cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; + return 0; } + + /******************************************* Class methods *******************************************/ @@ -160,7 +153,7 @@ void BedFile::Close(void) { } -BedLineStatus BedFile::GetNextBed (BED &bed, int &lineNum) { +BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) { // make sure there are still lines to process. // if so, tokenize, validate and return the BED entry. @@ -185,35 +178,35 @@ BedLineStatus BedFile::GetNextBed (BED &bed, int &lineNum) { } -void BedFile::FindOverlapsPerBin(string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { +void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand) { int startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; + int offset = _binOffsetsExtended[i]; for (int j = (startBin+offset); j <= (endBin+offset); ++j) { - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); - vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); - - for (; bedItr != bedEnd; ++bedItr) { - // do we have sufficient overlap? - if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { - // skip the hit if not on the same strand (and we care) - if (forceStrand == false) hits.push_back(*bedItr); - else if ( (forceStrand == true) && (strand == bedItr->strand)) { - hits.push_back(*bedItr); - } - } - } + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); + vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); + + for (; bedItr != bedEnd; ++bedItr) { + // do we have sufficient overlap? + if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { + // skip the hit if not on the same strand (and we care) + if (forceStrand == false) hits.push_back(*bedItr); + else if ( (forceStrand == true) && (strand == bedItr->strand)) { + hits.push_back(*bedItr); + } + } + } } startBin >>= _binNextShift; endBin >>= _binNextShift; @@ -221,7 +214,7 @@ void BedFile::FindOverlapsPerBin(string chrom, int start, int end, string strand } -bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, int start, int end, string strand, +bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, bool forceStrand, float overlapFraction) { int startBin, endBin; @@ -231,10 +224,10 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, int start, int end, stri int aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; + int offset = _binOffsetsExtended[i]; for (int j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps @@ -265,7 +258,7 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, int start, int end, stri } -bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, int start, int end, string strand, +bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, bool forceStrand, float overlapFraction) { int startBin, endBin; @@ -275,10 +268,10 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, int start, int int aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; + int offset = _binOffsetsExtended[i]; for (int j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps @@ -314,43 +307,43 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, int start, int void BedFile::countHits(const BED &a, bool forceStrand) { - int startBin, endBin; - startBin = (a.start >> _binFirstShift); - endBin = ((a.end-1) >> _binFirstShift); - - // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { - - // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<BED>::iterator bedItr = bedMap[a.chrom][j].begin(); - vector<BED>::iterator bedEnd = bedMap[a.chrom][j].end(); - for (; bedItr != bedEnd; ++bedItr) { - - // skip the hit if not on the same strand (and we care) - if (forceStrand && (a.strand != bedItr->strand)) { - continue; - } - else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { - - bedItr->count++; - bedItr->depthMap[a.start+1].starts++; - bedItr->depthMap[a.end].ends++; - - if (a.start < bedItr->minOverlapStart) { - bedItr->minOverlapStart = a.start; - } - } - } - } - startBin >>= _binNextShift; - endBin >>= _binNextShift; - } + int startBin, endBin; + startBin = (a.start >> _binFirstShift); + endBin = ((a.end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (int i = 0; i < _binLevels; ++i) { + + // loop through each bin at this level of the hierarchy + int offset = _binOffsetsExtended[i]; + for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin(); + vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end(); + for (; bedItr != bedEnd; ++bedItr) { + + // skip the hit if not on the same strand (and we care) + if (forceStrand && (a.strand != bedItr->strand)) { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) { + + bedItr->count++; + bedItr->depthMap[a.start+1].starts++; + bedItr->depthMap[a.end].ends++; + + if (a.start < bedItr->minOverlapStart) { + bedItr->minOverlapStart = a.start; + } + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } } @@ -360,323 +353,49 @@ void BedFile::setGff (bool gff) { } -BedLineStatus BedFile::parseLine (BED &bed, const vector<string> &lineVector, int &lineNum) { - - char *p2End, *p3End, *p4End, *p5End; - long l2, l3, l4, l5; - - // bail out if we have a blank line - if (lineVector.size() == 0) - return BED_BLANK; - - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { - - // we need at least 3 columns - if (lineVector.size() >= 3) { - - // test if columns 2 and 3 are integers. If so, assume BED. - l2 = strtol(lineVector[1].c_str(), &p2End, 10); - l3 = strtol(lineVector[2].c_str(), &p3End, 10); - - // strtol will set p2End or p3End to the start of the string if non-integral, base 10 - if (p2End != lineVector[1].c_str() && p3End != lineVector[2].c_str()) { - setGff(false); - if (parseBedLine(bed, lineVector, lineNum) == true) - return BED_VALID; - } - else if (lineVector.size() == 9) { - // otherwise test if columns 4 and 5 are integers. If so, assume GFF. - l4 = strtol(lineVector[3].c_str(), &p4End, 10); - l5 = strtol(lineVector[4].c_str(), &p5End, 10); - - // strtol will set p4End or p5End to the start of the string if non-integral, base 10 - if (p4End != lineVector[3].c_str() && p5End != lineVector[4].c_str()) { - setGff(true); - if (parseGffLine(bed, lineVector, lineNum) == true) - return BED_VALID; - } - } - else { - cerr << "Unexpected file format. Please use tab-delimited BED or GFF" << endl; - exit(1); - } - } - // gripe if it's not a blank line - else if (lineVector.size() != 0) { - cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; - exit(1); - } - } - else { - lineNum--; - return BED_HEADER; - } - - // default - return BED_INVALID; -} - - - -bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, int lineNum) { - - if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - - if (this->bedType == 3) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = ""; - bed.score = ""; - bed.strand = ""; - } - else if (this->bedType == 4) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = ""; - bed.strand = ""; - } - else if (this->bedType ==5) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = ""; - } - else if (this->bedType == 6) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - } - else if (this->bedType > 6) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - - for (unsigned int i = 6; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - } - else { - cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start > bed.end) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; - exit(1); - } - else return true; - } - else if ((lineNum == 1) && (lineVector.size() >= 3)) { - this->bedType = lineVector.size(); - if (this->bedType == 3) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = ""; - bed.score = ""; - bed.strand = ""; - } - else if (this->bedType == 4) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = ""; - bed.strand = ""; - } - else if (this->bedType ==5) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = ""; - } - else if (this->bedType == 6) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - } - else if (this->bedType > 6) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - - for (unsigned int i = 6; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - } - else { - cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start > bed.end) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; - exit(1); - } - else return true; - } - else if (lineVector.size() == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { - cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() < 3) && (lineVector.size() != 0)) { - cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; -} - +void BedFile::loadBedFileIntoMap() { + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; -bool BedFile::parseGffLine (BED &bed, const vector<string> &lineVector, int lineNum) { - -/* -1. seqname - The name of the sequence. Must be a chromosome or scaffold. -2. source - The program that generated this feature. -3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", - "start_codon", "stop_codon", and "exon". -4. start - The starting position of the feature in the sequence. The first base is numbered 1. -5. end - The ending position of the feature (inclusive). -6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 - for this annotation data set, the score value will determine the level of gray - in which this feature is displayed (higher numbers = darker gray). - If there is no score value, enter ".". -7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care). -8. frame - If the feature is a coding exon, frame should be a number between 0-2 that - represents the reading frame of the first base. If the feature is not a coding exon, - the value should be '.'. -9. group - All lines with the same group are linked together into a single item. -*/ - if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { - - if (this->bedType == 9 && _isGff) { - bed.chrom = lineVector[0]; - // substract 1 to force the start to be BED-style - bed.start = atoi(lineVector[3].c_str()) - 1; - bed.end = atoi(lineVector[4].c_str()); - bed.name = lineVector[2]; - bed.score = lineVector[5]; - bed.strand = lineVector[6]; - bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED - bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED - bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED - } - else { - cerr << "Error: unexpected number of fields at line: " << lineNum << - ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start > bed.end) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; - exit(1); - } - else return true; - } - else if ((lineNum == 1) && (lineVector.size() == 9)) { - this->bedType = lineVector.size(); - - if (this->bedType == 9 && _isGff) { - bed.chrom = lineVector[0]; - // substract 1 to force the start to be BED-style - bed.start = atoi(lineVector[3].c_str()) - 1; - bed.end = atoi(lineVector[4].c_str()); - bed.name = lineVector[2]; - bed.score = lineVector[5]; - bed.strand = lineVector[6]; - - bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED - bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED - bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED - - return true; - } - else { - cout << this->bedType << " " << _isGff << endl; - cerr << "Error: unexpected number of fields at line: " << lineNum << - ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; - exit(1); - } - - if (bed.start > bed.end) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; - exit(1); + Open(); + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + int bin = getBin(bedEntry.start, bedEntry.end); + bedMap[bedEntry.chrom][bin].push_back(bedEntry); + bedEntry = nullBed; } - else return true; - } - else if (lineVector.size() == 1) { - cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); } - else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { - cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((lineVector.size() < 9) && (lineVector.size() != 0)) { - cerr << "TAB delimited GFF file with 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; + Close(); } -void BedFile::loadBedFileIntoMap() { +void BedFile::loadBedCovFileIntoMap() { BED bedEntry, nullBed; int lineNum = 0; BedLineStatus bedStatus; Open(); - bedStatus = this->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { - // ignore headers and blank lines + while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { int bin = getBin(bedEntry.start, bedEntry.end); - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; - bedMap[bedEntry.chrom][bin].push_back(bedEntry); + + BEDCOV bedCov; + bedCov.chrom = bedEntry.chrom; + bedCov.start = bedEntry.start; + bedCov.end = bedEntry.end; + bedCov.name = bedEntry.name; + bedCov.score = bedEntry.score; + bedCov.strand = bedEntry.strand; + bedCov.otherFields = bedEntry.otherFields; + bedCov.count = 0; + bedCov.minOverlapStart = INT_MAX; + + bedCovMap[bedEntry.chrom][bin].push_back(bedCov); bedEntry = nullBed; } - bedStatus = this->GetNextBed(bedEntry, lineNum); } Close(); } @@ -689,17 +408,11 @@ void BedFile::loadBedFileIntoMapNoBin() { BedLineStatus bedStatus; Open(); - bedStatus = this->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { - // ignore headers and blank lines + while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - bedEntry.count = 0; - bedEntry.minOverlapStart = INT_MAX; bedMapNoBin[bedEntry.chrom].push_back(bedEntry); - bedEntry = nullBed; } - bedStatus = this->GetNextBed(bedEntry, lineNum); } Close(); @@ -709,254 +422,3 @@ void BedFile::loadBedFileIntoMapNoBin() { sort(m->second.begin(), m->second.end(), sortByStart); } } - - -/* - reportBedTab - - Writes the _original_ BED entry with a TAB - at the end of the line. - Works for BED3 - BED6. -*/ -void BedFile::reportBedTab(const BED &bed) { - - if (_isGff == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), bed.start, bed.end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - - } -} - - - -/* - reportBedNewLine - - Writes the _original_ BED entry with a NEWLINE - at the end of the line. - Works for BED3 - BED6. -*/ -void BedFile::reportBedNewLine(const BED &bed) { - - if (_isGff == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), bed.start, bed.end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - printf("\n"); - } - } - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - - } -} - - - -/* - reportBedRangeNewLine - - Writes a custom start->end for a BED entry - with a NEWLINE at the end of the line. - - Works for BED3 - BED6. -*/ -void BedFile::reportBedRangeTab(const BED &bed, int start, int end) { - - if (_isGff == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - - } -} - - - -/* - reportBedRangeTab - - Writes a custom start->end for a BED entry - with a TAB at the end of the line. - - Works for BED3 - BED6. -*/ -void BedFile::reportBedRangeNewLine(const BED &bed, int start, int end) { - - if (_isGff == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - printf("\n"); - } - } - else if (this->bedType == 9) { // add 1 to the start for GFF - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - - } -} - - -/* - reportNullBedTab -*/ -void BedFile::reportNullBedTab() { - - if (_isGff == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\t"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\t"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\t"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf(".\t"); - } - } - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); - } -} - - -/* - reportNullBedTab -*/ -void BedFile::reportNullBedNewLine() { - - if (_isGff == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\n"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\n"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\n"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\n"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t.\n"); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf(".\t"); - } - printf("\n"); - } - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); - } -} - diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 1efa4449fdd6a5b23f09b402f0657be82f5dca60..36e048c13701e7b7343c779389fc80ea3274cb41 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -22,18 +22,36 @@ #include <cstring> #include <algorithm> #include <limits.h> +#include <stdint.h> #include <cstdio> - +//#include <tr1/unordered_map> // Experimental. using namespace std; + +//************************************************* +// Data type tydedef +//************************************************* +typedef uint32_t CHRPOS; + + //************************************************* // Genome binning constants //************************************************* -const int binOffsetsExtended[] = - {4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; +const int _numBins = 37450; +const int _binLevels = 7; + +// bins range in size from 16kb to 512Mb +// Bin 0 spans 512Mbp, # Level 1 +// Bins 1-8 span 64Mbp, # Level 2 +// Bins 9-72 span 8Mbp, # Level 3 +// Bins 73-584 span 1Mbp # Level 4 +// Bins 585-4680 span 128Kbp # Level 5 +// Bins 4681-37449 span 16Kbp # Level 6 +const int _binOffsetsExtended[] = + {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -const int _binFirstShift = 17; /* How much to shift to get to finest bin. */ +const int _binFirstShift = 14; /* How much to shift to get to finest bin. */ const int _binNextShift = 3; /* How much to shift to get to next larger bin. */ @@ -46,64 +64,73 @@ struct DEPTH { unsigned int ends; }; + + /* - Structure for regular BED3-BED6 records + Structure for regular BED records */ struct BED { - // UCSC BED fields + // Regular BED fields + CHRPOS start; + CHRPOS end; + string chrom; - int start; - int end; string name; string score; - string strand; + char strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; +}; + + +/* + Structure for regular BED COVERAGE records +*/ +struct BEDCOV { + + // Regular BED fields + CHRPOS start; + CHRPOS end; + string chrom; + string name; + string score; + char strand; + + // Add'l fields for BED12 and/or custom BED annotations vector<string> otherFields; - // Additional fields - unsigned int count; // count of number of intervals - // that overlap this feature - - map<unsigned int, DEPTH> depthMap; - int minOverlapStart; + // Additional fields specific to computing coverage + map<unsigned int, DEPTH> depthMap; + unsigned int count; + CHRPOS minOverlapStart; }; // enum to flag the state of a given line in a BED file. enum BedLineStatus -{ BED_INVALID = -1, - BED_HEADER = 0, - BED_BLANK = 1, - BED_VALID = 2 +{ + BED_INVALID = -1, + BED_HEADER = 0, + BED_BLANK = 1, + BED_VALID = 2 }; // return the genome "bin" for a feature with this start and end -int getBin(int start, int end); +inline +int getBin(CHRPOS start, CHRPOS end); + - // return the amount of overlap between two features. Negative if none and the the // number of negative bases is the distance between the two. inline -int overlaps(int aS, int aE, int bS, int bE) { +int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { return min(aE, bE) - max(aS, bS); } -// return the lesser of two values. -inline -int min(int a, int b) { - if (a <= b) return a; - else return b; -} - -// return the greater of two values. -inline -int max(int a, int b) { - if (a >= b) return a; - else return b; -} - // templated function to convert objects to strings template <typename T> @@ -115,6 +142,10 @@ std::string ToString(const T & value) } +// Ancillary functions + + + // BED Sorting Methods bool sortByChrom(const BED &a, const BED &b); bool sortByStart(const BED &a, const BED &b); @@ -127,19 +158,28 @@ bool byChromThenStart(BED const &a, BED const &b); //************************************************* -// Common typedefs +// Data structure typedefs //************************************************* -typedef vector<BED> bedVector; +typedef vector<BED> bedVector; +typedef vector<BEDCOV> bedCovVector; -typedef map<int, vector<BED>, std::less<int> > binsToBeds; -//typedef tr1::unordered_map<int, vector<BED> > binsToBeds; +typedef map<int, bedVector, std::less<int> > binsToBeds; +typedef map<int, bedCovVector, std::less<int> > binsToBedCovs; -typedef map<string, binsToBeds, std::less<string> > masterBedMap; -//typedef tr1::unordered_map<string, binsToBeds> masterBedMap; +typedef map<string, binsToBeds, std::less<string> > masterBedMap; +typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; +typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; + +// EXPERIMENTAL +//typedef tr1::unordered_map<int, vector<BED> > binsToBeds; +//typedef tr1::unordered_map<string, binsToBeds> masterBedMap; //typedef map<string, bedVector, std::less<string> > masterBedMap; +//typedef vector<BED> masterBedMapNew[_numBins]; +//typedef tr1::unordered_map<int, vector<BED> > binsToBedsNew; +//typedef tr1::unordered_map<string, binsToBeds> masterBedMapNew; + -typedef map<string, vector<BED>, std::less<string> > masterBedMapNoBin; //************************************************ @@ -167,6 +207,9 @@ public: // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs void loadBedFileIntoMap(); + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs + void loadBedCovFileIntoMap(); + // load a BED file into a map keyed by chrom. value is vector of BEDs void loadBedFileIntoMapNoBin(); @@ -174,35 +217,29 @@ public: // search for all overlapping features in another BED file. // Searches through each relevant genome bin on the same chromosome // as the single feature. Note: Adapted from kent source "binKeeperFind" - void FindOverlapsPerBin(string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand); + void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand); // return true if at least one overlap was found. otherwise, return false. - bool FindOneOrMoreOverlapsPerBin(string chrom, int start, int end, string strand, + bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, bool forceStrand, float overlapFraction = 0.0); // return true if at least one __reciprocal__ overlap was found. otherwise, return false. - bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, int start, int end, string strand, + bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, bool forceStrand, float overlapFraction = 0.0); - // Given a chrom, start, end and strand for a single feature, // increment a the number of hits for each feature in B file // that the feature overlaps void countHits(const BED &a, bool forceStrand); - // printing methods - void reportBedTab(const BED &); - void reportBedNewLine(const BED &); - void reportBedRangeTab(const BED &bed, int start, int end); - void reportBedRangeNewLine(const BED &bed, int start, int end); - void reportNullBedTab(void); - void reportNullBedNewLine(void); - // the bedfile with which this instance is associated string bedFile; unsigned int bedType; // 3-6, 12 for BED // 9 for GFF - masterBedMap bedMap; + + // Main data structires used by BEDTools + masterBedCovMap bedCovMap; + masterBedMap bedMap; masterBedMapNoBin bedMapNoBin; private: @@ -211,17 +248,535 @@ private: bool _isGff; istream *_bedStream; - // methods - - // parse an input line and determine how it should be handled - BedLineStatus parseLine (BED &bed, const vector<string> &lineVector, int &lineNum); - // process as line from a BED file - bool parseBedLine (BED &bed, const vector<string> &lineVector, int lineNum); - // process as line from a GFF file. convert to a BED feature. - bool parseGffLine (BED &bed, const vector<string> &lineVector, int lineNum); - void setGff (bool isGff); + void setGff (bool isGff); + + + /****************************************************** + Private definitions to circumvent linker issues with + templated member functions. + *******************************************************/ + + /* + parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { + + char *p2End, *p3End, *p4End, *p5End; + long l2, l3, l4, l5; + + // bail out if we have a blank line + if (lineVector.size() == 0) + return BED_BLANK; + + if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + + // we need at least 3 columns + if (lineVector.size() >= 3) { + + // test if columns 2 and 3 are integers. If so, assume BED. + l2 = strtol(lineVector[1].c_str(), &p2End, 10); + l3 = strtol(lineVector[2].c_str(), &p3End, 10); + + // strtol will set p2End or p3End to the start of the string if non-integral, base 10 + if (p2End != lineVector[1].c_str() && p3End != lineVector[2].c_str()) { + setGff(false); + if (parseBedLine(bed, lineVector, lineNum) == true) + return BED_VALID; + } + else if (lineVector.size() == 9) { + // otherwise test if columns 4 and 5 are integers. If so, assume GFF. + l4 = strtol(lineVector[3].c_str(), &p4End, 10); + l5 = strtol(lineVector[4].c_str(), &p5End, 10); + + // strtol will set p4End or p5End to the start of the string if non-integral, base 10 + if (p4End != lineVector[3].c_str() && p5End != lineVector[4].c_str()) { + setGff(true); + if (parseGffLine(bed, lineVector, lineNum) == true) + return BED_VALID; + } + } + else { + cerr << "Unexpected file format. Please use tab-delimited BED or GFF" << endl; + exit(1); + } + } + // gripe if it's not a blank line + else if (lineVector.size() != 0) { + cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; + exit(1); + } + } + else { + lineNum--; + return BED_HEADER; + } + + // default + return BED_INVALID; + } + + + /* + parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum) { + + if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()); + bed.end = atoi(lineVector[2].c_str()); + + if (this->bedType == 4) { + bed.name = lineVector[3]; + } + else if (this->bedType ==5) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + } + else if (this->bedType == 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5][0]; + } + else if (this->bedType > 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5][0]; + for (unsigned int i = 6; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + else if (this->bedType != 3) { + cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; + exit(1); + } + + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if ((lineNum == 1) && (lineVector.size() >= 3)) { + this->bedType = lineVector.size(); + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()); + bed.end = atoi(lineVector[2].c_str()); + + if (this->bedType == 4) { + bed.name = lineVector[3]; + } + else if (this->bedType ==5) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + } + else if (this->bedType == 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5][0]; + } + else if (this->bedType > 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5][0]; + for (unsigned int i = 6; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + else if (this->bedType != 3) { + cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; + exit(1); + } + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if (lineVector.size() == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { + cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() < 3) && (lineVector.size() != 0)) { + cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + + /* + parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum) { + + /* + 1. seqname - The name of the sequence. Must be a chromosome or scaffold. + 2. source - The program that generated this feature. + 3. feature - The name of this type of feature. Some examples of standard feature types are "CDS", + "start_codon", "stop_codon", and "exon". + 4. start - The starting position of the feature in the sequence. The first base is numbered 1. + 5. end - The ending position of the feature (inclusive). + 6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1 + for this annotation data set, the score value will determine the level of gray + in which this feature is displayed (higher numbers = darker gray). + If there is no score value, enter ".". + 7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care). + 8. frame - If the feature is a coding exon, frame should be a number between 0-2 that + represents the reading frame of the first base. If the feature is not a coding exon, + the value should be '.'. + 9. group - All lines with the same group are linked together into a single item. + */ + if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + if (this->bedType == 9 && _isGff) { + bed.chrom = lineVector[0]; + // substract 1 to force the start to be BED-style + bed.start = atoi(lineVector[3].c_str()) - 1; + bed.end = atoi(lineVector[4].c_str()); + bed.name = lineVector[2]; + bed.score = lineVector[5]; + bed.strand = atoi(lineVector[6].c_str()); + bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED + bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED + bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED + } + else { + cerr << "Error: unexpected number of fields at line: " << lineNum << + ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start > bed.end) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; + exit(1); + } + else return true; + } + else if ((lineNum == 1) && (lineVector.size() == 9)) { + this->bedType = lineVector.size(); + + if (this->bedType == 9 && _isGff) { + bed.chrom = lineVector[0]; + // substract 1 to force the start to be BED-style + bed.start = atoi(lineVector[3].c_str()) - 1; + bed.end = atoi(lineVector[4].c_str()); + bed.name = lineVector[2]; + bed.score = lineVector[5]; + bed.strand = atoi(lineVector[6].c_str()); + + bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED + bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED + bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED + + return true; + } + else { + cout << this->bedType << " " << _isGff << endl; + cerr << "Error: unexpected number of fields at line: " << lineNum << + ". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start > bed.end) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; + exit(1); + } + else return true; + } + else if (lineVector.size() == 1) { + cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { + cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() < 9) && (lineVector.size() != 0)) { + cerr << "TAB delimited GFF file with 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + - +public: + + /* + reportBedTab + + Writes the _original_ BED entry with a TAB + at the end of the line. + Works for BED3 - BED6. + */ + template <typename T> + void reportBedTab(const T &bed) { + if (_isGff == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), bed.start, bed.end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), bed.start+1, bed.end, + bed.score.c_str(), bed.strand, + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + + + + /* + reportBedNewLine + + Writes the _original_ BED entry with a NEWLINE + at the end of the line. + Works for BED3 - BED6. + */ + template <typename T> + void reportBedNewLine(const T &bed) { + if (_isGff == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), bed.start, bed.end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + printf("\n"); + } + } + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), bed.start+1, bed.end, + bed.score.c_str(), bed.strand, + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + + + + /* + reportBedRangeNewLine + + Writes a custom start->end for a BED entry + with a NEWLINE at the end of the line. + + Works for BED3 - BED6. + */ + template <typename T> + void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { + if (_isGff == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand, + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + + + + /* + reportBedRangeTab + + Writes a custom start->end for a BED entry + with a TAB at the end of the line. + + Works for BED3 - BED6. + */ + template <typename T> + void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { + if (_isGff == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + printf("\n"); + } + } + else if (this->bedType == 9) { // add 1 to the start for GFF + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand, + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + + + /* + reportNullBedTab + */ + void reportNullBedTab() { + + if (_isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\t"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\t"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\t"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); + } + } + + + /* + reportNullBedTab + */ + void reportNullBedNewLine() { + + if (_isGff == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\n"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\n"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\n"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + printf("\n"); + } + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); + } + } + + }; #endif /* BEDFILE_H */