diff --git a/src/pairToBed/pairToBed.cpp b/src/pairToBed/pairToBed.cpp index 4b586a7dcb142700011614a042df0ee1bbe53ade..b8b078ca0d1eba4c375f93f9ac342930d100433f 100755 --- a/src/pairToBed/pairToBed.cpp +++ b/src/pairToBed/pairToBed.cpp @@ -47,6 +47,21 @@ BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlap _bedA = new BedFilePE(bedAFilePE); _bedB = new BedFile(bedBFile); + + // dealing with a proper file + if (_bedA->bedFile != "stdin") { + if (_bamInput == false) + IntersectBedPE(); + else + IntersectBamPE(_bedA->bedFile); + } + // reading from stdin + else { + if (_bamInput == false) + IntersectBedPE(); + else + IntersectBamPE("stdin"); + } } @@ -324,50 +339,41 @@ bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string } -void BedIntersectPE::IntersectBedPE(istream &bedInput) { +void BedIntersectPE::IntersectBedPE() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bedB->loadBedFileIntoMap(); - string bedLine; int lineNum = 0; // current input line number - vector<BED> hits, hits1, hits2; // vector of potential hits - vector<string> bedFields; // vector for a BED entry + vector<BED> hits, hits1, hits2; // vector of potential hits // reserve some space hits.reserve(100); hits1.reserve(100); hits2.reserve(100); - bedFields.reserve(12); - - // process each entry in A - while (getline(bedInput, bedLine)) { - - lineNum++; - Tokenize(bedLine,bedFields); - BEDPE a; + + BEDPE a, nullBedPE; + + _bedA->Open(); + while (_bedA->GetNextBedPE(a, lineNum) != BED_INVALID) { - // find the overlaps with B if it's a valid BED entry. - if (_bedA->parseBedPELine(a, bedFields, lineNum)) { - if ( (_searchType == "ispan") || (_searchType == "ospan") || - (_searchType == "notispan") || (_searchType == "notospan") ) { - if (a.chrom1 == a.chrom2) { - FindSpanningOverlaps(a, hits, _searchType); - hits.clear(); - } - } - else { - FindOverlaps(a, hits1, hits2, _searchType); - hits1.clear(); - hits2.clear(); + if ( (_searchType == "ispan") || (_searchType == "ospan") || + (_searchType == "notispan") || (_searchType == "notospan") ) { + if (a.chrom1 == a.chrom2) { + FindSpanningOverlaps(a, hits, _searchType); + hits.clear(); } } - // reset for the next input line - bedFields.clear(); + else { + FindOverlaps(a, hits1, hits2, _searchType); + hits1.clear(); + hits2.clear(); + } + a = nullBedPE; } + _bedA->Close(); } -// END IntersectPE void BedIntersectPE::IntersectBamPE(string bamFile) { @@ -518,29 +524,3 @@ void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignme } -void BedIntersectPE::DetermineBedPEInput() { - - if (_bedA->bedFile != "stdin") { // process a file - if (_bamInput == false) { // BEDPE - ifstream beds(_bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << _bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - IntersectBedPE(beds); - } - else { // bam - IntersectBamPE(_bedA->bedFile); - } - } - else { // process stdin - if (_bamInput == false) { // BEDPE - IntersectBedPE(cin); - } - else { - IntersectBamPE("stdin"); - } - } -} - - diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index c464ee4fd98b8b4896831f0e76cccaa419ed0c88..239c6fc6d458e0cac2d1e849622de6f775e93e9c 100755 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -55,7 +55,7 @@ public: void FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type); bool FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type); - void IntersectBedPE(istream &bedInput); + void IntersectBedPE(); void IntersectBamPE(string bamFile); void DetermineBedPEInput(); diff --git a/src/pairToBed/pairToBedMain.cpp b/src/pairToBed/pairToBedMain.cpp index b263c36a43f44491e92f5bd9467360d7bc1023de..25eb44da1a94132e3843df7d15d0b377023bb7b3 100755 --- a/src/pairToBed/pairToBedMain.cpp +++ b/src/pairToBed/pairToBedMain.cpp @@ -147,7 +147,7 @@ int main(int argc, char* argv[]) { BedIntersectPE *bi = new BedIntersectPE(bedAFile, bedBFile, overlapFraction, searchType, forceStrand, inputIsBam, outputIsBam, useEditDistance); - bi->DetermineBedPEInput(); + delete bi; return 0; } else { diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 12931886a45c0bb7a4eb9c9ebc0ca4ce6a927703..095b62f17d5f0dbdc8a3927dbf2dc1c9e3bfbeea 100755 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -19,14 +19,16 @@ PairToPair::PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction, string searchType, bool ignoreStrand) { - this->bedAFilePE = bedAFilePE; - this->bedBFilePE = bedBFilePE; - this->overlapFraction = overlapFraction; - this->searchType = searchType; - this->ignoreStrand = ignoreStrand; + _bedAFilePE = bedAFilePE; + _bedBFilePE = bedBFilePE; + _overlapFraction = overlapFraction; + _searchType = searchType; + _ignoreStrand = ignoreStrand; - this->bedA = new BedFilePE(bedAFilePE); - this->bedB = new BedFilePE(bedBFilePE); + _bedA = new BedFilePE(bedAFilePE); + _bedB = new BedFilePE(bedBFilePE); + + IntersectPairs(); } @@ -38,40 +40,33 @@ PairToPair::~PairToPair(void) { -void PairToPair::IntersectPairs(istream &bedInput) { +void PairToPair::IntersectPairs() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps - bedB->loadBedPEFileIntoMap(); - - int lineNum = 0; - string bedLine; - vector<string> bedFields; // vector for a BED entry + _bedB->loadBedPEFileIntoMap(); + int lineNum = 0; vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; // reserve some space hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); - bedFields.reserve(10); + BedLineStatus bedStatus; + BEDPE a, nullBedPE; - // process each entry in A - while (getline(bedInput, bedLine)) { - - BEDPE a; - Tokenize(bedLine,bedFields); - lineNum++; + _bedA->Open(); + bedStatus = _bedA->GetNextBedPE(a, lineNum); + while (bedStatus != BED_INVALID) { + if (bedStatus == BED_VALID) { + FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, _searchType); - // find the overlaps with B if it's a valid BED entry. - if (bedA->parseBedPELine(a, bedFields, lineNum)) { - - FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, this->searchType); - // reset space for next BEDPE - hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); + hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); + a = nullBedPE; } - // reset for the next input line - bedFields.clear(); + bedStatus = _bedA->GetNextBedPE(a, lineNum); } + _bedA->Close(); } // END IntersectPE @@ -96,10 +91,10 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> // Find the _potential_ hits between each end of A and B - bedB->FindOverlapsPerBin(1, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B1, !(ignoreStrand)); // hits between A1 to B1 - bedB->FindOverlapsPerBin(1, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B1, !(ignoreStrand)); // hits between A2 to B1 - bedB->FindOverlapsPerBin(2, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B2, !(ignoreStrand)); // hits between A1 to B2 - bedB->FindOverlapsPerBin(2, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B2, !(ignoreStrand)); // hits between A2 to B2 + _bedB->FindOverlapsPerBin(1, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B1, !(_ignoreStrand)); // hits between A1 to B1 + _bedB->FindOverlapsPerBin(1, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B1, !(_ignoreStrand)); // hits between A2 to B1 + _bedB->FindOverlapsPerBin(2, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B2, !(_ignoreStrand)); // hits between A1 to B2 + _bedB->FindOverlapsPerBin(2, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B2, !(_ignoreStrand)); // hits between A2 to B2 // Now, reduce to the set of hits on each end of A and B that meet the required overlap fraction and orientation. @@ -119,8 +114,8 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> } - if ((matchCount1 == 0) && (matchCount2 == 0) && (this->searchType == "neither")) { - bedA->reportBedPENewLine(a); + if ((matchCount1 == 0) && (matchCount2 == 0) && (_searchType == "neither")) { + _bedA->reportBedPENewLine(a); } } @@ -138,7 +133,7 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto int e = min(a.end1, h->end); // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(a.end1 - a.start1)) >= this->overlapFraction ) { + if ( ((float)(e-s) / (float)(a.end1 - a.start1)) >= _overlapFraction ) { numOverlaps++; qualityHits.push_back(*h); } @@ -153,7 +148,7 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto int s = max(a.start2, h->start); int e = min(a.end2, h->end); // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(a.end2 - a.start2)) >= this->overlapFraction ) { + if ( ((float)(e-s) / (float)(a.end2 - a.start2)) >= _overlapFraction ) { numOverlaps++; qualityHits.push_back(*h); } @@ -182,8 +177,8 @@ void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHi BED b1 = m->second[0]; BED b2 = m->second[1]; - if (this->searchType == "both") { - bedA->reportBedPETab(a); + if (_searchType == "both") { + _bedA->reportBedPETab(a); printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", b1.chrom.c_str(), b1.start, b1.end, b2.chrom.c_str(), b2.start, b2.end, b1.name.c_str(), b1.score.c_str(), @@ -193,18 +188,3 @@ void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHi } } - -void PairToPair::DetermineBedPEInput() { - - if (bedA->bedFile != "stdin") { // process a file - ifstream beds(bedA->bedFile.c_str(), ios::in); - if ( !beds ) { - cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } - IntersectPairs(beds); - } - else { // process stdin - IntersectPairs(cin); - } -} diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index 2eb676a0b2dc27eba9af58e8fb91c4ecd708a2ac..3dd057d095010335f6517978c35fbff30b94259f 100755 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -33,7 +33,7 @@ public: // destructor ~PairToPair(void); - void IntersectPairs(istream &bedInput); + void IntersectPairs(); void FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type); @@ -44,22 +44,21 @@ public: void FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, const vector<BED> &qualityHitsEnd2, int &matchCount); - void DetermineBedPEInput(); private: - string bedAFilePE; - string bedBFilePE; + string _bedAFilePE; + string _bedBFilePE; - float overlapFraction; - string searchType; - bool ignoreStrand; + float _overlapFraction; + string _searchType; + bool _ignoreStrand; // instance of a paired-end bed file class. - BedFilePE *bedA; + BedFilePE *_bedA; // instance of a bed file class. - BedFilePE *bedB; + BedFilePE *_bedB; }; #endif /* PAIRTOPAIR_H */ diff --git a/src/pairToPair/pairToPairMain.cpp b/src/pairToPair/pairToPairMain.cpp index aec4986dcf646b0dba7c0a646cfd4348ad8878d2..5dc7874d5b660ddff230483d59d2334eebdbe7c5 100755 --- a/src/pairToPair/pairToPairMain.cpp +++ b/src/pairToPair/pairToPairMain.cpp @@ -114,7 +114,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { PairToPair *bi = new PairToPair(bedAFile, bedBFile, overlapFraction, searchType, ignoreStrand); - bi->DetermineBedPEInput(); + delete bi; return 0; } else { diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index cbfff474725b29e7eb1aa9f3fd6ff6e3c7ac16b1..0c24323312e2ded5fb393e5185e3faad03c3e65f 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -156,7 +156,7 @@ void BedFile::Open(void) { // Close the BED file void BedFile::Close(void) { - delete _bedStream; + if (bedFile != "stdin") delete _bedStream; } diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 5ad89bcf148685a0336de2efb9bba9c13da499c1..873fa1613c2c3da61c48cbdd2e6080e839b2b0bb 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -198,10 +198,6 @@ public: void reportNullBedTab(void); void reportNullBedNewLine(void); - // parse an input line and determine how it should be handled - BedLineStatus parseLine (BED &bed, const vector<string> &lineVector, int &lineNum); - - // the bedfile with which this instance is associated string bedFile; unsigned int bedType; // 3-6, 12 for BED @@ -217,6 +213,8 @@ private: // methods + // parse an input line and determine how it should be handled + BedLineStatus parseLine (BED &bed, const vector<string> &lineVector, int &lineNum); // process as line from a BED file bool parseBedLine (BED &bed, const vector<string> &lineVector, int lineNum); // process as line from a GFF file. convert to a BED feature. diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 2a675f774a399661b05ab1d68dd13c39d441bd07..27c7a3cb4210413990cdaf6b8dfbb74cd1b42cc2 100755 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -15,70 +15,87 @@ #include "bedFilePE.h" -/* - Adapted from kent source "binKeeperFind" -*/ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { +// Constructor +BedFilePE::BedFilePE(string &bedFile) { + this->bedFile = bedFile; +} - int startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); +// Destructor +BedFilePE::~BedFilePE(void) { +} - // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { - - // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { - - // loop through each feature in this chrom/bin and see if it overlaps - // with the feature that was passed in. if so, add the feature to - // the list of hits. - vector<BED>::const_iterator bedItr; - vector<BED>::const_iterator bedEnd; - if (bEnd == 1) { - bedItr = bedMapEnd1[chrom][j].begin(); - bedEnd = bedMapEnd1[chrom][j].end(); - } - else if (bEnd == 2) { - bedItr = bedMapEnd2[chrom][j].begin(); - bedEnd = bedMapEnd2[chrom][j].end(); +void BedFilePE::Open(void) { + if (bedFile == "stdin") { + _bedStream = &cin; + } + else { + size_t foundPos; + foundPos = bedFile.find_last_of(".gz"); + // is this a GZIPPED BED file? + if (foundPos == bedFile.size() - 1) { + igzstream beds(bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bedpe file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); } else { - cerr << "Unexpected end of B requested" << endl; + // if so, close it (this was just a test) + beds.close(); + // now set a pointer to the stream so that we + // can read the file later on. + // Thank God for Josuttis, p. 631! + _bedStream = new igzstream(bedFile.c_str(), ios::in); } - for (; bedItr != bedEnd; ++bedItr) { - - // skip the hit if not on the same strand (and we care) - if (forceStrand && (strand != bedItr->strand)) { - continue; - } - else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { - hits.push_back(*bedItr); // it's a hit, add it. - } - + } + // not GZIPPED. + else { + + ifstream beds(bedFile.c_str(), ios::in); + // can we open the file? + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + else { + // if so, close it (this was just a test) + beds.close(); + // now set a pointer to the stream so that we + // can read the file later on. + // Thank God for Josuttis, p. 631! + _bedStream = new ifstream(bedFile.c_str(), ios::in); } } - startBin >>= _binNextShift; - endBin >>= _binNextShift; } } +// Close the BEDPE file +void BedFilePE::Close(void) { + if (bedFile != "stdin") delete _bedStream; +} +BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) { -//*********************************************** -// Common functions -//*********************************************** - -// Constructor -BedFilePE::BedFilePE(string &bedFile) { - this->bedFile = bedFile; -} + // make sure there are still lines to process. + // if so, tokenize, validate and return the BEDPE entry. + if (_bedStream->good()) { + string bedPELine; + vector<string> bedPEFields; + bedPEFields.reserve(10); + + // parse the bedStream pointer + getline(*_bedStream, bedPELine); + lineNum++; -// Destructor -BedFilePE::~BedFilePE(void) { + // split into a string vector. + Tokenize(bedPELine,bedPEFields); + + // load the BEDPE struct as long as it's a valid BEDPE entry. + return parseLine(bedpe, bedPEFields, lineNum); + } + // default if file is closed or EOF + return BED_INVALID; } @@ -167,10 +184,37 @@ void BedFilePE::reportBedPENewLine(const BEDPE &a) { } +BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) { + + // bail out if we have a blank line + if (lineVector.empty()) + return BED_BLANK; + + if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + // we need at least 6 columns + if (lineVector.size() >= 6) { + if (parseBedPELine(bedpe, lineVector, lineNum) == true) + return BED_VALID; + else return BED_INVALID; + } + else { + cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl; + exit(1); + } + } + else { + lineNum--; + return BED_HEADER; + } + + // default + return BED_INVALID; +} + bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) { - if ((lineNum == 1) && (lineVector.size() >= 3)) { + if ((lineNum == 1) && (lineVector.size() >= 6)) { this->bedType = lineVector.size(); @@ -253,15 +297,15 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co } if (bed.start1 > bed.end1) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; return false; } else if (bed.start2 > bed.end2) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; return false; } else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; return false; } } @@ -374,76 +418,90 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co } -void BedFilePE::loadBedPEFileIntoMap() { - - // open the BEDPE file for reading - ifstream bed(bedFile.c_str(), ios::in); - if ( !bed ) { - cerr << "Error: The requested bed file (" <<bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } +/* + Adapted from kent source "binKeeperFind" +*/ +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { - string bedLine; - BEDPE bedpeEntry; - BED bedEntry1, bedEntry2; - - unsigned int lineNum = 0; - int bin1, bin2; - vector<string> bedFields; // vector of strings for each column in BED file. - bedFields.reserve(10); // reserve space for worst case (BED 6) + int startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); - while (getline(bed, bedLine)) { - lineNum++; + // loop through each bin "level" in the binning hierarchy + for (int i = 0; i < 6; ++i) { - if (lineNum > 1) { - - Tokenize(bedLine,bedFields); - - if (parseBedPELine(bedpeEntry, bedFields, lineNum)) { - - // separate the BEDPE entry into separate - // BED entries - splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); - - // load end1 into a UCSC bin map - bin1 = getBin(bedEntry1.start, bedEntry1.end); - this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1); - - // load end2 into a UCSC bin map - bin2 = getBin(bedEntry2.start, bedEntry2.end); - this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2); + // loop through each bin at this level of the hierarchy + int offset = binOffsetsExtended[i]; + for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr; + vector<BED>::const_iterator bedEnd; + if (bEnd == 1) { + bedItr = bedMapEnd1[chrom][j].begin(); + bedEnd = bedMapEnd1[chrom][j].end(); } - bedFields.clear(); - - } - else { - if ((bedLine.find("track") != string::npos) || (bedLine.find("browser") != string::npos)) { - continue; + else if (bEnd == 2) { + bedItr = bedMapEnd2[chrom][j].begin(); + bedEnd = bedMapEnd2[chrom][j].end(); } else { - Tokenize(bedLine,bedFields); + cerr << "Unexpected end of B requested" << endl; + } + for (; bedItr != bedEnd; ++bedItr) { + + // skip the hit if not on the same strand (and we care) + if (forceStrand && (strand != bedItr->strand)) { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { + hits.push_back(*bedItr); // it's a hit, add it. + } + + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} - if (parseBedPELine(bedpeEntry, bedFields, lineNum)) { - // separate the BEDPE entry into separate - // BED entries - splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); - // load end1 into a UCSC bin map - bin1 = getBin(bedEntry1.start, bedEntry1.end); - this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1); +void BedFilePE::loadBedPEFileIntoMap() { - // load end2 into a UCSC bin map - bin2 = getBin(bedEntry2.start, bedEntry2.end); - this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2); - } - bedFields.clear(); - } + int lineNum = 0; + int bin1, bin2; + BedLineStatus bedStatus; + BEDPE bedpeEntry, nullBedPE; + + Open(); + bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); + while (bedStatus != BED_INVALID) { + + if (bedStatus == BED_VALID) { + BED bedEntry1, bedEntry2; + // separate the BEDPE entry into separate + // BED entries + splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); + + // load end1 into a UCSC bin map + bin1 = getBin(bedEntry1.start, bedEntry1.end); + this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1); + + // load end2 into a UCSC bin map + bin2 = getBin(bedEntry2.start, bedEntry2.end); + this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2); + + bedpeEntry = nullBedPE; } + bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); } + Close(); } -void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, unsigned int lineNum, BED &bedEntry1, BED &bedEntry2) { +void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BED &bedEntry1, BED &bedEntry2) { /* Split the BEDPE entry into separate BED entries @@ -457,22 +515,22 @@ void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, unsigned int lineNum read-pair. */ - bedEntry1.chrom = bedpeEntry.chrom1; - bedEntry1.start = bedpeEntry.start1; - bedEntry1.end = bedpeEntry.end1; - bedEntry1.name = bedpeEntry.name; - bedEntry1.score = bedpeEntry.score; - bedEntry1.strand = bedpeEntry.strand1; - bedEntry1.count = lineNum; + bedEntry1.chrom = bedpeEntry.chrom1; + bedEntry1.start = bedpeEntry.start1; + bedEntry1.end = bedpeEntry.end1; + bedEntry1.name = bedpeEntry.name; + bedEntry1.score = bedpeEntry.score; + bedEntry1.strand = bedpeEntry.strand1; + bedEntry1.count = lineNum; bedEntry1.minOverlapStart = INT_MAX; - bedEntry2.chrom = bedpeEntry.chrom2; - bedEntry2.start = bedpeEntry.start2; - bedEntry2.end = bedpeEntry.end2; - bedEntry2.name = bedpeEntry.name; - bedEntry2.score = bedpeEntry.score; - bedEntry2.strand = bedpeEntry.strand2; - bedEntry2.count = lineNum; + bedEntry2.chrom = bedpeEntry.chrom2; + bedEntry2.start = bedpeEntry.start2; + bedEntry2.end = bedpeEntry.end2; + bedEntry2.name = bedpeEntry.name; + bedEntry2.score = bedpeEntry.score; + bedEntry2.strand = bedpeEntry.strand2; + bedEntry2.count = lineNum; bedEntry2.minOverlapStart = INT_MAX; } diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index deaf08ea3134224a8de10e720787c46d7aea8021..82750752694f4f199d28a504f2ce9f28367d1e00 100755 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -54,20 +54,27 @@ public: // Destructor ~BedFilePE(void); + // Open a BEDPE file for reading (creates an istream pointer) + void Open(void); + + // Close an opened BEDPE file. + void Close(void); + + // Get the next BED entry in an opened BED file. + BedLineStatus GetNextBedPE (BEDPE &bedpe, int &lineNum); + + // Methods - bool parseBedPELine (BEDPE &, const vector<string> &, const int &); - void reportBedPETab(const BEDPE &); - void reportBedPENewLine(const BEDPE &); + + void reportBedPETab(const BEDPE &a); + void reportBedPENewLine(const BEDPE &a); void loadBedPEFileIntoMap(); - void splitBedPEIntoBeds(const BEDPE &, unsigned int, BED &, BED &); + void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BED &bedEntry1, BED &bedEntry2); void FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand); - //void binKeeperFind(map<int, vector<BED>, - // std::less<int> > &, const int, - // const int, vector<BED> &); string bedFile; unsigned int bedType; @@ -76,7 +83,11 @@ public: masterBedMap bedMapEnd2; private: - // none + istream *_bedStream; + + // methods + BedLineStatus parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum); + bool parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum); }; #endif /* BEDFILEPE_H */