diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile old mode 100644 new mode 100755 diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 7c803089583daeb28680176a07641546faa5556e..d15e0a3f434f91edab531f2f84a49aa5f630e3b8 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -12,6 +12,53 @@ #include "lineFileUtilities.h" #include "intersectBed.h" +/************************************ +Helper functions +************************************/ +bool BedIntersect::processHits(const BED &a, const vector<BED> &hits, bool printable) { + + // how many overlaps are there b/w the bed and the set of hits? + int numOverlaps = 0; + bool hitsFound = false; + + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + int s = max(a.start, h->start); + int e = min(a.end, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + int aLength = (a.end - a.start); // the length of a in b.p. + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + + // Report the hit if the user doesn't care about reciprocal overlap between A and B. + if (_reciprocal == false) { + hitsFound = true; + numOverlaps++; + } + // we require there to be sufficient __reciprocal__ overlap + else { + int bLength = (h->end - h->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + if (bOverlap >= _overlapFraction) { + hitsFound = true; + numOverlaps++; + } + } + // report the overlap per the user's request. + if (printable == true) { + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + } + } + // report the summary of the overlaps if requested. + ReportOverlapSummary(a, numOverlaps); + // were hits found for this BED feature? + return hitsFound; +} + /* Constructor @@ -19,7 +66,7 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool bamInput, bool bamOutput) { + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { _bedAFile = bedAFile; _bedBFile = bedBFile; @@ -33,6 +80,7 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, _overlapFraction = overlapFraction; _forceStrand = forceStrand; _reciprocal = reciprocal; + _obeySplits = obeySplits; _bamInput = bamInput; _bamOutput = bamOutput; @@ -64,57 +112,38 @@ BedIntersect::~BedIntersect(void) { } -bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { +bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits, int lineNum) { bool hitsFound = false; - - // grab _all_ of the features in B that overlap with a. - _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); - - // how many overlaps are there b/w a and B? - int numOverlaps = 0; - + // should we print each overlap, or does the user want summary information? bool printable = true; if (_anyHit || _noHit || _writeCount) printable = false; - // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - int s = max(a.start, h->start); - int e = min(a.end, h->end); - int overlapBases = (e - s); // the number of overlapping bases b/w a and b - int aLength = (a.end - a.start); // the length of a in b.p. - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - - // Report the hit if the user doesn't care about reciprocal overlap between A and B. - if (_reciprocal == false) { - hitsFound = true; - numOverlaps++; - } - // we require there to be sufficient __reciprocal__ overlap - else { - int bLength = (h->end - h->start); - float bOverlap = ( (float) overlapBases / (float) bLength ); - if (bOverlap >= _overlapFraction) { - hitsFound = true; - numOverlaps++; - } - } - // report the overlap per the user's request. - if (printable == true) { - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - } - } - // report the summary of the overlaps if requested. - ReportOverlapSummary(a, numOverlaps); - // were hits found for this BED feature? - return hitsFound; + if (_obeySplits == false) { + // grab _all_ of the features in B that overlap with a. + _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); + hitsFound = processHits(a, hits, printable); + } + else { + bool hitFoundForBlock = false; + bedVector bedBlocks; // vec to store the discrete BED "blocks" from a + splitBedIntoBlocks(a, lineNum, bedBlocks); + + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + // collect and process the hits for this "block" + _bedB->FindOverlapsPerBin(bedItr->chrom, bedItr->start, bedItr->end, bedItr->strand, hits, _forceStrand); + hitFoundForBlock = processHits(*bedItr, hits, printable); + + if (hitFoundForBlock == true) + hitsFound = true; + hits.clear(); + } + } + return hitsFound; } @@ -200,7 +229,7 @@ void BedIntersect::IntersectBed() { _bedA->Open(); while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - FindOverlaps(a, hits); + FindOverlaps(a, hits, lineNum); hits.clear(); a = nullBed; } @@ -266,7 +295,7 @@ void BedIntersect::IntersectBam(string bamFile) { } } else { - overlapsFound = FindOverlaps(a, hits); + overlapsFound = FindOverlaps(a, hits, 0); hits.clear(); } } diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 6d2779190f62b267ebe89d8a6f0a0b21d9343f98..748d5f23615d26223389c337f7d89d0323d92dab 100755 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -13,9 +13,9 @@ #define INTERSECTBED_H #include "bedFile.h" - #include "BamReader.h" #include "BamWriter.h" +#include "BamAncillary.h" #include "BamAux.h" using namespace BamTools; @@ -27,6 +27,7 @@ using namespace BamTools; using namespace std; + class BedIntersect { public: @@ -35,7 +36,7 @@ public: BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool forceStrand, - bool reciprocal, bool bamInput, bool bamOutput); + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); // destructor ~BedIntersect(void); @@ -60,7 +61,7 @@ private: bool _anyHit; bool _noHit; bool _writeCount; // do we want a count of the number of overlaps in B? - + bool _obeySplits; bool _bamInput; bool _bamOutput; @@ -76,7 +77,9 @@ private: void IntersectBam(string bamFile); - bool FindOverlaps(const BED &a, vector<BED> &hits); + bool processHits(const BED &a, const vector<BED> &hits, bool printable); + + bool FindOverlaps(const BED &a, vector<BED> &hits, int lineNum); bool FindOneOrMoreOverlap(const BED &a); void ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index d4a2277a9e6161ab043439ada14f5246d1520100..c428a9689a11a2bf76acba73a3631e01683eaa5e 100755 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -48,6 +48,7 @@ int main(int argc, char* argv[]) { bool haveFraction = false; bool reciprocalFraction = false; bool forceStrand = false; + bool obeySplits = false; bool inputIsBam = false; bool outputIsBam = true; @@ -131,6 +132,9 @@ int main(int argc, char* argv[]) { else if (PARAMETER_CHECK("-s", 2, parameterLength)) { forceStrand = true; } + else if (PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -193,7 +197,7 @@ int main(int argc, char* argv[]) { BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, - reciprocalFraction, inputIsBam, outputIsBam); + reciprocalFraction, obeySplits, inputIsBam, outputIsBam); delete bi; return 0; } @@ -257,6 +261,9 @@ void ShowHelp(void) { cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; cerr << "\t\toverlap A on the same strand." << endl; cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; + // end the program here exit(1); diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index ec2fe6e1a106a96eb2d497c8d08d7784c90579a8..757522c246e66457e2b2a734fa16939f1ee2d527 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -13,6 +13,51 @@ #include "bedFile.h" +/************************************************ +Helper functions +*************************************************/ +void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { + + if (bed.otherFields.size() < 6) { + cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; + exit(1); + } + + int blockCount = atoi(bed.otherFields[3].c_str()); + if ( blockCount <= 0 ) { + cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; + exit(1); + } + else if ( blockCount == 1 ) { + //take a short-cut for single blocks + bedBlocks.push_back(bed); + } + else { + // get the comma-delimited strings for the BED12 block starts and block ends. + string blockSizes(bed.otherFields[4]); + string blockStarts(bed.otherFields[5]); + + vector<int> sizes; + vector<int> starts; + Tokenize(blockSizes, sizes, ","); + Tokenize(blockStarts, starts, ","); + + if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { + cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; + exit(1); + } + + // add each BED block to the bedBlocks vector + for (UINT i = 0; i < (UINT) blockCount; ++i) { + UINT blockStart = bed.start + starts[i]; + UINT blockEnd = bed.start + starts[i] + sizes[i]; + BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields); + bedBlocks.push_back(currBedBlock); + } + } +} + + /*********************************************** Sorting comparison functions ************************************************/ diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 5c67005c8e92eecbdb3aefd8afea3649c00d5f08..6c81ce481a754953d97081c94c9949566cb28c75 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -111,6 +111,18 @@ public: end(end), strand(strand) {} + + // BEDALL + BED(string chrom, CHRPOS start, CHRPOS end, string name, + string score, string strand, vector<string> otherFields) + : chrom(chrom), + start(start), + end(end), + name(name), + score(score), + strand(strand), + otherFields(otherFields) + {} }; // BED @@ -149,6 +161,29 @@ enum BedLineStatus }; +//************************************************* +// Data structure typedefs +//************************************************* +typedef vector<BED> bedVector; +typedef vector<BEDCOV> bedCovVector; + +typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; +typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; + +typedef map<string, binsToBeds, std::less<string> > masterBedMap; +typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; +typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; + + +// EXPERIMENTAL +//typedef tr1::unordered_map<int, vector<BED> > binsToBeds; +//typedef tr1::unordered_map<string, binsToBeds> masterBedMap; +//typedef map<string, bedVector, std::less<string> > masterBedMap; +//typedef vector<BED> masterBedMapNew[_numBins]; +//typedef tr1::unordered_map<int, vector<BED> > binsToBedsNew; +//typedef tr1::unordered_map<string, binsToBeds> masterBedMapNew; + + // return the genome "bin" for a feature with this start and end inline BIN getBin(CHRPOS start, CHRPOS end) { @@ -185,7 +220,7 @@ std::string ToString(const T & value) // Ancillary functions - +void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); // BED Sorting Methods @@ -199,31 +234,6 @@ bool byChromThenStart(BED const &a, BED const &b); -//************************************************* -// Data structure typedefs -//************************************************* -typedef vector<BED> bedVector; -typedef vector<BEDCOV> bedCovVector; - -typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; -typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; - -typedef map<string, binsToBeds, std::less<string> > masterBedMap; -typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; -typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; - - -// EXPERIMENTAL -//typedef tr1::unordered_map<int, vector<BED> > binsToBeds; -//typedef tr1::unordered_map<string, binsToBeds> masterBedMap; -//typedef map<string, bedVector, std::less<string> > masterBedMap; -//typedef vector<BED> masterBedMapNew[_numBins]; -//typedef tr1::unordered_map<int, vector<BED> > binsToBedsNew; -//typedef tr1::unordered_map<string, binsToBeds> masterBedMapNew; - - - - //************************************************ // BedFile Class methods and elements //************************************************ diff --git a/src/utils/lineFileUtilities/lineFileUtilities.cpp b/src/utils/lineFileUtilities/lineFileUtilities.cpp index dfbdf77a19ba1691f8c0f1a87a7a6ee06250011a..9c0b7e9a827400d1b0108672b151bb1594820d78 100755 --- a/src/utils/lineFileUtilities/lineFileUtilities.cpp +++ b/src/utils/lineFileUtilities/lineFileUtilities.cpp @@ -14,7 +14,7 @@ // lineFileUtilities: // Common Functions //*********************************************** -void Tokenize(string str, vector<string> &tokens, const string &delimiter) { +void Tokenize(const string &str, vector<string> &tokens, const string &delimiter) { /* //method to tokenize on any whitespace @@ -30,8 +30,7 @@ void Tokenize(string str, vector<string> &tokens, const string &delimiter) { // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiter, lastPos); - while (string::npos != pos || string::npos != lastPos) - { + while (string::npos != pos || string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(str.substr(lastPos, pos - lastPos)); // Skip delimiters. Note the "not_of" @@ -39,7 +38,22 @@ void Tokenize(string str, vector<string> &tokens, const string &delimiter) { // Find next "non-delimiter" pos = str.find_first_of(delimiter, lastPos); } - +} + +void Tokenize(const string &str, vector<int> &tokens, const string &delimiter) { + // Skip delimiters at beginning. + string::size_type lastPos = str.find_first_not_of(delimiter, 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of(delimiter, lastPos); + + while (string::npos != pos || string::npos != lastPos) { + // Found a token, add it to the vector. + tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str())); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of(delimiter, pos); + // Find next "non-delimiter" + pos = str.find_first_of(delimiter, lastPos); + } } diff --git a/src/utils/lineFileUtilities/lineFileUtilities.h b/src/utils/lineFileUtilities/lineFileUtilities.h index c4ab8a48751e56fcad95749fc0d73ef45262db7c..fd3dfb0505e77c1c02688e07e037504ce769c60b 100755 --- a/src/utils/lineFileUtilities/lineFileUtilities.h +++ b/src/utils/lineFileUtilities/lineFileUtilities.h @@ -9,7 +9,8 @@ using namespace std; // split a line from a file into a vector of strings. token = "\t" -void Tokenize(string str, vector<string>& tokens, const string &delimiter = "\t"); +void Tokenize(const string &str, vector<string>& tokens, const string &delimiter = "\t"); +void Tokenize(const string &str, vector<int>& tokens, const string &delimiter = "\t"); #endif /* LINEFILEUTILITIES_H */