diff --git a/bin/bamToBed b/bin/bamToBed index b71a166409704697a9c09469a84bf9fbf3e8ac18..b15718ee7c9f74ee128666db8b08b1c072834c1e 100755 Binary files a/bin/bamToBed and b/bin/bamToBed differ diff --git a/obj/bamToBed.o b/obj/bamToBed.o index 0153502b8258a2c5446569892c0591170119443b..919884ea466ff0a4e16620536e4b3dd40e3402a5 100644 Binary files a/obj/bamToBed.o and b/obj/bamToBed.o differ diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index 41a73784a386e173311147f11692344741ffdccc..4de8238dc9a90806f1ac5eed783e3cdb563b7a8f 100755 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -152,17 +152,20 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista string strand = "+"; if (bam.IsReverseStrand()) strand = "-"; + string name = bam.Name; + if (bam.IsFirstMate()) name += "/1"; + if (bam.IsSecondMate()) name += "/2"; if (useEditDistance == false) { printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - bam.Position + bam.Length, bam.Name.c_str(), + bam.Position + bam.Length, name.c_str(), bam.MapQuality, strand.c_str()); } else { uint8_t editDistance; if (bam.GetEditDistance(editDistance)) { printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - bam.Position + bam.Length, bam.Name.c_str(), + bam.Position + bam.Length, name.c_str(), editDistance, strand.c_str()); } } @@ -175,7 +178,7 @@ void PrintBedPE(const BamAlignment &bam, const RefVector &refs, bool useEditDis string strand1 = "+"; string strand2 = "+"; if (bam.IsReverseStrand()) strand1 = "-"; - if (bam.IsMateReverseStrand()) strand2 = "-"; + if (bam.IsMateReverseStrand()) strand2 = "-"; printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, bam.Position + bam.Length, diff --git a/src/pairToBed/pairToBedMain.cpp b/src/pairToBed/pairToBedMain.cpp index 4d7948f25340f56cf27802534a374c9617d2630c..950edced4b4cbbd4af10925deb93e2b167847687 100755 --- a/src/pairToBed/pairToBedMain.cpp +++ b/src/pairToBed/pairToBedMain.cpp @@ -167,7 +167,7 @@ void ShowHelp(void) { cerr << "\t\tDefault is to ignore stand." << endl; cerr << "\t\tNot applicable with -type inspan or -type outspan." << endl << endl; - cerr << "\t-type \t" << "Report overlaps if between BEDPE A and BED B." << endl << endl; + cerr << "\t-type \t" << "Approach to reporting overlaps between BEDPE and BED." << endl << endl; cerr << "\t\teither\tReport overlaps if either end of A overlaps B." << endl; cerr << "\t\t\t- Default." << endl; cerr << "\t\tneither\tReport A if neither end of A overlaps B." << endl; @@ -182,7 +182,7 @@ void ShowHelp(void) { cerr << "\t\tnotospan\tReport A if ospan of A doesn't overlap B." << endl; cerr << "\t\t\t- Note: If chrom1 <> chrom2, entry is ignored." << endl << endl; - cerr << "See BEDTools manual for BEDPE file." << endl << endl; + cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; exit(1); } diff --git a/src/pairToPair/a.bedpe b/src/pairToPair/a.bedpe new file mode 100644 index 0000000000000000000000000000000000000000..972367756cda858caf3bdca5d0423442370c627e --- /dev/null +++ b/src/pairToPair/a.bedpe @@ -0,0 +1 @@ +chr1 10 20 chr1 19 120 a1 1 + - diff --git a/src/pairToPair/b.bedpe b/src/pairToPair/b.bedpe new file mode 100644 index 0000000000000000000000000000000000000000..1a551009b26e42120bbd5de934073ffccaf2b0b6 --- /dev/null +++ b/src/pairToPair/b.bedpe @@ -0,0 +1 @@ +chr1 10 20 chr1 19 120 a1 1 - - diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 1ef5a402a6330f1a0aca7076a15b166d2f070bd8..12931886a45c0bb7a4eb9c9ebc0ca4ce6a927703 100755 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -1,11 +1,14 @@ -/* - BEDTools: pairToPair.cpp +/***************************************************************************** + pairToPair.cpp - Created by Aaron Quinlan Spring 2009. - Copyright 2009 Aaron Quinlan. All rights reserved. + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com - Summary: Looks for overlaps between paired-end reads / SVs (BEDPE format) and a BED file. -*/ + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ #include "lineFileUtilities.h" #include "pairToPair.h" @@ -35,71 +38,47 @@ PairToPair::~PairToPair(void) { -void PairToPair::IntersectPairs() { +void PairToPair::IntersectPairs(istream &bedInput) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps bedB->loadBedPEFileIntoMap(); - - string bedLine; + int lineNum = 0; - - // are we dealing with a file? - if (bedA->bedFile != "stdin") { - - // open the BEDPE file for reading - ifstream bed(bedA->bedFile.c_str(), ios::in); - if ( !bed ) { - cerr << "Error: The requested bedpe file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl; - exit (1); - } + string bedLine; + vector<string> bedFields; // vector for a BED entry + + vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; + // reserve some space + hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); + + bedFields.reserve(10); + + // process each entry in A + while (getline(bedInput, bedLine)) { - BEDPE a; - // process each entry in A - while (getline(bed, bedLine)) { - - // split the current line into ditinct fields - vector<string> bedFields; - Tokenize(bedLine,bedFields); - - lineNum++; - - // find the overlaps with B if it's a valid BED entry. - if (bedA->parseBedPELine(a, bedFields, lineNum)) { - vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; - - FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, this->searchType); - } - } - } - // "A" is being passed via STDIN. - else { + BEDPE a; + Tokenize(bedLine,bedFields); + lineNum++; - BEDPE a; - // process each entry in A - while (getline(cin, bedLine)) { - - // split the current line into ditinct fields - vector<string> bedFields; - Tokenize(bedLine,bedFields); - - lineNum++; + // find the overlaps with B if it's a valid BED entry. + if (bedA->parseBedPELine(a, bedFields, lineNum)) { - // find the overlaps with B if it's a valid BED entry. - if (bedA->parseBedPELine(a, bedFields, lineNum)) { - vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; - - FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, this->searchType); - } + FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, this->searchType); + + // reset space for next BEDPE + hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); } + // reset for the next input line + bedFields.clear(); } } // END IntersectPE -void PairToPair::FindOverlaps(BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, - vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string &type) { +void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, + vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type) { // list of hits on each end of BEDPE // that exceed the requested overlap fraction @@ -117,10 +96,10 @@ void PairToPair::FindOverlaps(BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hits // Find the _potential_ hits between each end of A and B - bedB->binKeeperFind(a.chrom1, a.start1, a.end1, hitsA1B1); // hits between A1 to B1 - bedB->binKeeperFind(a.chrom2, a.start2, a.end2, hitsA2B1); // hits between A2 to B1 - bedB->binKeeperFind(a.chrom1, a.start1, a.end1, hitsA1B2); // hits between A1 to B2 - bedB->binKeeperFind(a.chrom2, a.start2, a.end2, hitsA2B2); // hits between A2 to B2 + bedB->FindOverlapsPerBin(1, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B1, !(ignoreStrand)); // hits between A1 to B1 + bedB->FindOverlapsPerBin(1, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B1, !(ignoreStrand)); // hits between A2 to B1 + bedB->FindOverlapsPerBin(2, a.chrom1, a.start1, a.end1, a.strand1, hitsA1B2, !(ignoreStrand)); // hits between A1 to B2 + bedB->FindOverlapsPerBin(2, a.chrom2, a.start2, a.end2, a.strand2, hitsA2B2, !(ignoreStrand)); // hits between A2 to B2 // Now, reduce to the set of hits on each end of A and B that meet the required overlap fraction and orientation. @@ -147,40 +126,36 @@ void PairToPair::FindOverlaps(BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hits -void PairToPair::FindQualityHitsBetweenEnds(BEDPE a, int end, vector<BED> &hits, +void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, vector<BED> &qualityHits, int &numOverlaps) { if (end == 1) { - for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { - - if ((this->ignoreStrand == true) || (a.strand1 == h->strand)) { - - int s = max(a.start1, h->start); - int e = min(a.end1, h->end); - if (s < e) { - // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(a.end1 - a.start1)) >= this->overlapFraction ) { - numOverlaps++; - qualityHits.push_back(*h); - } - } + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + int s = max(a.start1, h->start); + int e = min(a.end1, h->end); + + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end1 - a.start1)) >= this->overlapFraction ) { + numOverlaps++; + qualityHits.push_back(*h); } } + } else if (end == 2) { - for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { - - if ((this->ignoreStrand == true) || (a.strand2 == h->strand)) { - - int s = max(a.start2, h->start); - int e = min(a.end2, h->end); - if (s < e) { - // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(a.end2 - a.start2)) >= this->overlapFraction ) { - numOverlaps++; - qualityHits.push_back(*h); - } - } + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + int s = max(a.start2, h->start); + int e = min(a.end2, h->end); + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end2 - a.start2)) >= this->overlapFraction ) { + numOverlaps++; + qualityHits.push_back(*h); } } } @@ -200,6 +175,7 @@ void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHi hitsMap[h->count].push_back(*h); matchCount++; } + for (map<unsigned int, vector<BED>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { if (m->second.size() == 2) { @@ -216,3 +192,19 @@ void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHi } } } + + +void PairToPair::DetermineBedPEInput() { + + if (bedA->bedFile != "stdin") { // process a file + ifstream beds(bedA->bedFile.c_str(), ios::in); + if ( !beds ) { + cerr << "Error: The requested bed file (" << bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + IntersectPairs(beds); + } + else { // process stdin + IntersectPairs(cin); + } +} diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index 3cb91e368ed19e95fa1cda6978b1ba1c61981486..2eb676a0b2dc27eba9af58e8fb91c4ecd708a2ac 100755 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -1,3 +1,14 @@ +/***************************************************************************** + pairToPair.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ #ifndef PAIRTOPAIR_H #define PAIRTOPAIR_H @@ -22,14 +33,18 @@ public: // destructor ~PairToPair(void); - void IntersectPairs (); + void IntersectPairs(istream &bedInput); - void FindOverlaps(BEDPE &, vector<BED> &, vector<BED> &, vector<BED> &, vector<BED> &, string &); + void FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, + vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type); - void FindQualityHitsBetweenEnds(BEDPE, int, vector<BED> &, vector<BED> &, int &); + void FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, + vector<BED> &qualityHits, int &numOverlaps); - void FindHitsOnBothEnds(const BEDPE &, const vector<BED> &, const vector<BED> &, int &); + void FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, + const vector<BED> &qualityHitsEnd2, int &matchCount); + void DetermineBedPEInput(); private: diff --git a/src/pairToPair/pairToPairMain.cpp b/src/pairToPair/pairToPairMain.cpp index c405caea5553b8cad3fc6e5e31952e2fdad3bdea..400ec04c71b88d74d330b96f6fc7dee36c84897d 100755 --- a/src/pairToPair/pairToPairMain.cpp +++ b/src/pairToPair/pairToPairMain.cpp @@ -1,3 +1,14 @@ +/***************************************************************************** + pairToPairMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0+ license. +******************************************************************************/ #include "pairToPair.h" #include "version.h" @@ -102,7 +113,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { PairToPair *bi = new PairToPair(bedAFile, bedBFile, overlapFraction, searchType, ignoreStrand); - bi->IntersectPairs(); + bi->DetermineBedPEInput(); return 0; } else { @@ -112,26 +123,29 @@ int main(int argc, char* argv[]) { void ShowHelp(void) { - cerr << "===============================================" << endl; - cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; - cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; - cerr << " Hall Laboratory, University of Virginia" << endl; - cerr << "===============================================" << endl << endl; - cerr << "Description: Report overlaps between two paired-end BED files (BEDPE)." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bedpe> -b <b.bedpe>" << endl << endl; - - cerr << "OPTIONS: " << endl; - cerr << "\t" << "-f\t\t" << "Minimum overlap req'd as fraction of a.bed (e.g. 0.05)." << endl << "\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; - cerr << "\t" << "-type \t\t" << "neither\t\t\tReport overlaps if _neither_ end of BEDPE (A) overlaps BEDPE B." << endl; - cerr << "\t\t\tboth\t\t\tReport overlaps if _both_ ends of BEDPE (A) overlap BEDPE B." << endl << endl; - cerr << "\t" << "-is\t\t" << "Ignore strands when searching for overlaps. By default, like strands are enforced." << endl; - + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - cerr << "NOTES: " << endl; - cerr << "\t" << "-i stdin\t" << "Allows BEDPE file A to be read from stdin. E.g.: cat a.bedpe | pairToPair -a stdin -b B.bed" << endl << endl; - cerr << "\t***Only 10 columns BEDPE formats allowed. See below).***"<< endl << endl; + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + cerr << "Summary: Report overlaps between two paired-end BED files (BEDPE)." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <BEDPE> -b <BEDPE>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-f\t" << "Minimum overlap required as fraction of A (e.g. 0.05)." << endl; + cerr << "\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; + + cerr << "\t-type \t" << "Approach to reporting overlaps between A and B." << endl; + cerr << "\t\tneither\t\tReport overlaps if neither end of A overlaps B." << endl << endl; + + cerr << "\t\tboth\t\tReport overlaps if both ends of A overlap B." << endl; + cerr << "\t\t\t\t- Default." << endl; + + cerr << "\t-is\t" << "Ignore strands when searching for overlaps." << endl; + cerr << "\t\t- By default, strands are enforced." << endl << endl; + + cerr << "Refer to the BEDTools manual for BEDPE format." << endl << endl; + // end the program here exit(1); diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index df8823b7acda7933f3aaa89b1d8d953a94be21b5..0de1771641e3c221e53e3f49326f12536ab6f2b4 100755 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -15,7 +15,7 @@ #include "bedFilePE.h" -void BedFilePE::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const int start, const int end, vector<BED> &hits) { +/*void BedFilePE::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const int start, const int end, vector<BED> &hits) { int startBin, endBin; startBin = (start >>_binFirstShift); @@ -37,6 +37,60 @@ void BedFilePE::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const endBin >>= _binNextShift; } } +*/ + + +/* + Adapted from kent source "binKeeperFind" +*/ +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { + + int startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); + + // loop through each bin "level" in the binning hierarchy + for (int i = 0; i < 6; ++i) { + + // loop through each bin at this level of the hierarchy + int offset = binOffsetsExtended[i]; + for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + + // loop through each feature in this chrom/bin and see if it overlaps + // with the feature that was passed in. if so, add the feature to + // the list of hits. + vector<BED>::const_iterator bedItr; + vector<BED>::const_iterator bedEnd; + if (bEnd == 1) { + bedItr = bedMapEnd1[chrom][j].begin(); + bedEnd = bedMapEnd1[chrom][j].end(); + } + else if (bEnd == 2) { + bedItr = bedMapEnd2[chrom][j].begin(); + bedEnd = bedMapEnd2[chrom][j].end(); + } + else { + cerr << "Unexpected end of B requested" << endl; + } + for (; bedItr != bedEnd; ++bedItr) { + + // skip the hit if not on the same strand (and we care) + if (forceStrand && (strand != bedItr->strand)) { + continue; + } + else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { + hits.push_back(*bedItr); // it's a hit, add it. + } + + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + + + //*********************************************** diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index a48ce77d8b16e6d96f1699614f87565180b1417a..557cb264fa0131a66c3fc2db93e7e68b16992576 100755 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -59,9 +59,13 @@ public: void loadBedPEFileIntoMap(); void splitBedPEIntoBeds(const BEDPE &, unsigned int, BED &, BED &); - void binKeeperFind(map<int, vector<BED>, - std::less<int> > &, const int, - const int, vector<BED> &); + + void FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, + vector<BED> &hits, bool forceStrand); + + //void binKeeperFind(map<int, vector<BED>, + // std::less<int> > &, const int, + // const int, vector<BED> &); string bedFile; unsigned int bedType;