From b7c0e34347b4b4b1c26f48772980ed66de1af898 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 3 Dec 2010 09:17:36 -0500 Subject: [PATCH] Fixed pairToPair to allow full "B" entries to be reported in every mode. --- src/pairToPair/pairToPair.cpp | 100 +++++++++++++++++------------- src/pairToPair/pairToPair.h | 16 ++--- src/pairToPair/pairToPairMain.cpp | 5 +- src/utils/bedFile/bedFile.h | 14 +++++ src/utils/bedFilePE/bedFilePE.cpp | 54 ++++++++-------- src/utils/bedFilePE/bedFilePE.h | 8 +-- 6 files changed, 114 insertions(+), 83 deletions(-) diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index ca268d20..5f6b4c83 100644 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -49,7 +49,7 @@ void PairToPair::IntersectPairs() { _bedB->loadBedPEFileIntoMap(); int lineNum = 0; - vector<BEDCOV> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; + vector<MATE> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; // reserve some space hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); @@ -73,15 +73,18 @@ void PairToPair::IntersectPairs() { -void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, - vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2) { +void PairToPair::FindOverlaps(const BEDPE &a, + vector<MATE> &hitsA1B1, + vector<MATE> &hitsA1B2, + vector<MATE> &hitsA2B1, + vector<MATE> &hitsA2B2) { // list of hits on each end of BEDPE // that exceed the requested overlap fraction - vector<BEDCOV> qualityHitsA1B1; - vector<BEDCOV> qualityHitsA1B2; - vector<BEDCOV> qualityHitsA2B1; - vector<BEDCOV> qualityHitsA2B2; + vector<MATE> qualityHitsA1B1; + vector<MATE> qualityHitsA1B2; + vector<MATE> qualityHitsA2B1; + vector<MATE> qualityHitsA2B2; // count of hits on each end of BEDPE // that exceed the requested overlap fraction @@ -151,14 +154,14 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<B } -void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<BEDCOV> &hits, - vector<BEDCOV> &qualityHits, int &numOverlaps) { +void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<MATE> &hits, + vector<MATE> &qualityHits, int &numOverlaps) { - vector<BEDCOV>::const_iterator h = hits.begin(); - vector<BEDCOV>::const_iterator hitsEnd = hits.end(); + vector<MATE>::const_iterator h = hits.begin(); + vector<MATE>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { - int s = max(start, h->start); - int e = min(end, h->end); + int s = max(start, h->bed.start); + int e = min(end, h->bed.end); // is there enough overlap (default ~ 1bp) if ( ((float)(e-s) / (float)(end - start)) >= _overlapFraction ) { @@ -169,71 +172,82 @@ void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vect } -void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, - const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) { +void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2, int &matchCount) { - map<unsigned int, vector<BEDCOV>, less<int> > hitsMap; + map<unsigned int, vector<MATE>, less<int> > hitsMap; - for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { - hitsMap[h->count].push_back(*h); + for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); matchCount++; } - for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { - hitsMap[h->count].push_back(*h); + for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + hitsMap[h->lineNum].push_back(*h); matchCount++; } - for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { if (m->second.size() == 2) { - BEDCOV b1 = m->second[0]; - BEDCOV b2 = m->second[1]; + MATE b1 = m->second[0]; + MATE b2 = m->second[1]; if (_searchType == "both") { _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", b1.chrom.c_str(), b1.start, b1.end, - b2.chrom.c_str(), b2.start, b2.end, - b1.name.c_str(), b1.score.c_str(), - b1.strand.c_str(), b2.strand.c_str()); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), + b1.bed.strand.c_str(), b2.bed.strand.c_str()); + for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) + printf("\t%s", b1.bed.otherFields[i].c_str()); + printf("\n"); } } } } -void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, - const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) { +void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2, int &matchCount) { - map<unsigned int, vector<BEDCOV>, less<int> > hitsMap; + map<unsigned int, vector<MATE>, less<int> > hitsMap; - for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { hitsMap[h->lineNum].push_back(*h); matchCount++; } - for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { hitsMap[h->lineNum].push_back(*h); matchCount++; } - for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { if (m->second.size() >= 1) { if ((m->second.size()) == 2) { - BEDCOV b1 = m->second[0]; - BEDCOV b2 = m->second[1]; - + MATE b1 = m->second[0]; + MATE b2 = m->second[1]; + _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", b1.chrom.c_str(), b1.start, b1.end, - b2.chrom.c_str(), b2.start, b2.end, - b1.name.c_str(), b1.score.c_str(), - b1.strand.c_str(), b2.strand.c_str()); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), + b1.bed.strand.c_str(), b2.bed.strand.c_str()); + for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) + printf("\t%s", b1.bed.otherFields[i].c_str()); + printf("\n"); } else { - BEDCOV b1 = m->second[0]; + MATE b1 = m->second[0]; _bedA->reportBedPETab(a); - printf("%s\t%d\t%d\t.\t-1\t-1\t%s\t%s\t%s\t.\n", b1.chrom.c_str(), b1.start, b1.end, - b1.name.c_str(), b1.score.c_str(), b1.strand.c_str()); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end, + b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end, + b1.bed.name.c_str(), b1.bed.score.c_str(), + b1.bed.strand.c_str(), b1.mate->bed.strand.c_str()); + for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) + printf("\t%s", b1.bed.otherFields[i].c_str()); + printf("\n"); } } } diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index 08f9de4a..e30e4a28 100644 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -20,6 +20,8 @@ using namespace std; + + //************************************************ // Class methods and elements //************************************************ @@ -55,17 +57,17 @@ private: BedFilePE *_bedB; // methods - void FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, - vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2); + void FindOverlaps(const BEDPE &a, vector<MATE> &hitsA1B1, vector<MATE> &hitsA1B2, + vector<MATE> &hitsA2B1, vector<MATE> &hitsA2B2); void FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, - const vector<BEDCOV> &hits, vector<BEDCOV> &qualityHits, int &numOverlaps); + const vector<MATE> &hits, vector<MATE> &qualityHits, int &numOverlaps); - void FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, - const vector<BEDCOV> &qualityHitsEnd2, int &matchCount); + void FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2, int &matchCount); - void FindHitsOnEitherEnd(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, - const vector<BEDCOV> &qualityHitsEnd2, int &matchCount); + void FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, + const vector<MATE> &qualityHitsEnd2, int &matchCount); }; diff --git a/src/pairToPair/pairToPairMain.cpp b/src/pairToPair/pairToPairMain.cpp index bcaef09c..1cf03f22 100644 --- a/src/pairToPair/pairToPairMain.cpp +++ b/src/pairToPair/pairToPairMain.cpp @@ -161,8 +161,9 @@ void ShowHelp(void) { cerr << "\t\tboth\tReport overlaps if both ends of A overlap B." << endl; cerr << "\t\t\t- Default = both." << endl << endl; - cerr << "\t-slop \t" << "The amount of slop (in b.p.). to be added to each footprint." << endl << endl; - + cerr << "\t-slop \t" << "The amount of slop (in b.p.). to be added to each footprint." << endl; + cerr << "\t\t*Note*: Slop is subtracted from start1 and start2 and added to end1 and end2." << endl << endl; + cerr << "\t-ss\t" << "Add slop based to each BEDPE footprint based on strand." << endl; cerr << "\t\t- If strand is \"+\", slop is only added to the end coordinates." << endl; cerr << "\t\t- If strand is \"-\", slop is only added to the start coordinates." << endl; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 8916e105..2d5d8886 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -154,6 +154,17 @@ public: }; // BED +/* + Structure for each end of a paired BED record + mate points to the other end. +*/ +struct MATE { + BED bed; + int lineNum; + MATE *mate; +}; + + /* Structure for regular BED COVERAGE records */ @@ -225,14 +236,17 @@ enum FileType //************************************************* typedef vector<BED> bedVector; typedef vector<BEDCOV> bedCovVector; +typedef vector<MATE> mateVector; typedef vector<BEDCOVLIST> bedCovListVector; typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; +typedef map<BIN, mateVector, std::less<BIN> > binsToMates; typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; typedef map<string, binsToBeds, std::less<string> > masterBedMap; typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; +typedef map<string, binsToMates, std::less<string> > masterMateMap; typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 11d64257..98d3a200 100644 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -421,7 +421,7 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co /* Adapted from kent source "binKeeperFind" */ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<BEDCOV> &hits, bool forceStrand) { +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<MATE> &hits, bool forceStrand) { int startBin, endBin; startBin = (start >> _binFirstShift); @@ -437,8 +437,8 @@ void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to // the list of hits. - vector<BEDCOV>::const_iterator bedItr; - vector<BEDCOV>::const_iterator bedEnd; + vector<MATE>::const_iterator bedItr; + vector<MATE>::const_iterator bedEnd; if (bEnd == 1) { bedItr = bedMapEnd1[chrom][j].begin(); bedEnd = bedMapEnd1[chrom][j].end(); @@ -453,10 +453,10 @@ void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS for (; bedItr != bedEnd; ++bedItr) { // skip the hit if not on the same strand (and we care) - if (forceStrand && (strand != bedItr->strand)) { + if (forceStrand && (strand != bedItr->bed.strand)) { continue; } - else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) { + else if (overlaps(bedItr->bed.start, bedItr->bed.end, start, end) > 0) { hits.push_back(*bedItr); // it's a hit, add it. } @@ -480,18 +480,19 @@ void BedFilePE::loadBedPEFileIntoMap() { while (bedStatus != BED_INVALID) { if (bedStatus == BED_VALID) { - BEDCOV bedEntry1, bedEntry2; + MATE *bedEntry1 = new MATE(); + MATE *bedEntry2 = new MATE(); // separate the BEDPE entry into separate // BED entries splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); // load end1 into a UCSC bin map - bin1 = getBin(bedEntry1.start, bedEntry1.end); - this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1); + bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end); + this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1); // load end2 into a UCSC bin map - bin2 = getBin(bedEntry2.start, bedEntry2.end); - this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2); + bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end); + this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2); bedpeEntry = nullBedPE; } @@ -501,7 +502,7 @@ void BedFilePE::loadBedPEFileIntoMap() { } -void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2) { +void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) { /* Split the BEDPE entry into separate BED entries @@ -515,23 +516,22 @@ void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, read-pair. */ - bedEntry1.chrom = bedpeEntry.chrom1; - bedEntry1.start = bedpeEntry.start1; - bedEntry1.end = bedpeEntry.end1; - bedEntry1.name = bedpeEntry.name; - bedEntry1.score = bedpeEntry.score; - bedEntry1.strand = bedpeEntry.strand1; - bedEntry1.count = lineNum; - bedEntry1.minOverlapStart = INT_MAX; + bedEntry1->bed.chrom = bedpeEntry.chrom1; + bedEntry1->bed.start = bedpeEntry.start1; + bedEntry1->bed.end = bedpeEntry.end1; + bedEntry1->bed.name = bedpeEntry.name; // only store the name in end1 to save memory + bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory + bedEntry1->bed.strand = bedpeEntry.strand1; + bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory + bedEntry1->lineNum = lineNum; + bedEntry1->mate = bedEntry2; // keep a pointer to end2 - bedEntry2.chrom = bedpeEntry.chrom2; - bedEntry2.start = bedpeEntry.start2; - bedEntry2.end = bedpeEntry.end2; - bedEntry2.name = bedpeEntry.name; - bedEntry2.score = bedpeEntry.score; - bedEntry2.strand = bedpeEntry.strand2; - bedEntry2.count = lineNum; - bedEntry2.minOverlapStart = INT_MAX; + bedEntry2->bed.chrom = bedpeEntry.chrom2; + bedEntry2->bed.start = bedpeEntry.start2; + bedEntry2->bed.end = bedpeEntry.end2; + bedEntry2->bed.strand = bedpeEntry.strand2; + bedEntry2->lineNum = lineNum; + bedEntry2->mate = bedEntry1; // keep a pointer to end1 } diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index 0e6be51b..e6e43ed0 100644 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -69,18 +69,18 @@ public: void reportBedPETab(const BEDPE &a); void reportBedPENewLine(const BEDPE &a); void loadBedPEFileIntoMap(); - void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2); + void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2); void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, - vector<BEDCOV> &hits, bool forceStrand); + vector<MATE> &hits, bool forceStrand); string bedFile; unsigned int bedType; - masterBedCovMap bedMapEnd1; - masterBedCovMap bedMapEnd2; + masterMateMap bedMapEnd1; + masterMateMap bedMapEnd2; private: istream *_bedStream; -- GitLab