From b7c0e34347b4b4b1c26f48772980ed66de1af898 Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Fri, 3 Dec 2010 09:17:36 -0500
Subject: [PATCH] Fixed pairToPair to allow full "B" entries to be reported in
 every mode.

---
 src/pairToPair/pairToPair.cpp     | 100 +++++++++++++++++-------------
 src/pairToPair/pairToPair.h       |  16 ++---
 src/pairToPair/pairToPairMain.cpp |   5 +-
 src/utils/bedFile/bedFile.h       |  14 +++++
 src/utils/bedFilePE/bedFilePE.cpp |  54 ++++++++--------
 src/utils/bedFilePE/bedFilePE.h   |   8 +--
 6 files changed, 114 insertions(+), 83 deletions(-)

diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp
index ca268d20..5f6b4c83 100644
--- a/src/pairToPair/pairToPair.cpp
+++ b/src/pairToPair/pairToPair.cpp
@@ -49,7 +49,7 @@ void PairToPair::IntersectPairs() {
 	_bedB->loadBedPEFileIntoMap();
 	
 	int lineNum = 0;	
-	vector<BEDCOV> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2;
+	vector<MATE> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2;
 	// reserve some space
 	hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100);
 	
@@ -73,15 +73,18 @@ void PairToPair::IntersectPairs() {
 
 
 
-void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, 
-							  vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2) {
+void PairToPair::FindOverlaps(const BEDPE &a, 
+                              vector<MATE> &hitsA1B1, 
+                              vector<MATE> &hitsA1B2, 
+							  vector<MATE> &hitsA2B1, 
+							  vector<MATE> &hitsA2B2) {
 
 	// list of hits on each end of BEDPE
 	// that exceed the requested overlap fraction
-	vector<BEDCOV> qualityHitsA1B1;
-	vector<BEDCOV> qualityHitsA1B2;
-	vector<BEDCOV> qualityHitsA2B1;
-	vector<BEDCOV> qualityHitsA2B2;
+	vector<MATE> qualityHitsA1B1;
+	vector<MATE> qualityHitsA1B2;
+	vector<MATE> qualityHitsA2B1;
+	vector<MATE> qualityHitsA2B2;
 
 	// count of hits on each end of BEDPE
 	// that exceed the requested overlap fraction
@@ -151,14 +154,14 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<B
 }
 
 
-void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<BEDCOV> &hits, 
-                                            vector<BEDCOV> &qualityHits, int &numOverlaps) {
+void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vector<MATE> &hits, 
+                                            vector<MATE> &qualityHits, int &numOverlaps) {
 
-	vector<BEDCOV>::const_iterator h       = hits.begin();
-	vector<BEDCOV>::const_iterator hitsEnd = hits.end();
+	vector<MATE>::const_iterator h       = hits.begin();
+	vector<MATE>::const_iterator hitsEnd = hits.end();
 	for (; h != hitsEnd; ++h) {				
-		int s = max(start, h->start);
-		int e = min(end, h->end);
+		int s = max(start, h->bed.start);
+		int e = min(end, h->bed.end);
 
 		// is there enough overlap (default ~ 1bp)
 		if ( ((float)(e-s) / (float)(end - start)) >= _overlapFraction ) {
@@ -169,71 +172,82 @@ void PairToPair::FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end, const vect
 }
 
 
-void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, 
-									const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) {
+void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, 
+									const vector<MATE> &qualityHitsEnd2, int &matchCount) {
 	
-	map<unsigned int, vector<BEDCOV>, less<int> > hitsMap;
+	map<unsigned int, vector<MATE>, less<int> > hitsMap;
 	
-	for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
-		hitsMap[h->count].push_back(*h);
+	for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
+		hitsMap[h->lineNum].push_back(*h);
 		matchCount++;
 	}
-	for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
-		hitsMap[h->count].push_back(*h);
+	for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
+		hitsMap[h->lineNum].push_back(*h);
 		matchCount++;
 	}
 
-	for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
+	for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
 		if (m->second.size() == 2) {
 			
-			BEDCOV b1 = m->second[0];
-			BEDCOV b2 = m->second[1];
+			MATE b1 = m->second[0];
+			MATE b2 = m->second[1];
 			
 			if (_searchType == "both") {
 				_bedA->reportBedPETab(a);
-				printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", b1.chrom.c_str(), b1.start, b1.end,
-																   b2.chrom.c_str(), b2.start, b2.end,
-																   b1.name.c_str(), b1.score.c_str(), 
-																   b1.strand.c_str(), b2.strand.c_str());
+				printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
+																   b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end,
+																   b1.bed.name.c_str(), b1.bed.score.c_str(), 
+																   b1.bed.strand.c_str(), b2.bed.strand.c_str());
+				for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
+                    printf("\t%s", b1.bed.otherFields[i].c_str());
+                printf("\n");
 			}
 		}
 	}
 }
 
 
-void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, 
-									const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) {
+void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, 
+									const vector<MATE> &qualityHitsEnd2, int &matchCount) {
 	
-	map<unsigned int, vector<BEDCOV>, less<int> > hitsMap;
+	map<unsigned int, vector<MATE>, less<int> > hitsMap;
 	
-	for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
+	for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
 		hitsMap[h->lineNum].push_back(*h);
 		matchCount++;
 	}
-	for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
+	for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
 		hitsMap[h->lineNum].push_back(*h);
 		matchCount++;
 	}
 
-	for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
+	for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
 		if (m->second.size() >= 1) {
 			
 			if ((m->second.size()) == 2) {
-    			BEDCOV b1 = m->second[0];
-    			BEDCOV b2 = m->second[1];
-			
+    			MATE b1 = m->second[0];
+    			MATE b2 = m->second[1];
+			    
     			_bedA->reportBedPETab(a);
-    			printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", b1.chrom.c_str(), b1.start, b1.end,
-    															   b2.chrom.c_str(), b2.start, b2.end,
-    															   b1.name.c_str(), b1.score.c_str(), 
-    															   b1.strand.c_str(), b2.strand.c_str());
+    			printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
+    															   b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end,
+    															   b1.bed.name.c_str(), b1.bed.score.c_str(), 
+                                                                   b1.bed.strand.c_str(), b2.bed.strand.c_str());
+                for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
+    				printf("\t%s", b1.bed.otherFields[i].c_str());
+                printf("\n");
 			}
 			else {
-			    BEDCOV b1 = m->second[0];
+			    MATE b1 = m->second[0];
 			    
     			_bedA->reportBedPETab(a);
-    			printf("%s\t%d\t%d\t.\t-1\t-1\t%s\t%s\t%s\t.\n", b1.chrom.c_str(), b1.start, b1.end,
-    															 b1.name.c_str(), b1.score.c_str(), b1.strand.c_str());			
+    			printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
+    															   b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end,
+    															   b1.bed.name.c_str(), b1.bed.score.c_str(), 
+                                                                   b1.bed.strand.c_str(), b1.mate->bed.strand.c_str());
+                for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
+    				printf("\t%s", b1.bed.otherFields[i].c_str());
+                printf("\n");		
             }
 		}
 	}
diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h
index 08f9de4a..e30e4a28 100644
--- a/src/pairToPair/pairToPair.h
+++ b/src/pairToPair/pairToPair.h
@@ -20,6 +20,8 @@
 
 using namespace std;
 
+
+
 //************************************************
 // Class methods and elements
 //************************************************
@@ -55,17 +57,17 @@ private:
 	BedFilePE *_bedB;
 	
 	// methods
-	void FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, 
-		vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2);
+	void FindOverlaps(const BEDPE &a, vector<MATE> &hitsA1B1, vector<MATE> &hitsA1B2, 
+		vector<MATE> &hitsA2B1, vector<MATE> &hitsA2B2);
 
     void FindQualityHitsBetweenEnds(CHRPOS start, CHRPOS end,
-        const vector<BEDCOV> &hits, vector<BEDCOV> &qualityHits, int &numOverlaps);
+        const vector<MATE> &hits, vector<MATE> &qualityHits, int &numOverlaps);
 	
-	void FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, 
-		const vector<BEDCOV> &qualityHitsEnd2, int &matchCount);
+	void FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, 
+		const vector<MATE> &qualityHitsEnd2, int &matchCount);
 		
-	void FindHitsOnEitherEnd(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, 
-        const vector<BEDCOV> &qualityHitsEnd2, int &matchCount);
+	void FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1, 
+        const vector<MATE> &qualityHitsEnd2, int &matchCount);
         
 };
 
diff --git a/src/pairToPair/pairToPairMain.cpp b/src/pairToPair/pairToPairMain.cpp
index bcaef09c..1cf03f22 100644
--- a/src/pairToPair/pairToPairMain.cpp
+++ b/src/pairToPair/pairToPairMain.cpp
@@ -161,8 +161,9 @@ void ShowHelp(void) {
 	cerr 								<< "\t\tboth\tReport overlaps if both ends of A overlap B." << endl;
 	cerr								<< "\t\t\t- Default = both." << endl << endl;
 
-	cerr << "\t-slop \t"				<< "The amount of slop (in b.p.). to be added to each footprint." << endl << endl;
-
+    cerr << "\t-slop \t"				<< "The amount of slop (in b.p.). to be added to each footprint." << endl;
+    cerr                                << "\t\t*Note*: Slop is subtracted from start1 and start2 and added to end1 and end2." << endl << endl;
+    
 	cerr << "\t-ss\t"	    			<< "Add slop based to each BEDPE footprint based on strand." << endl;
 	cerr 								<< "\t\t- If strand is \"+\", slop is only added to the end coordinates." << endl;
 	cerr 								<< "\t\t- If strand is \"-\", slop is only added to the start coordinates." << endl;
diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h
index 8916e105..2d5d8886 100644
--- a/src/utils/bedFile/bedFile.h
+++ b/src/utils/bedFile/bedFile.h
@@ -154,6 +154,17 @@ public:
 }; // BED
 
 
+/*
+	Structure for each end of a paired BED record
+	mate points to the other end.
+*/
+struct MATE {
+    BED bed;
+    int lineNum;
+    MATE *mate;
+};
+
+
 /*
 	Structure for regular BED COVERAGE records
 */
@@ -225,14 +236,17 @@ enum FileType
 //*************************************************
 typedef vector<BED>    bedVector;
 typedef vector<BEDCOV> bedCovVector;
+typedef vector<MATE> mateVector;
 typedef vector<BEDCOVLIST> bedCovListVector;
 
 typedef map<BIN, bedVector,    std::less<BIN> > binsToBeds;
 typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs;
+typedef map<BIN, mateVector, std::less<BIN> > binsToMates;
 typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists;
 
 typedef map<string, binsToBeds, std::less<string> >    masterBedMap;
 typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap;
+typedef map<string, binsToMates, std::less<string> > masterMateMap;
 typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap;
 typedef map<string, bedVector, std::less<string> >     masterBedMapNoBin;
 
diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp
index 11d64257..98d3a200 100644
--- a/src/utils/bedFilePE/bedFilePE.cpp
+++ b/src/utils/bedFilePE/bedFilePE.cpp
@@ -421,7 +421,7 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co
 /*
 	Adapted from kent source "binKeeperFind"
 */
-void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<BEDCOV> &hits, bool forceStrand) {
+void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<MATE> &hits, bool forceStrand) {
 
 	int startBin, endBin;
 	startBin = (start >> _binFirstShift);
@@ -437,8 +437,8 @@ void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS
 			// loop through each feature in this chrom/bin and see if it overlaps
 			// with the feature that was passed in.  if so, add the feature to 
 			// the list of hits.
-			vector<BEDCOV>::const_iterator bedItr;
-			vector<BEDCOV>::const_iterator bedEnd;
+			vector<MATE>::const_iterator bedItr;
+			vector<MATE>::const_iterator bedEnd;
 			if (bEnd == 1) {
 				bedItr = bedMapEnd1[chrom][j].begin();
 				bedEnd = bedMapEnd1[chrom][j].end();
@@ -453,10 +453,10 @@ void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS
 			for (; bedItr != bedEnd; ++bedItr) {
 				
 				// skip the hit if not on the same strand (and we care)
-				if (forceStrand && (strand != bedItr->strand)) {
+				if (forceStrand && (strand != bedItr->bed.strand)) {
 					continue;
 				}
-				else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) {
+				else if (overlaps(bedItr->bed.start, bedItr->bed.end, start, end) > 0) {
 					hits.push_back(*bedItr);	// it's a hit, add it.
 				}
 							
@@ -480,18 +480,19 @@ void BedFilePE::loadBedPEFileIntoMap() {
 	while (bedStatus != BED_INVALID) {
 		
 		if (bedStatus == BED_VALID) {
-			BEDCOV bedEntry1, bedEntry2;
+            MATE *bedEntry1 = new MATE();
+            MATE *bedEntry2 = new MATE();
 			// separate the BEDPE entry into separate
 			// BED entries
 			splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2);
 
 			// load end1 into a UCSC bin map
-			bin1 = getBin(bedEntry1.start, bedEntry1.end);
-			this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1);	
+			bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end);
+			this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1);	
 		
 			// load end2 into a UCSC bin map
-			bin2 = getBin(bedEntry2.start, bedEntry2.end);
-			this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2);
+			bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end);
+			this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2);
 
 			bedpeEntry = nullBedPE;
 		}
@@ -501,7 +502,7 @@ void BedFilePE::loadBedPEFileIntoMap() {
 }
 
 
-void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2) {
+void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) {
 	
 	/* 
 	   Split the BEDPE entry into separate BED entries
@@ -515,23 +516,22 @@ void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum,
 	   read-pair.
 	*/
 	
-	bedEntry1.chrom           = bedpeEntry.chrom1;
-	bedEntry1.start           = bedpeEntry.start1;
-	bedEntry1.end             = bedpeEntry.end1;
-	bedEntry1.name            = bedpeEntry.name;
-	bedEntry1.score           = bedpeEntry.score;
-	bedEntry1.strand          = bedpeEntry.strand1;
-	bedEntry1.count           = lineNum;
-	bedEntry1.minOverlapStart = INT_MAX;
+	bedEntry1->bed.chrom           = bedpeEntry.chrom1;
+	bedEntry1->bed.start           = bedpeEntry.start1;
+	bedEntry1->bed.end             = bedpeEntry.end1;
+	bedEntry1->bed.name            = bedpeEntry.name;         // only store the name in end1 to save memory
+	bedEntry1->bed.score           = bedpeEntry.score;        // only store the score in end1 to save memory
+	bedEntry1->bed.strand          = bedpeEntry.strand1;
+	bedEntry1->bed.otherFields     = bedpeEntry.otherFields;  // only store the otherFields in end1 to save memory    
+	bedEntry1->lineNum             = lineNum;
+    bedEntry1->mate                = bedEntry2;               // keep a pointer to end2
 	
-	bedEntry2.chrom           = bedpeEntry.chrom2;
-	bedEntry2.start           = bedpeEntry.start2;
-	bedEntry2.end             = bedpeEntry.end2;
-	bedEntry2.name            = bedpeEntry.name;
-	bedEntry2.score           = bedpeEntry.score;
-	bedEntry2.strand          = bedpeEntry.strand2;		
-	bedEntry2.count           = lineNum;
-	bedEntry2.minOverlapStart = INT_MAX;
+	bedEntry2->bed.chrom           = bedpeEntry.chrom2;
+	bedEntry2->bed.start           = bedpeEntry.start2;
+	bedEntry2->bed.end             = bedpeEntry.end2;
+	bedEntry2->bed.strand          = bedpeEntry.strand2;
+	bedEntry2->lineNum             = lineNum;
+    bedEntry2->mate                = bedEntry1;               // keep a pointer to end1
 }
 
 
diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h
index 0e6be51b..e6e43ed0 100644
--- a/src/utils/bedFilePE/bedFilePE.h
+++ b/src/utils/bedFilePE/bedFilePE.h
@@ -69,18 +69,18 @@ public:
 	void reportBedPETab(const BEDPE &a);
 	void reportBedPENewLine(const BEDPE &a);
 	void loadBedPEFileIntoMap();
-	void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2);
+	void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2);
 		 
 		
 	void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, 
-		vector<BEDCOV> &hits, bool forceStrand);
+		vector<MATE> &hits, bool forceStrand);
 		
 		
 	string bedFile;
 	unsigned int bedType;
 	
-	masterBedCovMap bedMapEnd1;
-	masterBedCovMap bedMapEnd2;
+	masterMateMap bedMapEnd1;
+	masterMateMap bedMapEnd2;
 	
 private:
 	istream *_bedStream;
-- 
GitLab