From 89cf3bcd68a6675903619dbe9b0de18bcc6b8ea7 Mon Sep 17 00:00:00 2001
From: arq5x <arq5x@virginia.edu>
Date: Thu, 18 Sep 2014 10:53:24 -0400
Subject: [PATCH] update fisher to use RecordKeyVector

---
 RELEASE_HISTORY                               | 86 ++++++++++++++++++-
 src/fisher/Fisher.cpp                         | 24 +++---
 src/fisher/Fisher.h                           |  2 +-
 src/intersectFile/intersectFile.h             |  2 +-
 .../FileRecordTools/Records/RecordKeyVector.h |  5 +-
 5 files changed, 100 insertions(+), 19 deletions(-)

diff --git a/RELEASE_HISTORY b/RELEASE_HISTORY
index 69154704..0e51ccc2 100644
--- a/RELEASE_HISTORY
+++ b/RELEASE_HISTORY
@@ -1,4 +1,88 @@
-Version 2.18.2 (16-Dec-2013)
+Version 2.19.1 (6-Mar-2014)
+Bug fix to intersect causing BAM footers to be erroneously written when -b is BAM
+
+Speedup for the map tool.
+http://bedtools.readthedocs.org/en/latest/_images/map-speed-comparo.png
+
+Map tool now allows multiple columns and operations in a single run.
+http://bedtools.readthedocs.org/en/latest/content/tools/map.html#multiple-operations-and-columns-at-the-same-time
+
+
+Version 2.19.0 (8-Feb-2014)
+Bug Fixes
+=========
+
+1. Fixed a long standing bug in which the number of base pairs of overlap was incorrectly calculated when using the -wo option with the -split option. Thanks to many for reporting this.
+
+2. Fixed a bug in which certain flavors of unmapped BAM alignments were incorrectly rejected in the latest 2.18.* series.  Thanks very much to 
+Gabriel Pratt.
+
+
+Enhancements
+============
+
+1. Substantially reduced memory usage, especially when dealing with unsorted data. Memory usage ballooned in the 2.18.* series owing to default buffer sizes we were using in a custom string class.  We have adjusted this and the memory usage has returned to 2.17.* levels while maintaining speed increases.  Thanks so much to Ian Sudberry rightfully complaining about this!
+
+
+New features
+============
+
+1. The latest version of the "map" function is ~3X faster than the one available in version 2.17 and 2.18
+
+# bedtools 2.17
+$ time bedtools map \
+                -a hg19.gerp.elements.bed.gz \
+                -b hg19.rmsk.bed.gz \
+                -c 4 \
+                -o collapse > /dev/null
+real	0m15.865s
+user	0m15.815s
+sys	0m0.040s
+
+
+# bedtools 2.19
+$ time bedtools map \
+                -a hg19.gerp.elements.bed.gz \
+                -b hg19.rmsk.bed.gz \
+                -c 4 \
+                -o collapse > /dev/null
+real	0m5.367s
+user	0m5.314s
+sys	0m0.050s
+
+2. The map function now supports the "-split" option, as well as "absmin" and "absmax" operations.
+
+3. In addition, it supports multiple chromosome sorting criterion by supplying a genome file that defines the expected chromosome order. Here is an example of how to run map with datasets having chromosomes sorted in "version" order, as opposed to the lexicographical chrom order that is the norm. 
+
+# version sort the BED files (e.g. chr1, chr2, etc., not chr1, chr10, chr11, etc.)
+$ zcat hg19.gerp.elements.bed.gz | sort -k1,1V -k2,2n > hg19.gerp.versionsorted.bed
+$ zcat hg19.rmsk.bed.gz | sort -k1,1V -k2,2n > hg19.rmsk.versionsorted.bed
+
+# make a toy genome file
+$ cut -f 1 hg19.rmsk.versionsorted.bed | uniq | awk '{print $1"\t"1}' > hg19.versionsorted.genome
+
+$ head hg19.versionsorted.genome
+chr1	1
+chr1_gl000191_random	1
+chr1_gl000192_random	1
+chr2	1
+chr3	1
+chr4	1
+chr4_ctg9_hap1	1
+chr4_gl000193_random	1
+chr4_gl000194_random	1
+chr5	1
+
+# tell map to expect a different chrom order.
+$ bedtools map \
+         -a hg19.gerp.versionsorted.bed \
+         -b hg19.rmsk.versionsorted.bed \
+         -c 4 \
+         -o collapse \
+         -g hg19.versionsorted.genome
+
+
+Version 2.18.2 (8-Jan-2014)
 
 bedtools. The changes to bedtools reflect fixes to compilation errors, performance enhancements for smaller files, and a bug fix for BAM files that lack a formal header. Our current focus for the 2.19.* release is is on addressing some standing bug/enhancements and also in updating some of the other more widely used tools (e.g., coverage, map, and substract) to use the new API. We will also continue to look into ways to improve performance while hopefully reducing memory usage for algorithms that work with unsorted data (thanks to Ian Sudberry for the ping!).
 
diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp
index cb0cbcac..38eb23b4 100644
--- a/src/fisher/Fisher.cpp
+++ b/src/fisher/Fisher.cpp
@@ -1,4 +1,3 @@
-
 #include "Fisher.h"
 #include "BlockMgr.h"
 #include "NewChromsweep.h"
@@ -69,15 +68,15 @@ bool Fisher::getFisher() {
 	if (!sweep.init()) {
 		return false;
 	}
-	RecordKeyList hitSet;
+	RecordKeyVector hitSet;
 	while (sweep.next(hitSet)) {
 		if (_context->getObeySplits()) {
-			RecordKeyList keySet(hitSet.getKey());
-			RecordKeyList resultSet(hitSet.getKey());
+			RecordKeyVector keySet(hitSet.getKey());
+			RecordKeyVector resultSet(hitSet.getKey());
 			_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
-			_intersectionVal += getTotalIntersection(&resultSet);
+			_intersectionVal += getTotalIntersection(resultSet);
 		} else {
-			_intersectionVal += getTotalIntersection(&hitSet);
+			_intersectionVal += getTotalIntersection(hitSet);
 		}
 	}
 
@@ -89,18 +88,17 @@ bool Fisher::getFisher() {
 	return true;
 }
 
-unsigned long Fisher::getTotalIntersection(RecordKeyList *recList)
+unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
 {
 	unsigned long intersection = 0;
-	const Record *key = recList->getKey();
+	const Record *key = recList.getKey();
 	int keyStart = key->getStartPos();
 	int keyEnd = key->getEndPos();
 
 	int hitIdx = 0;
-	for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) {
-		const Record *currRec = iter->value();
-		int maxStart = max(currRec->getStartPos(), keyStart);
-		int minEnd = min(currRec->getEndPos(), keyEnd);
+	for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) {
+		int maxStart = max((*iter)->getStartPos(), keyStart);
+		int minEnd = min((*iter)->getEndPos(), keyEnd);
 		if (_context->getObeySplits()) {
 			intersection += _blockMgr->getOverlapBases(hitIdx);
 			hitIdx++;
@@ -108,7 +106,7 @@ unsigned long Fisher::getTotalIntersection(RecordKeyList *recList)
 			intersection += (unsigned long)(minEnd - maxStart);
 		}
 	}
-	_numIntersections += (int)recList->size();
+	_numIntersections += (int)recList.size();
 	return intersection;
 }
 
diff --git a/src/fisher/Fisher.h b/src/fisher/Fisher.h
index c875273a..01f2157d 100644
--- a/src/fisher/Fisher.h
+++ b/src/fisher/Fisher.h
@@ -24,7 +24,7 @@ private:
     unsigned long _dbLen;
     bool getFisher();
 
-    unsigned long getTotalIntersection(RecordKeyList *hits);
+    unsigned long getTotalIntersection(RecordKeyVector &hits);
 };
 
 #endif /* FISHER_H */
diff --git a/src/intersectFile/intersectFile.h b/src/intersectFile/intersectFile.h
index 585de044..920ba1ab 100644
--- a/src/intersectFile/intersectFile.h
+++ b/src/intersectFile/intersectFile.h
@@ -12,7 +12,7 @@
 #ifndef INTERSECTFILE_H
 #define INTERSECTFILE_H
 
-#include "RecordKeyList.h"
+#include "RecordKeyVector.h"
 
 using namespace std;
 
diff --git a/src/utils/FileRecordTools/Records/RecordKeyVector.h b/src/utils/FileRecordTools/Records/RecordKeyVector.h
index a7389d52..2993e75d 100644
--- a/src/utils/FileRecordTools/Records/RecordKeyVector.h
+++ b/src/utils/FileRecordTools/Records/RecordKeyVector.h
@@ -9,12 +9,11 @@
 #define KEYVECTOR_H_
 
 
-
+#include "Record.h"
+#include <vector>
 
 using namespace std;
 
-#include "Record.h"
-#include <vector>
 
 class RecordKeyVector {
 public:
-- 
GitLab