diff --git a/RELEASE_HISTORY b/RELEASE_HISTORY index 69154704bd26fe4b227104637e9861b6bb2085e0..0e51ccc2f989f7acc230a473c0a2dea0f7018a27 100644 --- a/RELEASE_HISTORY +++ b/RELEASE_HISTORY @@ -1,4 +1,88 @@ -Version 2.18.2 (16-Dec-2013) +Version 2.19.1 (6-Mar-2014) +Bug fix to intersect causing BAM footers to be erroneously written when -b is BAM + +Speedup for the map tool. +http://bedtools.readthedocs.org/en/latest/_images/map-speed-comparo.png + +Map tool now allows multiple columns and operations in a single run. +http://bedtools.readthedocs.org/en/latest/content/tools/map.html#multiple-operations-and-columns-at-the-same-time + + +Version 2.19.0 (8-Feb-2014) +Bug Fixes +========= + +1. Fixed a long standing bug in which the number of base pairs of overlap was incorrectly calculated when using the -wo option with the -split option. Thanks to many for reporting this. + +2. Fixed a bug in which certain flavors of unmapped BAM alignments were incorrectly rejected in the latest 2.18.* series. Thanks very much to +Gabriel Pratt. + + +Enhancements +============ + +1. Substantially reduced memory usage, especially when dealing with unsorted data. Memory usage ballooned in the 2.18.* series owing to default buffer sizes we were using in a custom string class. We have adjusted this and the memory usage has returned to 2.17.* levels while maintaining speed increases. Thanks so much to Ian Sudberry rightfully complaining about this! + + +New features +============ + +1. The latest version of the "map" function is ~3X faster than the one available in version 2.17 and 2.18 + +# bedtools 2.17 +$ time bedtools map \ + -a hg19.gerp.elements.bed.gz \ + -b hg19.rmsk.bed.gz \ + -c 4 \ + -o collapse > /dev/null +real 0m15.865s +user 0m15.815s +sys 0m0.040s + + +# bedtools 2.19 +$ time bedtools map \ + -a hg19.gerp.elements.bed.gz \ + -b hg19.rmsk.bed.gz \ + -c 4 \ + -o collapse > /dev/null +real 0m5.367s +user 0m5.314s +sys 0m0.050s + +2. The map function now supports the "-split" option, as well as "absmin" and "absmax" operations. + +3. In addition, it supports multiple chromosome sorting criterion by supplying a genome file that defines the expected chromosome order. Here is an example of how to run map with datasets having chromosomes sorted in "version" order, as opposed to the lexicographical chrom order that is the norm. + +# version sort the BED files (e.g. chr1, chr2, etc., not chr1, chr10, chr11, etc.) +$ zcat hg19.gerp.elements.bed.gz | sort -k1,1V -k2,2n > hg19.gerp.versionsorted.bed +$ zcat hg19.rmsk.bed.gz | sort -k1,1V -k2,2n > hg19.rmsk.versionsorted.bed + +# make a toy genome file +$ cut -f 1 hg19.rmsk.versionsorted.bed | uniq | awk '{print $1"\t"1}' > hg19.versionsorted.genome + +$ head hg19.versionsorted.genome +chr1 1 +chr1_gl000191_random 1 +chr1_gl000192_random 1 +chr2 1 +chr3 1 +chr4 1 +chr4_ctg9_hap1 1 +chr4_gl000193_random 1 +chr4_gl000194_random 1 +chr5 1 + +# tell map to expect a different chrom order. +$ bedtools map \ + -a hg19.gerp.versionsorted.bed \ + -b hg19.rmsk.versionsorted.bed \ + -c 4 \ + -o collapse \ + -g hg19.versionsorted.genome + + +Version 2.18.2 (8-Jan-2014) bedtools. The changes to bedtools reflect fixes to compilation errors, performance enhancements for smaller files, and a bug fix for BAM files that lack a formal header. Our current focus for the 2.19.* release is is on addressing some standing bug/enhancements and also in updating some of the other more widely used tools (e.g., coverage, map, and substract) to use the new API. We will also continue to look into ways to improve performance while hopefully reducing memory usage for algorithms that work with unsorted data (thanks to Ian Sudberry for the ping!). diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp index cb0cbcac67f0d3026bb2a7d426ece93716b9108b..38eb23b46fd4bde8c3f663dfc5bfff7df133cf92 100644 --- a/src/fisher/Fisher.cpp +++ b/src/fisher/Fisher.cpp @@ -1,4 +1,3 @@ - #include "Fisher.h" #include "BlockMgr.h" #include "NewChromsweep.h" @@ -69,15 +68,15 @@ bool Fisher::getFisher() { if (!sweep.init()) { return false; } - RecordKeyList hitSet; + RecordKeyVector hitSet; while (sweep.next(hitSet)) { if (_context->getObeySplits()) { - RecordKeyList keySet(hitSet.getKey()); - RecordKeyList resultSet(hitSet.getKey()); + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); - _intersectionVal += getTotalIntersection(&resultSet); + _intersectionVal += getTotalIntersection(resultSet); } else { - _intersectionVal += getTotalIntersection(&hitSet); + _intersectionVal += getTotalIntersection(hitSet); } } @@ -89,18 +88,17 @@ bool Fisher::getFisher() { return true; } -unsigned long Fisher::getTotalIntersection(RecordKeyList *recList) +unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList) { unsigned long intersection = 0; - const Record *key = recList->getKey(); + const Record *key = recList.getKey(); int keyStart = key->getStartPos(); int keyEnd = key->getEndPos(); int hitIdx = 0; - for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) { - const Record *currRec = iter->value(); - int maxStart = max(currRec->getStartPos(), keyStart); - int minEnd = min(currRec->getEndPos(), keyEnd); + for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + int maxStart = max((*iter)->getStartPos(), keyStart); + int minEnd = min((*iter)->getEndPos(), keyEnd); if (_context->getObeySplits()) { intersection += _blockMgr->getOverlapBases(hitIdx); hitIdx++; @@ -108,7 +106,7 @@ unsigned long Fisher::getTotalIntersection(RecordKeyList *recList) intersection += (unsigned long)(minEnd - maxStart); } } - _numIntersections += (int)recList->size(); + _numIntersections += (int)recList.size(); return intersection; } diff --git a/src/fisher/Fisher.h b/src/fisher/Fisher.h index c875273ac16ba65bcb1f74e5f9298cc86aa4cb16..01f2157d32343f0ca08449f453d2380b526db69a 100644 --- a/src/fisher/Fisher.h +++ b/src/fisher/Fisher.h @@ -24,7 +24,7 @@ private: unsigned long _dbLen; bool getFisher(); - unsigned long getTotalIntersection(RecordKeyList *hits); + unsigned long getTotalIntersection(RecordKeyVector &hits); }; #endif /* FISHER_H */ diff --git a/src/intersectFile/intersectFile.h b/src/intersectFile/intersectFile.h index 585de04427b49cd75f3b620e445f0dbb54c7ac7b..920ba1ab65c61492371ea03262c6c09f834d3d1b 100644 --- a/src/intersectFile/intersectFile.h +++ b/src/intersectFile/intersectFile.h @@ -12,7 +12,7 @@ #ifndef INTERSECTFILE_H #define INTERSECTFILE_H -#include "RecordKeyList.h" +#include "RecordKeyVector.h" using namespace std; diff --git a/src/utils/FileRecordTools/Records/RecordKeyVector.h b/src/utils/FileRecordTools/Records/RecordKeyVector.h index a7389d52cf9c1503171cd80f898172117e2e2bbd..2993e75d0ce2ff062183d254e25052386a9d28ba 100644 --- a/src/utils/FileRecordTools/Records/RecordKeyVector.h +++ b/src/utils/FileRecordTools/Records/RecordKeyVector.h @@ -9,12 +9,11 @@ #define KEYVECTOR_H_ - +#include "Record.h" +#include <vector> using namespace std; -#include "Record.h" -#include <vector> class RecordKeyVector { public: