Commit e4955d31 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #243 from nkindlon/master

Addressed BinTree memory blow-up; -m in fisher (w/ unit tests)
parents e2b4e53b 3010bf53
......@@ -5,66 +5,20 @@
BinTree::BinTree(ContextIntersect *context)
: _context(context),
_binOffsetsExtended(NULL),
_showBinMetrics(false),
_maxBinNumFound(0)
{
_binOffsetsExtended = new uint32_t[NUM_BIN_LEVELS];
memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(uint32_t));
_binOffsetsExtended = new binNumType[NUM_BIN_LEVELS];
memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(binNumType));
//start at idx 1, because the memset above already initialized
//the first idx to zero, which is what we want.
for (uint32_t i= 1; i < NUM_BIN_LEVELS; i++) {
for (binNumType i= 1; i < NUM_BIN_LEVELS; i++) {
_binOffsetsExtended[i] = _binOffsetsExtended[i-1] + (1 << ((NUM_BIN_LEVELS - i -1) * 3));
}
}
BinTree::~BinTree() {
//TBD: pass all elements in tree back to FRM for proper cleanup/deletion
for (mainMapType::iterator mainIter = _mainMap.begin(); mainIter != _mainMap.end(); mainIter++) {
allBinsType bins = mainIter->second;
if (bins == NULL) {
fprintf(stderr, "ERROR: In BinTree destructor: found chromosome with NULL bin array.\n");
continue;
}
if (!_showBinMetrics) { //don't clean up bins when simply reporting metrics.
for (uint32_t i=0; i < NUM_BINS; i++) {
binType bin = bins[i];
if (bin == NULL) {
continue;
}
for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
const Record *record = listIter->value();
_context->getFile(record->getFileIdx())->deleteRecord(record);
}
delete bin;
bin = NULL;
}
}
delete [] bins;
bins = NULL;
}
delete [] _binOffsetsExtended;
if (_showBinMetrics) {
map<int, int> hitsHistogram;
FILE *fp = fopen("BinsHitFile.txt", "w");
fprintf(fp, "The largest bin was %u\n", _maxBinNumFound);
fprintf(fp, "There were %d different bins hit by database.\n", (int)_binsHit.size());
for (map<uint32_t, int>::iterator binIter = _binsHit.begin(); binIter != _binsHit.end(); binIter++) {
uint32_t binNum = binIter->first;
int numHits = binIter->second;
fprintf(fp, "%u\t%d\n", binNum, numHits);
hitsHistogram[numHits]++;
}
fclose(fp);
fp = fopen("BinHitsHistogram.txt", "w");
fprintf(fp, "NumHits\tNumBins\n");
for (map<int, int>::iterator histIter = hitsHistogram.begin(); histIter != hitsHistogram.end(); histIter++) {
fprintf(fp, "%d\t%d\n", histIter->first, histIter->second);
}
fclose(fp);
}
}
void BinTree::loadDB()
......@@ -93,9 +47,6 @@ void BinTree::loadDB()
void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
{
if (_showBinMetrics) {
return; //don't care about query entries just yet.
}
if (record->isUnmapped()) {
return;
}
......@@ -106,11 +57,11 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
return;
}
uint32_t startBin = 0;
uint32_t endBin = 0;
binNumType startBin = 0;
binNumType endBin = 0;
uint32_t startPos = record->getStartPos();
uint32_t endPos = record->getEndPos();
binNumType startPos = record->getStartPos();
binNumType endPos = record->getEndPos();
startBin = (startPos >> _binFirstShift);
endBin = ((endPos-1) >> _binFirstShift);
......@@ -124,25 +75,19 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
hits if it meets all of the user's requests, which include:
(a) overlap fractio, (b) strandedness, (c) reciprocal overlap
*/
for (uint32_t i = 0; i < NUM_BIN_LEVELS; i++) {
uint32_t offset = _binOffsetsExtended[i];
for (uint32_t j = (startBin+offset); j <= (endBin+offset); j++) {
for (binNumType i = 0; i < NUM_BIN_LEVELS; i++) {
binNumType offset = _binOffsetsExtended[i];
for (binNumType j = (startBin+offset); j <= (endBin+offset); j++) {
// move to the next bin if this one is empty
const binType &bin = bins[j];
if (bin == NULL) {
//no list of records in this bin.
continue;
}
if (bin->empty()) {
//bin has list, but it's empty.
//Actually, this should never happen, so throw an error.
fprintf(stderr, "ERROR: found empty list for bin %u of chromosome %s\n",
j, chr.c_str());
allBinsType::const_iterator allBinsIter = bins.find(j);
if (allBinsIter == bins.end()) {
continue;
}
for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
const Record *dbRec = listIter->value();
const binType &bin = allBinsIter->second;
for (binType::const_iterator iter = bin.begin(); iter != bin.end(); iter++) {
const Record *dbRec = *iter;
if (record->intersects(dbRec, _context->getSameStrand(), _context->getDiffStrand(),
_context->getOverlapFraction(), _context->getReciprocal())) {
hitSet.push_back(dbRec);
......@@ -159,55 +104,31 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
bool BinTree::addRecordToTree(const Record *record)
{
// Get chr, bin. allocate all bins and single bins as needed.
// Get chr, bin.
const QuickString &chr = record->getChrName();
uint32_t startPos = (uint32_t)(record->getStartPos());
uint32_t endPos = (uint32_t)(record->getEndPos());
//is this chr currently in the main map?
allBinsType bins = NULL;
mainMapType::iterator mainIter = _mainMap.find(chr);
if (mainIter == _mainMap.end()) {
//this is a new chr NOT currently in the map.
bins = new binType[NUM_BINS];
memset(bins, 0, NUM_BINS * sizeof(binType));
_mainMap[chr] = bins;
} else {
bins = mainIter->second;
}
uint32_t binNum = getBin(startPos, endPos);
if (_showBinMetrics) {
if (binNum > _maxBinNumFound) {
_maxBinNumFound = binNum;
}
_binsHit[binNum]++;
return true;
}
binNumType startPos = (binNumType)(record->getStartPos());
binNumType endPos = (binNumType)(record->getEndPos());
binNumType binNum = getBin(startPos, endPos);
if (binNum < 0 || binNum >= NUM_BINS) {
fprintf(stderr, "ERROR: Received illegal bin number %u from getBin call.\n", binNum);
return false;
}
binType &bin = bins[binNum];
if (bin == NULL) {
bin = new innerListType;
}
bin->push_back(record);
_mainMap[chr][binNum].push_back(record);
return true;
}
uint32_t BinTree::getBin(const Record *record) const {
return getBin((uint32_t)(record->getStartPos()), (uint32_t)(record->getEndPos()));
BinTree::binNumType BinTree::getBin(const Record *record) const {
return getBin((binNumType)(record->getStartPos()), (binNumType)(record->getEndPos()));
}
uint32_t BinTree::getBin(uint32_t start, uint32_t end) const {
BinTree::binNumType BinTree::getBin(binNumType start, binNumType end) const {
--end;
start >>= _binFirstShift;
end >>= _binFirstShift;
for (uint32_t i = 0; i < NUM_BIN_LEVELS; ++i) {
for (binNumType i = 0; i < NUM_BIN_LEVELS; ++i) {
if (start == end) {
return _binOffsetsExtended[i] + start;
}
......
......@@ -37,8 +37,9 @@ private:
//
// BIN HANDLING
//
static const uint32_t NUM_BINS = 37450;
static const uint32_t NUM_BIN_LEVELS = 7;
typedef uint32_t binNumType;
static const binNumType NUM_BINS = 37450;
static const binNumType NUM_BIN_LEVELS = 7;
// bins range in size from 16kb to 512Mb
// Bin 0 spans 512Mbp, # Level 1
......@@ -47,25 +48,30 @@ private:
// Bins 73-584 span 1Mbp # Level 4
// Bins 585-4680 span 128Kbp # Level 5
// Bins 4681-37449 span 16Kbp # Level 6
uint32_t *_binOffsetsExtended;
static const uint32_t _binFirstShift = 14; /* How much to shift to get to finest bin. */
static const uint32_t _binNextShift = 3; /* How much to shift to get to next larger bin. */
typedef RecordList innerListType;
typedef const RecordListNode * innerListIterType;
typedef innerListType * binType;
typedef binType * allBinsType;
typedef QuickString mainKeyType;
typedef map<mainKeyType, allBinsType> mainMapType;
binNumType *_binOffsetsExtended;
static const binNumType _binFirstShift = 14; /* How much to shift to get to finest bin. */
static const binNumType _binNextShift = 3; /* How much to shift to get to next larger bin. */
// typedef RecordList innerListType;
// typedef const RecordListNode * innerListIterType;
// typedef innerListType * binType;
// typedef binType * allBinsType;
// typedef QuickString mainKeyType;
// typedef map<mainKeyType, allBinsType> mainMapType;
// mainMapType _mainMap;
typedef vector<const Record *> binType;
typedef map<binNumType, binType> allBinsType; //for each bin number, have a RecordList
typedef map<QuickString, allBinsType> mainMapType; //for each chrom, a map of bin num to RecordLists.
mainMapType _mainMap;
bool _showBinMetrics;
uint32_t _maxBinNumFound;
map<uint32_t, int> _binsHit;
binNumType _maxBinNumFound;
map<binNumType, int> _binsHit;
bool addRecordToTree(const Record *);
uint32_t getBin(uint32_t start, uint32_t end) const;
uint32_t getBin(const Record *record) const;
binNumType getBin(binNumType start, binNumType end) const;
binNumType getBin(const Record *record) const;
};
......
......@@ -7,7 +7,8 @@
ContextFisher::ContextFisher() {
setSortedInput(true);
setUseMergedIntervals(false);
setUseMergedIntervals(false); //by default,
//intervals are merged in Jaccard, but unmerged in fisher.
}
ContextFisher::~ContextFisher() {
......@@ -21,45 +22,23 @@ bool ContextFisher::parseCmdArgs(int argc, char **argv, int skipFirstArgs)
else if (strcmp(_argv[_i], "-exclude") == 0) {
if (!handle_exclude()) return false;
}
else if (strcmp(_argv[_i], "-m") == 0) {
if (!handle_m()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
}
bool ContextFisher::isValidState()
{
if (!ContextIntersect::isValidState()) {
if (!ContextJaccard::isValidState()) {
return false;
}
// Tests for stranded merge
//
if (_desiredStrand != FileRecordMergeMgr::ANY_STRAND) { // requested stranded merge
for (int i=0; i < getNumInputFiles(); i++) {
// make sure file has strand.
if (!getFile(i)->recordsHaveStrand()) {
_errorMsg = "\n***** ERROR: stranded merge requested, but input file ";
_errorMsg += getInputFileName(i);
_errorMsg += " does not have strands. *****";
return false;
}
//make sure file is not VCF.
if (getFile(1)->getFileType() == FileRecordTypeChecker::VCF_FILE_TYPE) {
_errorMsg = "\n***** ERROR: stranded merge not supported for VCF file ";
_errorMsg += getInputFileName(i);
_errorMsg += ". *****";
return false;
}
}
}
if (_genomeFile == NULL){
_errorMsg = "\nERROR*****: specify -g genome file*****\n";
return false;
}
//column operations not allowed with BAM input
if (hasColumnOpsMethods() &&
getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
_errorMsg = "\n***** ERROR: stranded merge not supported for VCF files. *****";
return false;
}
return true;
}
......@@ -79,3 +58,11 @@ bool ContextFisher::handle_exclude()
return true;
}
bool ContextFisher::handle_m()
{
setUseMergedIntervals(true);
markUsed(_i - _skipFirstArgs);
_i++;
return true;
}
......@@ -24,6 +24,7 @@ public:
protected:
string _excludeFile;
bool handle_exclude();
bool handle_m();
};
......
......@@ -58,12 +58,6 @@ bool ContextJaccard::isValidState()
}
}
}
//column operations not allowed with BAM input
if (hasColumnOpsMethods() &&
getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
_errorMsg = "\n***** ERROR: stranded merge not supported for VCF files. *****";
return false;
}
return true;
}
......
chr1 10 20
chr1 12 19
chr1 30 40
chr1 51 52
......@@ -51,3 +51,45 @@ left right two-tail ratio
$BT fisher -a a.bed -b b.bed -g t.60.genome > obs
check obs exp
rm obs exp
echo " fisher.t3...\c"
echo \
"# Number of query intervals: 4
# Number of db intervals: 2
# Number of overlaps: 3
# Number of possible intervals (estimated): 4
# phyper(3 - 1, 4, 4 - 4, 2, lower.tail=F)
# Contingency Table Of Counts
#_________________________________________
# | in -b | not in -b |
# in -a | 3 | 1 |
# not in -a | 0 | 0 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1 1 1 -nan" > exp
$BT fisher -a a_merge.bed -b b.bed -g t.60.genome > obs
check obs exp
rm obs exp
echo " fisher.t4...\c"
echo \
"# Number of query intervals: 3
# Number of db intervals: 2
# Number of overlaps: 2
# Number of possible intervals (estimated): 4
# phyper(2 - 1, 3, 4 - 3, 2, lower.tail=F)
# Contingency Table Of Counts
#_________________________________________
# | in -b | not in -b |
# in -a | 2 | 1 |
# not in -a | 0 | 1 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1 0.5 1 inf" > exp
$BT fisher -a a_merge.bed -b b.bed -g t.60.genome -m > obs
check obs exp
rm obs exp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment