BinTree.cpp 6.82 KB
Newer Older
nkindlon's avatar
nkindlon committed
1
2
3
4
#include "BinTree.h"
#include "FileRecordMgr.h"


5
BinTree::BinTree(int databaseFileIdx, ContextIntersect *context)
nkindlon's avatar
nkindlon committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
: _databaseFileIdx(databaseFileIdx),
  _context(context),
  _binOffsetsExtended(NULL),
  _dbFileMgr(NULL),
  _showBinMetrics(false),
  _maxBinNumFound(0)
 {
	_binOffsetsExtended = new uint32_t[NUM_BIN_LEVELS];
	memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(uint32_t));

	//start at idx 1, because the memset above already initialized
	//the first idx to zero, which is what we want.
	for (uint32_t i= 1; i < NUM_BIN_LEVELS; i++) {
		_binOffsetsExtended[i] = _binOffsetsExtended[i-1] + (1 << ((NUM_BIN_LEVELS - i -1) * 3));
	}
}

BinTree::~BinTree() {
	//TBD: pass all elements in tree back to FRM for proper cleanup/deletion
	for (mainMapType::iterator mainIter = _mainMap.begin(); mainIter != _mainMap.end(); mainIter++) {
		allBinsType bins = mainIter->second;
		if (bins == NULL) {
			fprintf(stderr, "ERROR: In BinTree destructor: found chromosome with NULL bin array.\n");
			continue;
		}
		if (!_showBinMetrics) { //don't clean up bins when simply reporting metrics.

			for (uint32_t i=0; i < NUM_BINS; i++) {
				binType bin = bins[i];
				if (bin == NULL) {
					continue;
				}
				for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
					const Record *record = listIter->value();
					_dbFileMgr->deleteRecord(record);
				}
				delete bin;
				bin = NULL;
			}
		}
		delete [] bins;
		bins = NULL;
	}
	if (_dbFileMgr != NULL) {
		delete _dbFileMgr;
		_dbFileMgr = NULL;
	}
	delete [] _binOffsetsExtended;

	if (_showBinMetrics) {
		map<int, int> hitsHistogram;
		FILE *fp = fopen("BinsHitFile.txt", "w");
		fprintf(fp, "The largest bin was %u\n", _maxBinNumFound);
		fprintf(fp, "There were %d different bins hit by database.\n", (int)_binsHit.size());
		for (map<uint32_t, int>::iterator binIter = _binsHit.begin(); binIter != _binsHit.end(); binIter++) {
			uint32_t binNum = binIter->first;
			int numHits = binIter->second;
			fprintf(fp, "%u\t%d\n", binNum, numHits);
			hitsHistogram[numHits]++;
		}
		fclose(fp);
		fp = fopen("BinHitsHistogram.txt", "w");
		fprintf(fp, "NumHits\tNumBins\n");
		for (map<int, int>::iterator histIter = hitsHistogram.begin(); histIter != hitsHistogram.end(); histIter++) {
			fprintf(fp, "%d\t%d\n", histIter->first, histIter->second);
		}
		fclose(fp);
	}
}

bool BinTree::loadDB()
{
	_dbFileMgr = new FileRecordMgr(_databaseFileIdx, _context);
	if (!_dbFileMgr->open()) {
		fprintf(stderr, "ERROR: Can't open database file %s to build tree.\n", _context->getInputFileName(_databaseFileIdx).c_str());
		delete _dbFileMgr;
		_dbFileMgr = NULL;
		return false;
	}
	Record *record = NULL;
	while (!_dbFileMgr->eof()) {
		record = _dbFileMgr->allocateAndGetNextRecord();
88
89
		//In addition to NULL records, we also don't want to add unmapped reads.
		if (record == NULL || record->isUnmapped()) {
nkindlon's avatar
nkindlon committed
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
			continue;
		}

		if (!addRecordToTree(record)) {
			fprintf(stderr, "ERROR: Unable to add record to tree.\n");
			_dbFileMgr->close();
			return false;
		}
	}
	_dbFileMgr->close();

	//TBD: give ERROR and return false if tree is empty.
	if (_mainMap.empty()) {
		fprintf(stderr, "ERROR: Tree is empty, no records added.\n");
		return false;
	}
	return true;

}

void BinTree::getHits(Record *record, RecordKeyList &hitSet)
{
	if (_showBinMetrics) {
		return; //don't care about query entries just yet.
	}
115
116
117
	if (record->isUnmapped()) {
		return;
	}
nkindlon's avatar
nkindlon committed
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
    const QuickString &chr = record->getChrName();
	mainMapType::iterator mainIter = _mainMap.find(chr);
	if (mainIter == _mainMap.end()) {
		//given chrom not even in map.
		return;
	}

    uint32_t startBin = 0;
    uint32_t endBin = 0;

    uint32_t startPos = record->getStartPos();
    uint32_t endPos = record->getEndPos();

    startBin = (startPos >> _binFirstShift);
    endBin = ((endPos-1) >> _binFirstShift);


	const allBinsType bins = mainIter->second;

    /* SYNOPSIS:
         1. We loop through each UCSC BIN level for feature A's chrom.
         2. For each BIN, we loop through each B feature and add it to
            hits if it meets all of the user's requests, which include:
               (a) overlap fractio, (b) strandedness, (c) reciprocal overlap
    */
    for (uint32_t i = 0; i < NUM_BIN_LEVELS; i++) {
        uint32_t offset = _binOffsetsExtended[i];
        for (uint32_t j = (startBin+offset); j <= (endBin+offset); j++)  {

        	// move to the next bin if this one is empty
        	const binType &bin = bins[j];
        	if (bin == NULL) {
        		//no list of records in this bin.
        		continue;
        	}
        	if (bin->empty()) {
        		//bin has list, but it's empty.
        		//Actually, this should never happen, so throw an error.
        		fprintf(stderr, "ERROR: found empty list for bin %u of chromosome %s\n",
        				j, chr.c_str());
        		continue;
        	}
            for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
            	const Record *dbRec = listIter->value();
            	if (record->intersects(dbRec, _context->getSameStrand(), _context->getDiffStrand(),
            			_context->getOverlapFraction(), _context->getReciprocal())) {
            		hitSet.push_back(dbRec);
            	}
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
}

bool BinTree::addRecordToTree(const Record *record)
{
175
	// Get chr, bin. allocate all bins and single bins as needed.
nkindlon's avatar
nkindlon committed
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
	const QuickString &chr = record->getChrName();
	uint32_t startPos = (uint32_t)(record->getStartPos());
	uint32_t endPos = (uint32_t)(record->getEndPos());

	//is this chr currently in the main map?
	allBinsType bins = NULL;
	mainMapType::iterator mainIter = _mainMap.find(chr);
	if (mainIter == _mainMap.end()) {
		//this is a new chr NOT currently in the map.
		bins = new binType[NUM_BINS];
		memset(bins, 0, NUM_BINS * sizeof(binType));
		_mainMap[chr] = bins;
	} else {
		bins = mainIter->second;
	}
	uint32_t binNum = getBin(startPos, endPos);

	if (_showBinMetrics) {
		if (binNum > _maxBinNumFound) {
			_maxBinNumFound = binNum;
		}
		_binsHit[binNum]++;
		return true;
	}

	if (binNum < 0 || binNum >= NUM_BINS) {
		fprintf(stderr, "ERROR: Received illegal bin number %u from getBin call.\n", binNum);
		return false;
	}
	binType &bin = bins[binNum];
	if (bin == NULL) {
		bin = new innerListType;
	}
	bin->push_back(record);

	return true;
}

uint32_t BinTree::getBin(const Record *record) const {
	return getBin((uint32_t)(record->getStartPos()), (uint32_t)(record->getEndPos()));
}

uint32_t BinTree::getBin(uint32_t start, uint32_t end) const {
    --end;
    start >>= _binFirstShift;
    end   >>= _binFirstShift;

    for (uint32_t i = 0; i < NUM_BIN_LEVELS; ++i) {
        if (start == end) {
        	return _binOffsetsExtended[i] + start;
        }
        start >>= _binNextShift;
        end   >>= _binNextShift;
    }
    //failure
    return -1;
}