BinTree.cpp 6.29 KB
Newer Older
nkindlon's avatar
nkindlon committed
1
2
3
4
#include "BinTree.h"
#include "FileRecordMgr.h"


5
6
BinTree::BinTree(ContextIntersect *context)
: _databaseFile(NULL),
nkindlon's avatar
nkindlon committed
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
  _context(context),
  _binOffsetsExtended(NULL),
  _showBinMetrics(false),
  _maxBinNumFound(0)
 {
	_binOffsetsExtended = new uint32_t[NUM_BIN_LEVELS];
	memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(uint32_t));

	//start at idx 1, because the memset above already initialized
	//the first idx to zero, which is what we want.
	for (uint32_t i= 1; i < NUM_BIN_LEVELS; i++) {
		_binOffsetsExtended[i] = _binOffsetsExtended[i-1] + (1 << ((NUM_BIN_LEVELS - i -1) * 3));
	}
}

BinTree::~BinTree() {
	//TBD: pass all elements in tree back to FRM for proper cleanup/deletion
	for (mainMapType::iterator mainIter = _mainMap.begin(); mainIter != _mainMap.end(); mainIter++) {
		allBinsType bins = mainIter->second;
		if (bins == NULL) {
			fprintf(stderr, "ERROR: In BinTree destructor: found chromosome with NULL bin array.\n");
			continue;
		}
		if (!_showBinMetrics) { //don't clean up bins when simply reporting metrics.

			for (uint32_t i=0; i < NUM_BINS; i++) {
				binType bin = bins[i];
				if (bin == NULL) {
					continue;
				}
				for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
					const Record *record = listIter->value();
39
					_databaseFile->deleteRecord(record);
nkindlon's avatar
nkindlon committed
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
				}
				delete bin;
				bin = NULL;
			}
		}
		delete [] bins;
		bins = NULL;
	}
	delete [] _binOffsetsExtended;

	if (_showBinMetrics) {
		map<int, int> hitsHistogram;
		FILE *fp = fopen("BinsHitFile.txt", "w");
		fprintf(fp, "The largest bin was %u\n", _maxBinNumFound);
		fprintf(fp, "There were %d different bins hit by database.\n", (int)_binsHit.size());
		for (map<uint32_t, int>::iterator binIter = _binsHit.begin(); binIter != _binsHit.end(); binIter++) {
			uint32_t binNum = binIter->first;
			int numHits = binIter->second;
			fprintf(fp, "%u\t%d\n", binNum, numHits);
			hitsHistogram[numHits]++;
		}
		fclose(fp);
		fp = fopen("BinHitsHistogram.txt", "w");
		fprintf(fp, "NumHits\tNumBins\n");
		for (map<int, int>::iterator histIter = hitsHistogram.begin(); histIter != hitsHistogram.end(); histIter++) {
			fprintf(fp, "%d\t%d\n", histIter->first, histIter->second);
		}
		fclose(fp);
	}
}

71
void BinTree::loadDB()
nkindlon's avatar
nkindlon committed
72
{
73
74
	_databaseFile = _context->getFile(_context->getDatabaseFileIdx());

nkindlon's avatar
nkindlon committed
75
	Record *record = NULL;
76
	while (!_databaseFile->eof()) {
nkindlon's avatar
nkindlon committed
77
		record = _databaseFile->getNextRecord();
78
79
		//In addition to NULL records, we also don't want to add unmapped reads.
		if (record == NULL || record->isUnmapped()) {
nkindlon's avatar
nkindlon committed
80
81
82
83
84
			continue;
		}

		if (!addRecordToTree(record)) {
			fprintf(stderr, "ERROR: Unable to add record to tree.\n");
85
86
			_databaseFile->close();
			exit(1);
nkindlon's avatar
nkindlon committed
87
88
89
90
91
92
93
94
95
		}
	}
}

void BinTree::getHits(Record *record, RecordKeyList &hitSet)
{
	if (_showBinMetrics) {
		return; //don't care about query entries just yet.
	}
96
97
98
	if (record->isUnmapped()) {
		return;
	}
nkindlon's avatar
nkindlon committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    const QuickString &chr = record->getChrName();
	mainMapType::iterator mainIter = _mainMap.find(chr);
	if (mainIter == _mainMap.end()) {
		//given chrom not even in map.
		return;
	}

    uint32_t startBin = 0;
    uint32_t endBin = 0;

    uint32_t startPos = record->getStartPos();
    uint32_t endPos = record->getEndPos();

    startBin = (startPos >> _binFirstShift);
    endBin = ((endPos-1) >> _binFirstShift);


	const allBinsType bins = mainIter->second;

    /* SYNOPSIS:
         1. We loop through each UCSC BIN level for feature A's chrom.
         2. For each BIN, we loop through each B feature and add it to
            hits if it meets all of the user's requests, which include:
               (a) overlap fractio, (b) strandedness, (c) reciprocal overlap
    */
    for (uint32_t i = 0; i < NUM_BIN_LEVELS; i++) {
        uint32_t offset = _binOffsetsExtended[i];
        for (uint32_t j = (startBin+offset); j <= (endBin+offset); j++)  {

        	// move to the next bin if this one is empty
        	const binType &bin = bins[j];
        	if (bin == NULL) {
        		//no list of records in this bin.
        		continue;
        	}
        	if (bin->empty()) {
        		//bin has list, but it's empty.
        		//Actually, this should never happen, so throw an error.
        		fprintf(stderr, "ERROR: found empty list for bin %u of chromosome %s\n",
        				j, chr.c_str());
        		continue;
        	}
            for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
            	const Record *dbRec = listIter->value();
            	if (record->intersects(dbRec, _context->getSameStrand(), _context->getDiffStrand(),
            			_context->getOverlapFraction(), _context->getReciprocal())) {
            		hitSet.push_back(dbRec);
            	}
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
}

bool BinTree::addRecordToTree(const Record *record)
{
156
	// Get chr, bin. allocate all bins and single bins as needed.
nkindlon's avatar
nkindlon committed
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
	const QuickString &chr = record->getChrName();
	uint32_t startPos = (uint32_t)(record->getStartPos());
	uint32_t endPos = (uint32_t)(record->getEndPos());

	//is this chr currently in the main map?
	allBinsType bins = NULL;
	mainMapType::iterator mainIter = _mainMap.find(chr);
	if (mainIter == _mainMap.end()) {
		//this is a new chr NOT currently in the map.
		bins = new binType[NUM_BINS];
		memset(bins, 0, NUM_BINS * sizeof(binType));
		_mainMap[chr] = bins;
	} else {
		bins = mainIter->second;
	}
	uint32_t binNum = getBin(startPos, endPos);

	if (_showBinMetrics) {
		if (binNum > _maxBinNumFound) {
			_maxBinNumFound = binNum;
		}
		_binsHit[binNum]++;
		return true;
	}

	if (binNum < 0 || binNum >= NUM_BINS) {
		fprintf(stderr, "ERROR: Received illegal bin number %u from getBin call.\n", binNum);
		return false;
	}
	binType &bin = bins[binNum];
	if (bin == NULL) {
		bin = new innerListType;
	}
	bin->push_back(record);

	return true;
}

uint32_t BinTree::getBin(const Record *record) const {
	return getBin((uint32_t)(record->getStartPos()), (uint32_t)(record->getEndPos()));
}

uint32_t BinTree::getBin(uint32_t start, uint32_t end) const {
    --end;
    start >>= _binFirstShift;
    end   >>= _binFirstShift;

    for (uint32_t i = 0; i < NUM_BIN_LEVELS; ++i) {
        if (start == end) {
        	return _binOffsetsExtended[i] + start;
        }
        start >>= _binNextShift;
        end   >>= _binNextShift;
    }
    //failure
    return -1;
}