Commit 436e6f4e authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Bug fixes for closest and bug 155

parent b70b6caf
......@@ -24,7 +24,7 @@ int closest_main(int argc, char* argv[]) {
}
closest_help();
delete context;
return 0;
return 1;
}
ClosestFile *closestFile = new ClosestFile(context);
......
......@@ -16,7 +16,7 @@ int fisher_main(int argc, char* argv[]) {
}
fisher_help();
delete context;
return 0;
return 1;
}
Fisher *fisher = new Fisher(context);
......
......@@ -30,7 +30,7 @@ int intersect_main(int argc, char* argv[]) {
}
intersect_help();
delete context;
return 0;
return 1;
}
FileIntersect *fileIntersect = new FileIntersect(context);
......
......@@ -27,7 +27,7 @@ int jaccard_main(int argc, char* argv[]) {
}
jaccard_help();
delete context;
return 0;
return 1;
}
Jaccard *jaccard = new Jaccard(context);
......
......@@ -29,7 +29,7 @@ int map_main(int argc, char* argv[]) {
}
map_help();
delete context;
return 0;
return 1;
}
FileMap *fileMap = new FileMap(context);
......
......@@ -33,7 +33,7 @@ int merge_main(int argc, char* argv[]) {
}
merge_help();
delete context;
return 0;
return 1;
}
MergeFile *mergeFile = new MergeFile(context);
......
......@@ -22,7 +22,7 @@ int sample_main(int argc, char **argv)
}
sample_help();
delete context;
return 0;
return 1;
}
SampleFile *sampleFile = new SampleFile(context);
......
......@@ -157,7 +157,7 @@ bool ContextClosest::handle_iu() {
}
bool ContextClosest::handle_id() {
_ignoreUpstream = true;
_ignoreDownstream = true;
markUsed(_i - _skipFirstArgs);
return true;
}
......
......@@ -190,9 +190,9 @@ int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hit
}
}
if (hitHasOverlap) {
if ((float) totalHitOverlap / (float)keyBlocksSumLength > _overlapFraction) {
if ((float) totalHitOverlap / (float)keyBlocksSumLength >= _overlapFraction) {
if (_hasReciprocal &&
((float)totalHitOverlap / (float)hitBlockSumLength > _overlapFraction)) {
((float)totalHitOverlap / (float)hitBlockSumLength >= _overlapFraction)) {
_overlapBases.push_back(totalHitOverlap);
resultList.push_back(*hitListIter);
} else if (!_hasReciprocal) {
......
......@@ -19,6 +19,7 @@ CloseSweep::CloseSweep(ContextClosest *context)
_minUpstreamDist.resize(_numDBs, INT_MAX);
_minDownstreamDist.resize(_numDBs,INT_MAX);
_maxPrevLeftClosestEndPos.resize(_numDBs, 0);
_maxPrevLeftClosestEndPosReverse.resize(_numDBs, 0);
for (int i=0; i < _numDBs; i++) {
_minUpstreamRecs[i] = new distRecVecType();
......@@ -70,19 +71,21 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
// skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary
bool stopScanning = false;
while (_currDbRecs[i] != NULL &&
_currQueryRec->sameChrom(_currDbRecs[i]) &&
!stopScanning) {
if (considerRecord(_currDbRecs[i], i, stopScanning) == DELETE) {
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
} else {
_caches[i].push_back(_currDbRecs[i]);
_currDbRecs[i] = NULL;
// if (_currDbRecs[i] != NULL || nextRecord(false, i)) {
bool stopScanning = false;
while (_currDbRecs[i] != NULL &&
_currQueryRec->sameChrom(_currDbRecs[i]) &&
!stopScanning) {
if (considerRecord(_currDbRecs[i], i, stopScanning) == DELETE) {
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
} else {
_caches[i].push_back(_currDbRecs[i]);
_currDbRecs[i] = NULL;
}
nextRecord(false, i);
}
nextRecord(false, i);
}
// }
}
finalizeSelections(i, retList);
}
......@@ -148,7 +151,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
// HIT INTERSECTS QUERY
if (!_context->ignoreOverlaps()) {
_overlapRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
setLeftClosestEndPos(dbIdx, cacheRec);
}
return IGNORE;
} else if (cacheRec->after(_currQueryRec)) {
......@@ -159,15 +162,15 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
currDist = (cacheRec->getStartPos() - _currQueryRec->getEndPos()) + 1;
if (_context->signDistance()) {
if ((_context->getStrandedDistMode() == ContextClosest::A_DIST && _currQueryRec->getStrandVal() == Record::REVERSE) ||
(_context->getStrandedDistMode() == ContextClosest::B_DIST && cacheRec->getStrandVal() == Record::REVERSE))
(_context->getStrandedDistMode() == ContextClosest::B_DIST && cacheRec->getStrandVal() == Record::FORWARD))
{
// hit is "upstream" of A
if (_context->ignoreUpstream()) {
return IGNORE;
}
else {
if (currDist <= abs(_minUpstreamDist[dbIdx])) {
if (currDist < abs(_minUpstreamDist[dbIdx])) {
if (currDist<= abs(_minUpstreamDist[dbIdx])) {
if (currDist< abs(_minUpstreamDist[dbIdx])) {
_minUpstreamDist[dbIdx] = currDist * -1;
_minUpstreamRecs[dbIdx]->clear();
}
......@@ -177,14 +180,15 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
_minUpstreamRecs[dbIdx]->push_back(cacheRec);
return IGNORE;
} else {
return DELETE;
stopScanning = true;
return IGNORE;
}
}
}
}
// HIT IS DOWNSTREAM.
// MUST FIRST DETERMINE WHETHER TO STOP SCANNING.
if (currDist > abs(_minDownstreamDist[dbIdx])) {
if (currDist> abs(_minDownstreamDist[dbIdx])) {
stopScanning = true;
return IGNORE;
}
......@@ -193,7 +197,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
}
//Still here? Valid hit.
if (currDist <= abs(_minDownstreamDist[dbIdx])) {
if (currDist < abs(_minDownstreamDist[dbIdx])) {
if (currDist< abs(_minDownstreamDist[dbIdx])) {
_minDownstreamDist[dbIdx] = currDist;
_minDownstreamRecs[dbIdx]->clear();
}
......@@ -204,13 +208,12 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
// HIT IS TO THE LEFT OF THE QUERY.
// First see if we can purge this record from the cache. If it's further left than the last record that was left, delete it.
if (cacheRec->getEndPos() < _maxPrevLeftClosestEndPos[dbIdx]) return DELETE;
if (beforeLeftClosestEndPos(dbIdx, cacheRec)) return DELETE;
currDist = (_currQueryRec->getStartPos() - cacheRec->getEndPos()) + 1;
if (_context->signDistance()) {
if ((_context->getStrandedDistMode() == ContextClosest::REF_DIST) ||
(_context->getStrandedDistMode() == ContextClosest::A_DIST && _currQueryRec->getStrandVal() != Record::REVERSE) ||
(_context->getStrandedDistMode() == ContextClosest::B_DIST && cacheRec->getStrandVal() != Record::REVERSE))
if ((_context->getStrandedDistMode() == ContextClosest::A_DIST && _currQueryRec->getStrandVal() == Record::REVERSE) ||
(_context->getStrandedDistMode() == ContextClosest::B_DIST && cacheRec->getStrandVal() == Record::FORWARD))
{
// HIT IS DOWNSTREAM.
// MUST FIRST DETERMINE WHETHER TO STOP SCANNING.
......@@ -223,31 +226,31 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
//Still here? Valid hit.
if (currDist <= abs(_minDownstreamDist[dbIdx])) {
if (currDist < abs(_minDownstreamDist[dbIdx])) {
_minDownstreamDist[dbIdx] = currDist * -1;
_minDownstreamDist[dbIdx] = currDist;
_minDownstreamRecs[dbIdx]->clear();
}
_minDownstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
setLeftClosestEndPos(dbIdx, cacheRec);
return IGNORE;
}
}
}
// hit is "upstream" of A
// hit is "UPSTREAM" of A
if (_context->ignoreUpstream()) {
return IGNORE;
}
if (currDist <= abs(_minUpstreamDist[dbIdx])) {
if (currDist < abs(_minUpstreamDist[dbIdx])) {
_minUpstreamDist[dbIdx] = currDist;
_minUpstreamDist[dbIdx] = currDist * -1;
_minUpstreamRecs[dbIdx]->clear();
}
_minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
setLeftClosestEndPos(dbIdx, cacheRec);
return IGNORE;
} else if (currDist == abs(_minUpstreamDist[dbIdx])) {
_minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
setLeftClosestEndPos(dbIdx, cacheRec);
return IGNORE;
} else {
return DELETE;
......@@ -418,12 +421,14 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan)
//then we have to clear the cache.
if (!_caches[dbIdx].empty() && queryChromAfterDbRec(_caches[dbIdx].begin()->value())) {
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
clearClosestEndPos(dbIdx);
}
return false;
}
if (haveDB && haveQuery && dbRecAfterQueryChrom(dbRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) {
if (!haveQuery || !haveDB) return false;
if (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec))) {
//the newest DB record's chrom is ahead of the query, but the cache still
//has old records on that query's chrom
scanCache(dbIdx, retList);
......@@ -431,7 +436,6 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan)
return true;
}
if (!haveQuery || !haveDB) return false;
// the query is ahead of the database. fast-forward the database to catch-up.
if (queryChromAfterDbRec(dbRec)) {
......@@ -440,9 +444,10 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan)
queryChromAfterDbRec(dbRec)) {
_dbFRMs[dbIdx]->deleteRecord(dbRec);
nextRecord(false, dbIdx);
dbRec = _currDbRecs[dbIdx];
}
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
clearClosestEndPos(dbIdx);
return false;
}
// the database is ahead of the query.
......@@ -468,12 +473,66 @@ bool CloseSweep::dbRecAfterQueryChrom(const Record *dbRec)
//see if the db has both it's curr chrom and the query's curr chrom.
const _orderTrackType *track = _fileTracks[dbRec->getFileIdx()];
_orderTrackType::const_iterator iter = track->find(dbRec->getChrName());
if (iter == track->end()) return false; //db file does not contain the curr chrom
int dbOrder = iter->second;
iter = track->find(_currQueryRec->getChrName());
if (iter == track->end()) return false; //db file does not contain the query chrom.
if (iter == track->end()) return false; // query file does not contain the db chrom.
int qOrder = iter->second;
return (dbOrder > qOrder);
}
void CloseSweep::setLeftClosestEndPos(int dbIdx, const Record *rec)
{
int recEndPos = rec->getEndPos();
if (!_context->getSameStrand() && !_context->getDiffStrand()) {
_maxPrevLeftClosestEndPos[dbIdx] = recEndPos;
} else {
if (_context->getSameStrand()) {
if (rec->getStrandVal() == Record::FORWARD) {
_maxPrevLeftClosestEndPos[dbIdx] = recEndPos;
} else {
_maxPrevLeftClosestEndPosReverse[dbIdx] = recEndPos;
}
} else {
//want diff strand
if (rec->getStrandVal() == Record::FORWARD) {
_maxPrevLeftClosestEndPosReverse[dbIdx] = recEndPos;
} else {
_maxPrevLeftClosestEndPos[dbIdx] = recEndPos;
}
}
}
}
bool CloseSweep::beforeLeftClosestEndPos(int dbIdx, const Record *rec)
{
int recEndPos = rec->getEndPos();
int prevPos = _maxPrevLeftClosestEndPos[dbIdx];
int prevPosReverse = _maxPrevLeftClosestEndPosReverse[dbIdx];
if (!_context->getSameStrand() && !_context->getDiffStrand()) {
return recEndPos < prevPos;
} else {
if (_context->getSameStrand()) {
if (rec->getStrandVal() == Record::FORWARD) {
return recEndPos < prevPos;
} else {
return recEndPos < prevPosReverse;
}
} else {
//want diff strand
if (rec->getStrandVal() == Record::FORWARD) {
return recEndPos < prevPosReverse;
} else {
return recEndPos < prevPos;
}
}
}
}
void CloseSweep::clearClosestEndPos(int dbIdx)
{
_maxPrevLeftClosestEndPos[dbIdx] = 0;
_maxPrevLeftClosestEndPosReverse[dbIdx] = 0;
}
......@@ -29,6 +29,7 @@ private:
vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _maxPrevLeftClosestEndPosReverse;
vector<int> _finalDistances;
......@@ -48,6 +49,10 @@ private:
rateOvlpType considerRecord(const Record *cacheRec, int dbIdx, bool &stopScanning);
void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
void setLeftClosestEndPos(int dbIdx, const Record *rec);
bool beforeLeftClosestEndPos(int dbIdx, const Record *rec);
void clearClosestEndPos(int dbIdx);
};
......
......@@ -25,7 +25,10 @@ NewChromSweep::NewChromSweep(ContextIntersect *context)
_databaseTotalRecords(0),
_wasInitialized(false),
_currQueryRec(NULL),
_runToQueryEnd(false)
_runToQueryEnd(false),
_lexicoDisproven(false),
_lexicoAssumed(false),
_lexicoAssumedFileIdx(-1)
{
}
......@@ -211,6 +214,7 @@ bool NewChromSweep::chromChange(int dbIdx, RecordKeyVector &retList, bool wantSc
bool NewChromSweep::next(RecordKeyVector &retList) {
retList.clearVector();
//make sure the first read of the query file is tested for chrom sort order.
bool needTestSortOrder = false;
if (_currQueryRec != NULL) {
......@@ -228,9 +232,6 @@ bool NewChromSweep::next(RecordKeyVector &retList) {
return false;
}
_currQueryChromName = _currQueryRec->getChrName();
// if (_currQueryChromName == "chrUn_gl000230" && _currQueryRec->getStartPos() == 40971) {
// printf("Break point here.\n");
// }
masterScan(retList);
......@@ -320,6 +321,17 @@ void NewChromSweep::testChromOrder(const Record *rec)
rec->print(stderr, true);
exit(1);
}
if (!_lexicoDisproven && chrom < prevChrom) {
if (_lexicoAssumed) {
// ERROR.
fprintf(stderr, "ERROR: Sort order was unspecified, and file %s is not sorted lexicographically.\n",
_context->getInputFileName(fileIdx).c_str());
fprintf(stderr, " Please re-reun with the -g option for a genome file.\n See documentation for details.\n");
exit(1);
}
_lexicoDisproven = true;
}
}
bool NewChromSweep::queryChromAfterDbRec(const Record *dbRec)
......@@ -330,12 +342,19 @@ bool NewChromSweep::queryChromAfterDbRec(const Record *dbRec)
return (_currQueryRec->getChromId() > dbRec->getChromId()) ;
}
//see if query has both
const QuickString &qChrom = _currQueryRec->getChrName();
const QuickString &dbChrom = dbRec->getChrName();
const _orderTrackType *track = _fileTracks[_currQueryRec->getFileIdx()];
_orderTrackType::const_iterator iter = track->find(_currQueryRec->getChrName());
if (iter == track->end()) return false; //query file does not contain the curr chrom
_orderTrackType::const_iterator iter = track->find(qChrom);
int qOrder = iter->second;
iter = track->find(dbRec->getChrName());
if (iter == track->end()) return false; //db file does not contain the dbChrom.
iter = track->find(dbChrom);
if (iter == track->end()) {
//query file does not contain the dbChrom.
//try a lexicographical comparison, if possible.
return testLexicoQueryAfterDb(_currQueryRec, dbRec);
}
int dbOrder = iter->second;
return (qOrder > dbOrder);
......@@ -382,7 +401,7 @@ bool NewChromSweep::verifyChromOrderMismatch(const QuickString & chrom, const Qu
void NewChromSweep::testThatAllDbChromsExistInQuery()
{
if (_context->hasGenomeFile()) return;
if (_context->hasGenomeFile() || !_lexicoDisproven) return;
int queryIdx = _context->getQueryFileIdx();
//get the query file track. Then check that every chrom in every db exists in it.
......@@ -402,3 +421,17 @@ void NewChromSweep::testThatAllDbChromsExistInQuery()
}
}
}
bool NewChromSweep::testLexicoQueryAfterDb(const Record *queryRec, const Record *dbRec)
{
if (_lexicoDisproven) return false;
bool queryGreater = queryRec->getChrName() > dbRec->getChrName();
if (!_lexicoAssumed && queryGreater) {
_lexicoAssumed = true;
_lexicoAssumedFileIdx = dbRec->getFileIdx();
_lexicoAssumedChromName = dbRec->getChrName();
}
return queryGreater;
}
......@@ -109,19 +109,25 @@ protected:
//
// members and methods for detectnig differently
// members and methods for detecting differently
// sorted files without a genome file.
//
typedef map<QuickString, int> _orderTrackType;
vector<_orderTrackType *> _fileTracks;
map<int, QuickString> _filePrevChrom;
bool _lexicoDisproven; //whether we've established that any file ISN'T in lexicographical order
bool _lexicoAssumed; //whether we've had to try to guess that any file might be in lexicographical order.
QuickString _lexicoAssumedChromName; //which chromosome we had to make that guess for. Used in error reporting.
int _lexicoAssumedFileIdx; //which file we had to make the guess for. Also for error reporting.
void testChromOrder(const Record *rec);
bool queryChromAfterDbRec(const Record *dbRec);
int findChromOrder(const Record *rec);
bool verifyChromOrderMismatch(const QuickString & chrom, const QuickString &prevChrom, int skipFile);
void testThatAllDbChromsExistInQuery();
bool testLexicoQueryAfterDb(const Record *queryRec, const Record *dbRec);
};
......
......@@ -156,7 +156,10 @@ void RecordOutputMgr::printClosest(RecordKeyVector &keyList, const vector<int> *
printKey(hitRec, hitRec->getStartPosStr(), hitRec->getEndPosStr());
if (dists != NULL) {
tab();
_outBuf.append((*dists)[distCount]);
int dist = (*dists)[distCount];
//if not using sign distance, use absolute value instead.
dist = context->signDistance() ? dist : abs(dist);
_outBuf.append(dist);
distCount++;
}
newline();
......
chr1 10 20
chr1 130 140
\ No newline at end of file
chr1 80 90
chr1 200 210
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment