Commit 897d1a85 authored by Neil Kindlon's avatar Neil Kindlon
Browse files

first check-in for new closest tool

parent 34f0dd71
...@@ -87,6 +87,11 @@ void closest_help(void) { ...@@ -87,6 +87,11 @@ void closest_help(void) {
cerr << "\t\t- \"first\" Report the first tie that occurred in the B file." << endl; cerr << "\t\t- \"first\" Report the first tie that occurred in the B file." << endl;
cerr << "\t\t- \"last\" Report the last tie that occurred in the B file." << endl << endl; cerr << "\t\t- \"last\" Report the last tie that occurred in the B file." << endl << endl;
cerr << "\t-mdb\t" << "How multiple databases are resolved." << endl;
cerr << "\t\t- \"each\" Report closest records for each database (default)." << endl;
cerr << "\t\t- \"all\" Report closest records among all databases." << endl << endl;
cerr << "\t-N\t" << "Require that the query and the closest hit have different names." << endl; cerr << "\t-N\t" << "Require that the query and the closest hit have different names." << endl;
cerr << "\t\tFor BED, the 4th column is compared." << endl << endl; cerr << "\t\tFor BED, the 4th column is compared." << endl << endl;
......
...@@ -17,7 +17,8 @@ ContextClosest::ContextClosest() ...@@ -17,7 +17,8 @@ ContextClosest::ContextClosest()
_haveStrandedDistMode(false), _haveStrandedDistMode(false),
_diffNames(false), _diffNames(false),
_tieMode(ALL_TIES), _tieMode(ALL_TIES),
_strandedDistMode(REF_DIST) _strandedDistMode(REF_DIST),
_multiDbMode(EACH_DB)
{ {
} }
...@@ -69,6 +70,10 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){ ...@@ -69,6 +70,10 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-t") == 0) { else if (strcmp(_argv[_i], "-t") == 0) {
if (!handle_t()) return false; if (!handle_t()) return false;
} }
else if (strcmp(_argv[_i], "-mdb") == 0) {
if (!handle_mdb()) return false;
}
} }
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
} }
...@@ -188,3 +193,28 @@ bool ContextClosest::handle_t() ...@@ -188,3 +193,28 @@ bool ContextClosest::handle_t()
_errorMsg = "*****ERROR: Request \"all\", \"first\", \"last\" for Tie Mode (-t)"; _errorMsg = "*****ERROR: Request \"all\", \"first\", \"last\" for Tie Mode (-t)";
return false; return false;
} }
bool ContextClosest::handle_mdb()
{
bool mdbError = false;
if ((_i+1) < _argc) {
QuickString mdbStr(_argv[_i+1]);
if (mdbStr == "each") {
_multiDbMode = EACH_DB;
} else if (mdbStr == "all") {
_multiDbMode = ALL_DBS;
} else {
mdbError = true;
}
} else {
mdbError = true;
}
if (!mdbError) {
markUsed(_i - _skipFirstArgs);
_i++;
markUsed(_i - _skipFirstArgs);
return true;
}
_errorMsg = "*****ERROR: Request \"each\" or \"last\" for Multiple Database Mode (-mdb)";
return false;
}
...@@ -34,6 +34,8 @@ public: ...@@ -34,6 +34,8 @@ public:
typedef enum { REF_DIST, A_DIST, B_DIST} strandedDistanceModeType; typedef enum { REF_DIST, A_DIST, B_DIST} strandedDistanceModeType;
strandedDistanceModeType getStrandedDistMode() const { return _strandedDistMode; } strandedDistanceModeType getStrandedDistMode() const { return _strandedDistMode; }
typedef enum { EACH_DB, ALL_DBS } multiDbModeType;
multiDbModeType getMultiDbMode() const { return _multiDbMode; }
private: private:
bool _haveTieMode; bool _haveTieMode;
...@@ -46,6 +48,7 @@ private: ...@@ -46,6 +48,7 @@ private:
bool _diffNames; bool _diffNames;
tieModeType _tieMode; tieModeType _tieMode;
strandedDistanceModeType _strandedDistMode; strandedDistanceModeType _strandedDistMode;
multiDbModeType _multiDbMode;
bool handle_d(); bool handle_d();
bool handle_D(); bool handle_D();
...@@ -54,6 +57,8 @@ private: ...@@ -54,6 +57,8 @@ private:
bool handle_id(); bool handle_id();
bool handle_N(); bool handle_N();
bool handle_t(); bool handle_t();
bool handle_mdb();
}; };
......
...@@ -10,19 +10,22 @@ ...@@ -10,19 +10,22 @@
CloseSweep::CloseSweep(ContextClosest *context) CloseSweep::CloseSweep(ContextClosest *context)
: NewChromSweep(context), : NewChromSweep(context),
_context(context) { _context(context)
{
_minUpstreamRecs.resize(_numDBs, NULL); _minUpstreamRecs.resize(_numDBs, NULL);
_minDownstreamRecs.resize(_numDBs, NULL); _minDownstreamRecs.resize(_numDBs, NULL);
_overlapRecs.resize(_numDBs, NULL); _overlapRecs.resize(_numDBs, NULL);
_minUpstreamDist.resize(_numDBs, INT_MAX); _minUpstreamDist.resize(_numDBs, INT_MAX);
_minDownstreamDist.resize(_numDBs,INT_MAX); _minDownstreamDist.resize(_numDBs,INT_MAX);
_maxPrevLeftClosestEndPos.resize(_numDBs, 0 );
for (int i=0; i < _numDBs; i++) { for (int i=0; i < _numDBs; i++) {
_minUpstreamRecs[i] = new distRecVecType(); _minUpstreamRecs[i] = new distRecVecType();
_minDownstreamRecs[i] = new distRecVecType(); _minDownstreamRecs[i] = new distRecVecType();
_overlapRecs[i] = new vector<const Record *>(); _overlapRecs[i] = new vector<const Record *>();
} }
} }
CloseSweep::~CloseSweep(void) { CloseSweep::~CloseSweep(void) {
...@@ -40,7 +43,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) { ...@@ -40,7 +43,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
} }
//initialize distances //initialize distances
for (size_t i=0; i < _numDBs; i++) { for (int i=0; i < _numDBs; i++) {
_minUpstreamDist[i] = INT_MAX; _minUpstreamDist[i] = INT_MAX;
_minDownstreamDist[i] = INT_MAX; _minDownstreamDist[i] = INT_MAX;
} }
...@@ -57,7 +60,8 @@ void CloseSweep::masterScan(RecordKeyVector &retList) { ...@@ -57,7 +60,8 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
// scan the database cache for hits // scan the database cache for hits
scanCache(i, retList); scanCache(i, retList);
//skip if we hit the end of the DB
// skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary // advance the db until we are ahead of the query. update hits and cache as necessary
bool stopScanning = false; bool stopScanning = false;
while (_currDbRecs[i] != NULL && while (_currDbRecs[i] != NULL &&
...@@ -75,6 +79,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) { ...@@ -75,6 +79,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
} }
finalizeSelections(i, retList); finalizeSelections(i, retList);
} }
checkMultiDbs(retList);
} }
void CloseSweep::scanCache(int dbIdx, RecordKeyVector &retList) { void CloseSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
...@@ -123,6 +128,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int ...@@ -123,6 +128,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
// HIT INTERSECTS QUERY // HIT INTERSECTS QUERY
if (!_context->ignoreOverlaps()) { if (!_context->ignoreOverlaps()) {
_overlapRecs[dbIdx]->push_back(cacheRec); _overlapRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
} }
return IGNORE; return IGNORE;
} else if (cacheRec->after(_currQueryRec)) { } else if (cacheRec->after(_currQueryRec)) {
...@@ -177,6 +183,9 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int ...@@ -177,6 +183,9 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
} else if (_currQueryRec->after(cacheRec)){ } else if (_currQueryRec->after(cacheRec)){
// HIT IS TO THE LEFT OF THE QUERY. // HIT IS TO THE LEFT OF THE QUERY.
// First see if we can purge this record from the cache. If it's further left than the last record that was left, delete it.
if (cacheRec->getEndPos() < _maxPrevLeftClosestEndPos[dbIdx]) return DELETE;
currDist = (_currQueryRec->getStartPos() - cacheRec->getEndPos()) + 1; currDist = (_currQueryRec->getStartPos() - cacheRec->getEndPos()) + 1;
if (_context->signDistance()) { if (_context->signDistance()) {
if ((_context->getStrandedDistMode() != ContextClosest::REF_DIST) && if ((_context->getStrandedDistMode() != ContextClosest::REF_DIST) &&
...@@ -198,6 +207,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int ...@@ -198,6 +207,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
_minDownstreamRecs[dbIdx]->clear(); _minDownstreamRecs[dbIdx]->clear();
} }
_minDownstreamRecs[dbIdx]->push_back(cacheRec); _minDownstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE; return IGNORE;
} }
} }
...@@ -212,9 +222,12 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int ...@@ -212,9 +222,12 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
_minUpstreamRecs[dbIdx]->clear(); _minUpstreamRecs[dbIdx]->clear();
} }
_minUpstreamRecs[dbIdx]->push_back(cacheRec); _minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE; return IGNORE;
} else if (currDist == _minUpstreamDist[dbIdx]) { } else if (currDist == _minUpstreamDist[dbIdx]) {
_minUpstreamRecs[dbIdx]->push_back(cacheRec); _minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE; return IGNORE;
} else { } else {
return DELETE; return DELETE;
...@@ -304,18 +317,76 @@ void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) { ...@@ -304,18 +317,76 @@ void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
} }
return; return;
} }
}
void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
ContextClosest::tieModeType tieMode = _context->getTieMode();
if (_context->getMultiDbMode() == ContextClosest::ALL_DBS && _numDBs > 1) {
_copyDists.clear();
_copyRetList.clearAll();
_copyRetList.setKey(retList.getKey());
//loop through retList, find min dist
int minDist = INT_MAX;
int i = 0;
for (; i < (int)_finalDistances.size(); i++) {
if (_finalDistances[i] < minDist) {
minDist = _finalDistances[i];
}
}
i=0;
for (RecordKeyVector::const_iterator_type iter = retList.begin(); iter != retList.end(); iter++) {
int dist = _finalDistances[i];
if (dist == minDist || dist * -1 == minDist) {
_copyDists.push_back(dist);
_copyRetList.push_back(*iter);
}
i++;
}
retList.clearVector();
_finalDistances.clear();
if (_copyRetList.empty()) return;
if (tieMode == ContextClosest::FIRST_TIE) {
retList.push_back(*(_copyRetList.begin()));
_finalDistances.push_back(_copyDists[0]);
} else if (tieMode == ContextClosest::LAST_TIE) {
retList.push_back(*(_copyRetList.begin() + _copyRetList.size() -1));
_finalDistances.push_back(_copyDists[_copyDists.size()-1]);
} else {
retList = _copyRetList;
_finalDistances = _copyDists;
}
}
} }
bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList) bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
{ {
// if (_currQueryRec->getChrName() == "chr1_gl000191_random" || (_currDbRecs[dbIdx] != NULL && _currDbRecs[dbIdx]->getChrName() == "chr1_gl000191_random")) {
// printf("Break point here.\n");
// }
// the files are on the same chrom // the files are on the same chrom
if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) { if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
//if this is the first time the query's chrom is ahead of the chrom that was in this cache,
//then we have to clear the cache.
if (!_caches[dbIdx].empty() && _currQueryRec->chromAfter(_caches[dbIdx].begin()->value())) {
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
}
return false; return false;
} }
if (_currDbRecs[dbIdx]->chromAfter(_currQueryRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) { if (_currDbRecs[dbIdx]->chromAfter(_currQueryRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) {
//the newest DB record's chrom is ahead of the query, but the cache still //the newest DB record's chrom is ahead of the query, but the cache still
//has records on that chrom //has old records on that query's chrom
return false; scanCache(dbIdx, retList);
finalizeSelections(dbIdx, retList);
return true;
} }
// the query is ahead of the database. fast-forward the database to catch-up. // the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) { if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
...@@ -326,6 +397,7 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList) ...@@ -326,6 +397,7 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
nextRecord(false, dbIdx); nextRecord(false, dbIdx);
} }
clearCache(dbIdx); clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
return false; return false;
} }
// the database is ahead of the query. // the database is ahead of the query.
...@@ -339,4 +411,3 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList) ...@@ -339,4 +411,3 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
//control can't reach here, but compiler still wants a return statement. //control can't reach here, but compiler still wants a return statement.
return true; return true;
} }
...@@ -28,9 +28,15 @@ private: ...@@ -28,9 +28,15 @@ private:
vector<distRecVecType *> _minDownstreamRecs; vector<distRecVecType *> _minDownstreamRecs;
vector<int> _minDownstreamDist; vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs; vector<distRecVecType *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _finalDistances; vector<int> _finalDistances;
//structs to help with finding closest among all of multiple dbs.
RecordKeyVector _copyRetList;
vector<int> _copyDists;
//override these two methods from chromsweep //override these two methods from chromsweep
void masterScan(RecordKeyVector &retList); void masterScan(RecordKeyVector &retList);
void scanCache(int dbIdx, RecordKeyVector &retList); void scanCache(int dbIdx, RecordKeyVector &retList);
...@@ -40,8 +46,7 @@ private: ...@@ -40,8 +46,7 @@ private:
typedef enum { IGNORE, DELETE } rateOvlpType; typedef enum { IGNORE, DELETE } rateOvlpType;
rateOvlpType considerRecord(const Record *cacheRec, int dbIdx, bool &stopScanning); rateOvlpType considerRecord(const Record *cacheRec, int dbIdx, bool &stopScanning);
void finalizeSelections(int dbIdx, RecordKeyVector &retList); void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
}; };
......
chr1 5 15
chr1 20 60
chr1 200 220
chr1 15 35
chr1 120 170
chr1 210 230
...@@ -152,18 +152,66 @@ $BT closest -a close-a.bed -b close-b.bed -t last > obs ...@@ -152,18 +152,66 @@ $BT closest -a close-a.bed -b close-b.bed -t last > obs
check obs exp check obs exp
rm obs exp rm obs exp
###########################################################
#
# TEST MULTIPLE DATABASES
#
########################################################### ###########################################################
# test reproting of no overlapping feature
########################################################### ###########################################################
echo " closest.t12...\c" # test 3 dbs, -each mode, which is the default
###########################################################
echo " closest.t13...\c"
echo \ echo \
"chr1 100 101 chr1 150 201 "chr1 80 100 chr1 20 60
chr1 200 201 chr1 100 101 chr1 80 100 chr1 120 170
chr1 300 301 chr1 150 201 chr1 80 100 chr1 70 90" > exp
chr1 100000 100010 chr1 175 375 $BT closest -a mq1.bed -b mdb1.bed mdb2.bed mdb3.bed > obs
chr1 100020 100040 chr1 175 375 check obs exp
chr2 1 10 . -1 -1 rm obs exp
chr2 20 30 . -1 -1" > exp
$BT closest -a close-a.bed -b close-b.bed -io > obs ###########################################################
# test 3 dbs, -all mode
###########################################################
echo " closest.t13...\c"
echo \
"chr1 80 100 chr1 70 90" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed mdb3.bed -mdb all > obs
check obs exp
rm obs exp
###########################################################
# test 2 dbs, tie mode = all
###########################################################
echo " closest.t14...\c"
echo \
"chr1 80 100 chr1 20 60
chr1 80 100 chr1 120 170" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -t all > obs
check obs exp
rm obs exp
###########################################################
# test 2 dbs, tie mode = first
###########################################################
echo " closest.t15...\c"
echo \
"chr1 80 100 chr1 20 60" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -mdb all -t first > obs
check obs exp check obs exp
rm obs exp rm obs exp
###########################################################
# test 2 dbs, tie mode = last
###########################################################
echo " closest.t16...\c"
echo \
"chr1 80 100 chr1 120 170" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -mdb all -t last > obs
check obs exp
rm obs exp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment