Commit 897d1a85 authored by Neil Kindlon's avatar Neil Kindlon
Browse files

first check-in for new closest tool

parent 34f0dd71
......@@ -87,6 +87,11 @@ void closest_help(void) {
cerr << "\t\t- \"first\" Report the first tie that occurred in the B file." << endl;
cerr << "\t\t- \"last\" Report the last tie that occurred in the B file." << endl << endl;
cerr << "\t-mdb\t" << "How multiple databases are resolved." << endl;
cerr << "\t\t- \"each\" Report closest records for each database (default)." << endl;
cerr << "\t\t- \"all\" Report closest records among all databases." << endl << endl;
cerr << "\t-N\t" << "Require that the query and the closest hit have different names." << endl;
cerr << "\t\tFor BED, the 4th column is compared." << endl << endl;
......
......@@ -17,7 +17,8 @@ ContextClosest::ContextClosest()
_haveStrandedDistMode(false),
_diffNames(false),
_tieMode(ALL_TIES),
_strandedDistMode(REF_DIST)
_strandedDistMode(REF_DIST),
_multiDbMode(EACH_DB)
{
}
......@@ -69,6 +70,10 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-t") == 0) {
if (!handle_t()) return false;
}
else if (strcmp(_argv[_i], "-mdb") == 0) {
if (!handle_mdb()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
}
......@@ -188,3 +193,28 @@ bool ContextClosest::handle_t()
_errorMsg = "*****ERROR: Request \"all\", \"first\", \"last\" for Tie Mode (-t)";
return false;
}
bool ContextClosest::handle_mdb()
{
bool mdbError = false;
if ((_i+1) < _argc) {
QuickString mdbStr(_argv[_i+1]);
if (mdbStr == "each") {
_multiDbMode = EACH_DB;
} else if (mdbStr == "all") {
_multiDbMode = ALL_DBS;
} else {
mdbError = true;
}
} else {
mdbError = true;
}
if (!mdbError) {
markUsed(_i - _skipFirstArgs);
_i++;
markUsed(_i - _skipFirstArgs);
return true;
}
_errorMsg = "*****ERROR: Request \"each\" or \"last\" for Multiple Database Mode (-mdb)";
return false;
}
......@@ -34,6 +34,8 @@ public:
typedef enum { REF_DIST, A_DIST, B_DIST} strandedDistanceModeType;
strandedDistanceModeType getStrandedDistMode() const { return _strandedDistMode; }
typedef enum { EACH_DB, ALL_DBS } multiDbModeType;
multiDbModeType getMultiDbMode() const { return _multiDbMode; }
private:
bool _haveTieMode;
......@@ -46,6 +48,7 @@ private:
bool _diffNames;
tieModeType _tieMode;
strandedDistanceModeType _strandedDistMode;
multiDbModeType _multiDbMode;
bool handle_d();
bool handle_D();
......@@ -54,6 +57,8 @@ private:
bool handle_id();
bool handle_N();
bool handle_t();
bool handle_mdb();
};
......
......@@ -10,19 +10,22 @@
CloseSweep::CloseSweep(ContextClosest *context)
: NewChromSweep(context),
_context(context) {
_context(context)
{
_minUpstreamRecs.resize(_numDBs, NULL);
_minDownstreamRecs.resize(_numDBs, NULL);
_overlapRecs.resize(_numDBs, NULL);
_minUpstreamDist.resize(_numDBs, INT_MAX);
_minDownstreamDist.resize(_numDBs,INT_MAX);
_maxPrevLeftClosestEndPos.resize(_numDBs, 0 );
for (int i=0; i < _numDBs; i++) {
_minUpstreamRecs[i] = new distRecVecType();
_minDownstreamRecs[i] = new distRecVecType();
_overlapRecs[i] = new vector<const Record *>();
}
}
CloseSweep::~CloseSweep(void) {
......@@ -40,7 +43,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
}
//initialize distances
for (size_t i=0; i < _numDBs; i++) {
for (int i=0; i < _numDBs; i++) {
_minUpstreamDist[i] = INT_MAX;
_minDownstreamDist[i] = INT_MAX;
}
......@@ -57,7 +60,8 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
// scan the database cache for hits
scanCache(i, retList);
//skip if we hit the end of the DB
// skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary
bool stopScanning = false;
while (_currDbRecs[i] != NULL &&
......@@ -75,6 +79,7 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
}
finalizeSelections(i, retList);
}
checkMultiDbs(retList);
}
void CloseSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
......@@ -123,6 +128,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
// HIT INTERSECTS QUERY
if (!_context->ignoreOverlaps()) {
_overlapRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
}
return IGNORE;
} else if (cacheRec->after(_currQueryRec)) {
......@@ -177,6 +183,9 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
} else if (_currQueryRec->after(cacheRec)){
// HIT IS TO THE LEFT OF THE QUERY.
// First see if we can purge this record from the cache. If it's further left than the last record that was left, delete it.
if (cacheRec->getEndPos() < _maxPrevLeftClosestEndPos[dbIdx]) return DELETE;
currDist = (_currQueryRec->getStartPos() - cacheRec->getEndPos()) + 1;
if (_context->signDistance()) {
if ((_context->getStrandedDistMode() != ContextClosest::REF_DIST) &&
......@@ -198,6 +207,7 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
_minDownstreamRecs[dbIdx]->clear();
}
_minDownstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE;
}
}
......@@ -212,9 +222,12 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
_minUpstreamRecs[dbIdx]->clear();
}
_minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE;
} else if (currDist == _minUpstreamDist[dbIdx]) {
_minUpstreamRecs[dbIdx]->push_back(cacheRec);
_maxPrevLeftClosestEndPos[dbIdx] = cacheRec->getEndPos();
return IGNORE;
} else {
return DELETE;
......@@ -304,18 +317,76 @@ void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
}
return;
}
}
void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
ContextClosest::tieModeType tieMode = _context->getTieMode();
if (_context->getMultiDbMode() == ContextClosest::ALL_DBS && _numDBs > 1) {
_copyDists.clear();
_copyRetList.clearAll();
_copyRetList.setKey(retList.getKey());
//loop through retList, find min dist
int minDist = INT_MAX;
int i = 0;
for (; i < (int)_finalDistances.size(); i++) {
if (_finalDistances[i] < minDist) {
minDist = _finalDistances[i];
}
}
i=0;
for (RecordKeyVector::const_iterator_type iter = retList.begin(); iter != retList.end(); iter++) {
int dist = _finalDistances[i];
if (dist == minDist || dist * -1 == minDist) {
_copyDists.push_back(dist);
_copyRetList.push_back(*iter);
}
i++;
}
retList.clearVector();
_finalDistances.clear();
if (_copyRetList.empty()) return;
if (tieMode == ContextClosest::FIRST_TIE) {
retList.push_back(*(_copyRetList.begin()));
_finalDistances.push_back(_copyDists[0]);
} else if (tieMode == ContextClosest::LAST_TIE) {
retList.push_back(*(_copyRetList.begin() + _copyRetList.size() -1));
_finalDistances.push_back(_copyDists[_copyDists.size()-1]);
} else {
retList = _copyRetList;
_finalDistances = _copyDists;
}
}
}
bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
{
// if (_currQueryRec->getChrName() == "chr1_gl000191_random" || (_currDbRecs[dbIdx] != NULL && _currDbRecs[dbIdx]->getChrName() == "chr1_gl000191_random")) {
// printf("Break point here.\n");
// }
// the files are on the same chrom
if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
//if this is the first time the query's chrom is ahead of the chrom that was in this cache,
//then we have to clear the cache.
if (!_caches[dbIdx].empty() && _currQueryRec->chromAfter(_caches[dbIdx].begin()->value())) {
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
}
return false;
}
if (_currDbRecs[dbIdx]->chromAfter(_currQueryRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) {
//the newest DB record's chrom is ahead of the query, but the cache still
//has records on that chrom
return false;
//has old records on that query's chrom
scanCache(dbIdx, retList);
finalizeSelections(dbIdx, retList);
return true;
}
// the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
......@@ -326,6 +397,7 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
nextRecord(false, dbIdx);
}
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
return false;
}
// the database is ahead of the query.
......@@ -339,4 +411,3 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
//control can't reach here, but compiler still wants a return statement.
return true;
}
......@@ -28,9 +28,15 @@ private:
vector<distRecVecType *> _minDownstreamRecs;
vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _finalDistances;
//structs to help with finding closest among all of multiple dbs.
RecordKeyVector _copyRetList;
vector<int> _copyDists;
//override these two methods from chromsweep
void masterScan(RecordKeyVector &retList);
void scanCache(int dbIdx, RecordKeyVector &retList);
......@@ -40,8 +46,7 @@ private:
typedef enum { IGNORE, DELETE } rateOvlpType;
rateOvlpType considerRecord(const Record *cacheRec, int dbIdx, bool &stopScanning);
void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
};
......
chr1 5 15
chr1 20 60
chr1 200 220
chr1 15 35
chr1 120 170
chr1 210 230
......@@ -152,18 +152,66 @@ $BT closest -a close-a.bed -b close-b.bed -t last > obs
check obs exp
rm obs exp
###########################################################
#
# TEST MULTIPLE DATABASES
#
###########################################################
# test reproting of no overlapping feature
###########################################################
echo " closest.t12...\c"
# test 3 dbs, -each mode, which is the default
###########################################################
echo " closest.t13...\c"
echo \
"chr1 100 101 chr1 150 201
chr1 200 201 chr1 100 101
chr1 300 301 chr1 150 201
chr1 100000 100010 chr1 175 375
chr1 100020 100040 chr1 175 375
chr2 1 10 . -1 -1
chr2 20 30 . -1 -1" > exp
$BT closest -a close-a.bed -b close-b.bed -io > obs
"chr1 80 100 chr1 20 60
chr1 80 100 chr1 120 170
chr1 80 100 chr1 70 90" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed mdb3.bed > obs
check obs exp
rm obs exp
###########################################################
# test 3 dbs, -all mode
###########################################################
echo " closest.t13...\c"
echo \
"chr1 80 100 chr1 70 90" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed mdb3.bed -mdb all > obs
check obs exp
rm obs exp
###########################################################
# test 2 dbs, tie mode = all
###########################################################
echo " closest.t14...\c"
echo \
"chr1 80 100 chr1 20 60
chr1 80 100 chr1 120 170" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -t all > obs
check obs exp
rm obs exp
###########################################################
# test 2 dbs, tie mode = first
###########################################################
echo " closest.t15...\c"
echo \
"chr1 80 100 chr1 20 60" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -mdb all -t first > obs
check obs exp
rm obs exp
###########################################################
# test 2 dbs, tie mode = last
###########################################################
echo " closest.t16...\c"
echo \
"chr1 80 100 chr1 120 170" > exp
$BT closest -a mq1.bed -b mdb1.bed mdb2.bed -mdb all -t last > obs
check obs exp
rm obs exp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment