Commit 24e97ccb authored by Neil Kindlon's avatar Neil Kindlon
Browse files

K closest finally working; chrom name conventions test change

parent f9fbaac7
......@@ -82,6 +82,9 @@ void map_help(void) {
cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Both input files must be sorted by chrom, then start." << endl << endl;
......
......@@ -69,6 +69,8 @@ void merge_help(void) {
cerr << "\t-c\t" << "Specify columns from the input file to operate upon (see -o option, below)." << endl;
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
......
......@@ -9,6 +9,7 @@
#include <sstream>
#include <iomanip>
//#include "FormatGuess.h"
#include <map>
#include "PushBackStreamBuf.h"
#include "InflateStreamBuf.h"
......@@ -29,14 +30,29 @@ int nek_sandbox1_main2(int argc,char** argv);
int nek_sandbox1_main(int argc,char** argv)
{
multimap<int, char> myMap;
myMap.insert(pair<int, char>(1, 'a'));
myMap.insert(pair<int, char>(2, 'b'));
myMap.insert(pair<int, char>(3, 'c'));
myMap.insert(pair<int, char>(3, 'x'));
myMap.insert(pair<int, char>(3, 'y'));
myMap.insert(pair<int, char>(3, 'z'));
myMap.insert(pair<int, char>(4, 'd'));
cout << "Multimap contains:" << endl;
for (multimap<int, char>::iterator iter = myMap.begin(); iter != myMap.end(); iter++) {
cout << iter-> first << ":" << iter->second << endl;
}
cout << "Size is " << myMap.size() << "." << endl;
// for (int i=0; i < 4000; i++) {
// cout << "# This is line " << i << " of a file with a large header." << endl;
// }
// return 0;
if (argc < 2) {
cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
}
// if (argc < 2) {
// cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
// }
// ifstream inFileStream(argv[1]);
// static const int BUF_SIZE = 8192;
// BamTools::Internal::BgzfStream bgStream;
......
......@@ -52,6 +52,7 @@ ContextBase::ContextBase()
_seed(0),
_forwardOnly(false),
_reverseOnly(false),
_nameCheckDisabled(false),
_hasColumnOpsMethods(false),
_keyListOps(NULL),
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
......@@ -197,6 +198,9 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-sortout") == 0) {
if (!handle_sortout()) return false;
}
else if (strcmp(_argv[_i], "-nonamecheck") == 0) {
if (!handle_sortout()) return false;
}
}
return true;
......@@ -534,6 +538,13 @@ bool ContextBase::handle_sortout()
return true;
}
bool ContextBase::handle_nonamecheck()
{
setNameCheckDisabled(true);
markUsed(_i - _skipFirstArgs);
return true;
}
void ContextBase::setColumnOpsMethods(bool val)
{
if (val && !_hasColumnOpsMethods) {
......@@ -603,6 +614,8 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
void ContextBase::testNameConventions(const Record *record) {
if (getNameCheckDisabled()) return;
int fileIdx = record->getFileIdx();
//
......@@ -635,7 +648,7 @@ void ContextBase::testNameConventions(const Record *record) {
//
bool zeroVal = record->hasLeadingZeroInChromName();
bool zeroVal = record->hasLeadingZeroInChromName(hasChr);
testChrVal = fileHasLeadingZeroInChromNames(fileIdx);
if (testChrVal == UNTESTED) {
_fileHasLeadingZeroInChromNames[fileIdx] = zeroVal ? YES : NO;
......
......@@ -116,6 +116,8 @@ public:
virtual bool getUseFullBamTags() const { return _useFullBamTags; }
virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; }
virtual bool getNameCheckDisabled() const { return _nameCheckDisabled; }
virtual void setNameCheckDisabled(bool val) { _nameCheckDisabled = val; }
// METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER
// OF OUTPUT RECORDS.
......@@ -207,6 +209,7 @@ protected:
int _seed;
bool _forwardOnly;
bool _reverseOnly;
bool _nameCheckDisabled;
//Members for column operations
bool _hasColumnOpsMethods;
......@@ -265,6 +268,7 @@ protected:
virtual bool handle_null();
virtual bool handle_delim();
virtual bool handle_sortout();
virtual bool handle_nonamecheck();
bool handle_prec();
bool parseIoBufSize(QuickString bufStr);
......
......@@ -18,7 +18,8 @@ ContextClosest::ContextClosest()
_diffNames(false),
_tieMode(ALL_TIES),
_strandedDistMode(REF_DIST),
_multiDbMode(EACH_DB)
_multiDbMode(EACH_DB),
_numClosestHitsWanted(1)
{
// closest requires sorted input
setSortedInput(true);
......@@ -75,6 +76,9 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-mdb") == 0) {
if (!handle_mdb()) return false;
}
else if (strcmp(_argv[_i], "-k") == 0) {
if (!handle_k()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
......@@ -220,3 +224,21 @@ bool ContextClosest::handle_mdb()
_errorMsg = "*****ERROR: Request \"each\" or \"last\" for Multiple Database Mode (-mdb)";
return false;
}
bool ContextClosest::handle_k()
{
if ((_i+1) < _argc) {
if (isNumeric(_argv[_i+1])) {
_numClosestHitsWanted = atoi(_argv[_i+1]);
if (_numClosestHitsWanted > 0) {
markUsed(_i - _skipFirstArgs);
_i++;
markUsed(_i - _skipFirstArgs);
return true;
}
}
}
_errorMsg = "\n***** ERROR: -k option must be followed by a postive integer value *****";
return false;
}
......@@ -27,6 +27,7 @@ public:
bool signDistance() const { return _signDistance; }
bool hasStrandedDistMode() const { return _haveStrandedDistMode; }
bool diffNames() const { return _diffNames; }
int getNumClosestHitsWanted() const { return _numClosestHitsWanted; }
typedef enum { FIRST_TIE, LAST_TIE, ALL_TIES} tieModeType;
tieModeType getTieMode() const { return _tieMode; }
......@@ -49,6 +50,7 @@ private:
tieModeType _tieMode;
strandedDistanceModeType _strandedDistMode;
multiDbModeType _multiDbMode;
int _numClosestHitsWanted;
bool handle_d();
bool handle_D();
......@@ -58,7 +60,7 @@ private:
bool handle_N();
bool handle_t();
bool handle_mdb();
bool handle_k();
};
......
......@@ -258,17 +258,11 @@ bool Record::hasChrInChromName() const {
(str[2] == 'r' || str[2] == 'R'));
}
bool Record::hasLeadingZeroInChromName() const {
bool Record::hasLeadingZeroInChromName(bool chrKnown) const {
const char *str = _chrName.c_str();
size_t i=0;
while (i < _chrName.size() && !isdigit(str[i])) {
if (str[i] == '_') return false; //ignore
//leading zero after underscore, as some of the
//random and hap chroms are names that way.
i++;
}
return (i < _chrName.size() && str[i] == '0');
// only check to see if the digit zero follows the occurance of
// "chr", case insensitive.
return (_chrName.size() >= 4 && str[3] == '0' && (chrKnown || hasChrInChromName()));
}
void Record::print(FILE *fp, bool newline) const {
......
......@@ -137,7 +137,7 @@ public:
// virtual static bool isNumericField(int fieldNum) const = 0;
bool hasChrInChromName() const;
bool hasLeadingZeroInChromName() const;
bool hasLeadingZeroInChromName(bool chrKnown = false) const;
protected:
......
This diff is collapsed.
......@@ -9,9 +9,65 @@
#define CLOSESWEEP_H_
#include "NewChromsweep.h"
#include <list>
#include <set>
class ContextClosest;
class distanceTuple {
public:
distanceTuple() : _dist(0), _rec(NULL), _isNeg(false) {}
distanceTuple(int dist, const Record *rec, bool isNeg = false) : _dist(dist), _rec(rec), _isNeg(isNeg) {}
// bool operator < (const distanceTuple & other) const { return (_dist < other._dist); }
int _dist;
const Record *_rec;
bool _isNeg;
};
class DistanceTupleSortAscFunctor {
public:
bool operator()(const distanceTuple & d1, const distanceTuple & d2) const {
return ((d1._dist < d2._dist) ? true : (d1._dist == d2._dist ? (d1._isNeg && !d2._isNeg) : false)) ; }
};
class RecDistList {
public:
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
RecDistList(int maxSize) : _maxUniqueAllowed(maxSize) {}
bool empty() const { return _recs.empty(); }
void clear();
int uniqueSize() const { return _recs.size(); }
size_t totalSize() const { return _totalRecs; }
bool addRec(int dist, const Record *, chromDirType chromDir);
bool exists(int dist) const { return (_recs.find(dist) != _recs.end()); }
int furtherestDistance() const { return _recs.rbegin()->first; }
typedef vector<pair<chromDirType, const Record *> >elemsType;
typedef map<int, elemsType *> distRecsType;
typedef distRecsType::iterator iterType;
typedef distRecsType::reverse_iterator revIterType;
typedef distRecsType::const_iterator constIterType;
typedef distRecsType::const_reverse_iterator constRevIterType;
int getMaxDist() const { return _recs.empty() ? 0 : _recs.rbegin()->first; }
constIterType begin() const { return _recs.begin(); }
constIterType end() const { return _recs.end(); }
int currDist(constIterType iter) const { return iter->first; }
size_t currNumElems(constIterType iter) const { return iter->second->size(); }
const Record *firstElem(constIterType iter) const { return (iter->second->at(0)).second; }
const Record *lastElem(constIterType iter) const { return (iter->second->at(iter->second->size()-1)).second; }
const elemsType *allElems(constIterType iter) const { return iter->second; }
int getMaxLeftEndPos() const;
private:
void insert(int dist, const Record *, chromDirType chromDir);
distRecsType _recs;
int _maxUniqueAllowed;
int _totalRecs;
};
class CloseSweep : public NewChromSweep {
public:
CloseSweep(ContextClosest *context);
......@@ -21,13 +77,10 @@ public:
private:
ContextClosest *_context;
typedef vector<const Record * > distRecVecType;
vector<distRecVecType *> _minUpstreamRecs;
vector<int> _minUpstreamDist;
vector<distRecVecType *> _minDownstreamRecs;
vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs;
int _kClosest; // how many closest hits we want to each query.
vector<RecDistList *> _minUpstreamRecs;
vector<RecDistList *> _minDownstreamRecs;
vector<RecDistList *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _maxPrevLeftClosestEndPosReverse;
......@@ -48,9 +101,18 @@ private:
void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
void setLeftClosestEndPos(int dbIdx, const Record *rec);
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
typedef enum { UPSTREAM, INTERSECT, DOWNSTREAM } streamDirType;
void setLeftClosestEndPos(int dbIdx);
bool beforeLeftClosestEndPos(int dbIdx, const Record *rec);
void clearClosestEndPos(int dbIdx);
bool canStopScan(const Record *cacheRec, bool ignored, streamDirType streamDir);
int addRecsToRetList(const RecDistList::elemsType *recs, int currDist, RecordKeyVector &retList);
void addSingleRec(const Record *rec, int currDist, int &hitsUsed, RecordKeyVector &retList);
rateOvlpType tryToAddRecord(const Record *cacheRec, int dist, int dbIdx, bool &stopScanning, chromDirType chromDir, streamDirType streamDir);
bool purgePointException(int dbIdx);
};
......
......@@ -129,7 +129,7 @@ void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
if (_currQueryRec->sameChrom(cacheRec) && !_currQueryRec->after(cacheRec)) {
if (intersects(_currQueryRec, cacheRec)) {
retList.push_back(cacheRec);
} else break; // cacheRec is after the query rec, stop scanning.
} else if (cacheRec->after(_currQueryRec)) break; // cacheRec is after the query rec, stop scanning.
cacheIter = _caches[dbIdx].next();
}
else {
......@@ -150,6 +150,7 @@ void NewChromSweep::clearCache(int dbIdx)
}
void NewChromSweep::masterScan(RecordKeyVector &retList) {
for (int i=0; i < _numDBs; i++) {
if (dbFinished(i) || chromChange(i, retList, true)) {
continue;
......
chr1 10 20 a1 1 -
chr1 8 9 b1 1 +
chr1 21 22 b2 1 -
chr1 40 60
chr1 120 140
chr1 10 15
chr1 25 28
chr1 30 40
chr1 35 40
chr1 38 40
chr1 50 60
chr1 70 80
chr1 95 105 2.1 20 +
chr1 98 108 2.2 20 +
chr1 105 115 2.3 20 +
chr1 120 130 2.4 20 +
chr1 140 160 2.5 20 +
chr1 140 155 2.6 20 +
chr1 140 150 2.7 20 +
chr1 170 180 2.8 20 +
chr1 190 200 2.9 20 +
chr1 10 20 3.1 20 -
chr1 30 40 3.2 20 +
chr1 45 60 3.3 20 -
chr1 50 60 3.4 20 -
chr1 55 60 3.5 20 -
chr1 70 80 3.6 20 +
chr1 75 80 3.65 20 +
chr1 90 105 3.7 20 +
chr1 95 110 3.8 20 -
chr1 110 115 3.9 20 +
chr1 120 130 3.10 20 -
chr1 130 140 3.11 20 +
chr1 135 140 3.115 20 +
chr1 150 165 3.12 20 -
chr1 150 160 3.13 20 -
chr1 150 155 3.14 20 -
chr1 170 180 3.15 20 +
chr1 190 200 3.16 20 -
chr1 100 110 q2.1 20 +
chr1 105 110 q2.2 20 -
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment