Commit 59b7ab67 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #179 from nkindlon/master

k closest completed
parents 26892d44 ba3bb104
......@@ -74,10 +74,19 @@ void closest_help(void) {
cerr << "\t-iu\t" << "Ignore features in B that are upstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl << endl;
cerr << "\t-id\t" << "Ignore features in B that are downstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl << endl;
cerr << "\t-fu\t" << "Choose first from features in B that are upstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl << endl;
cerr << "\t-fd\t" << "Choose first from features in B that are downstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl << endl;
cerr << "\t-t\t" << "How ties for closest feature are handled. This occurs when two" << endl;
cerr << "\t\tfeatures in B have exactly the same \"closeness\" with A." << endl;
......@@ -91,11 +100,14 @@ void closest_help(void) {
cerr << "\t\t- \"each\" Report closest records for each database (default)." << endl;
cerr << "\t\t- \"all\" Report closest records among all databases." << endl << endl;
cerr << "\t-names\t" << "When using multiple databases (-b), provide an alias for each that" << endl;
cerr <<"\t\twill appear instead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-names\t" << "When using multiple databases (-b), provide an alias for each that" << endl;
cerr <<"\t\twill appear instead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-filenames" << "\tWhen using multiple databases (-b), show each complete filename" << endl;
cerr <<"\t\t\tinstead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-filenames" << "\tWhen using multiple databases (-b), show each complete filename" << endl;
cerr <<"\t\t\tinstead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-k\t" << "Report the k closest hits. Default is 1. If tieMode = \"all\", " << endl;
cerr << "\t\t- all ties will still be reported." << endl << endl;
cerr << "\t-N\t" << "Require that the query and the closest hit have different names." << endl;
cerr << "\t\tFor BED, the 4th column is compared." << endl << endl;
......
......@@ -125,10 +125,10 @@ void intersect_help(void) {
cerr <<"\t\tother software tools and scripts that need to process one" << endl;
cerr <<"\t\tline of bedtools output at a time." << endl << endl;
cerr << "\t-names\t" << "When using multiple databases (-b), provide an alias for each that" << endl;
cerr << "\t-names\t" << "When using multiple databases, provide an alias for each that" << endl;
cerr <<"\t\twill appear instead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-filenames" << "\tWhen using multiple databases (-b), show each complete filename" << endl;
cerr << "\t-filenames" << "\tWhen using multiple databases, show each complete filename" << endl;
cerr <<"\t\t\tinstead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-sortout\t" << "When using multiple databases, sort the output DB hits" << endl;
......
......@@ -82,6 +82,9 @@ void map_help(void) {
cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Both input files must be sorted by chrom, then start." << endl << endl;
......
......@@ -69,6 +69,8 @@ void merge_help(void) {
cerr << "\t-c\t" << "Specify columns from the input file to operate upon (see -o option, below)." << endl;
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
......
......@@ -9,6 +9,7 @@
#include <sstream>
#include <iomanip>
//#include "FormatGuess.h"
#include <map>
#include "PushBackStreamBuf.h"
#include "InflateStreamBuf.h"
......@@ -29,14 +30,29 @@ int nek_sandbox1_main2(int argc,char** argv);
int nek_sandbox1_main(int argc,char** argv)
{
multimap<int, char> myMap;
myMap.insert(pair<int, char>(1, 'a'));
myMap.insert(pair<int, char>(2, 'b'));
myMap.insert(pair<int, char>(3, 'c'));
myMap.insert(pair<int, char>(3, 'x'));
myMap.insert(pair<int, char>(3, 'y'));
myMap.insert(pair<int, char>(3, 'z'));
myMap.insert(pair<int, char>(4, 'd'));
cout << "Multimap contains:" << endl;
for (multimap<int, char>::iterator iter = myMap.begin(); iter != myMap.end(); iter++) {
cout << iter-> first << ":" << iter->second << endl;
}
cout << "Size is " << myMap.size() << "." << endl;
// for (int i=0; i < 4000; i++) {
// cout << "# This is line " << i << " of a file with a large header." << endl;
// }
// return 0;
if (argc < 2) {
cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
}
// if (argc < 2) {
// cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
// }
// ifstream inFileStream(argv[1]);
// static const int BUF_SIZE = 8192;
// BamTools::Internal::BgzfStream bgStream;
......
......@@ -52,6 +52,7 @@ ContextBase::ContextBase()
_seed(0),
_forwardOnly(false),
_reverseOnly(false),
_nameCheckDisabled(false),
_hasColumnOpsMethods(false),
_keyListOps(NULL),
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
......@@ -198,6 +199,9 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-sortout") == 0) {
if (!handle_sortout()) return false;
}
else if (strcmp(_argv[_i], "-nonamecheck") == 0) {
if (!handle_sortout()) return false;
}
}
return true;
......@@ -535,6 +539,13 @@ bool ContextBase::handle_sortout()
return true;
}
bool ContextBase::handle_nonamecheck()
{
setNameCheckDisabled(true);
markUsed(_i - _skipFirstArgs);
return true;
}
void ContextBase::setColumnOpsMethods(bool val)
{
if (val && !_hasColumnOpsMethods) {
......@@ -604,6 +615,8 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
void ContextBase::testNameConventions(const Record *record) {
if (getNameCheckDisabled()) return;
int fileIdx = record->getFileIdx();
//
......@@ -636,7 +649,7 @@ void ContextBase::testNameConventions(const Record *record) {
//
bool zeroVal = record->hasLeadingZeroInChromName();
bool zeroVal = record->hasLeadingZeroInChromName(hasChr);
testChrVal = fileHasLeadingZeroInChromNames(fileIdx);
if (testChrVal == UNTESTED) {
_fileHasLeadingZeroInChromNames[fileIdx] = zeroVal ? YES : NO;
......
......@@ -116,6 +116,8 @@ public:
virtual bool getUseFullBamTags() const { return _useFullBamTags; }
virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; }
virtual bool getNameCheckDisabled() const { return _nameCheckDisabled; }
virtual void setNameCheckDisabled(bool val) { _nameCheckDisabled = val; }
// METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER
// OF OUTPUT RECORDS.
......@@ -207,6 +209,7 @@ protected:
int _seed;
bool _forwardOnly;
bool _reverseOnly;
bool _nameCheckDisabled;
//Members for column operations
bool _hasColumnOpsMethods;
......@@ -265,6 +268,7 @@ protected:
virtual bool handle_null();
virtual bool handle_delim();
virtual bool handle_sortout();
virtual bool handle_nonamecheck();
bool handle_prec();
bool parseIoBufSize(QuickString bufStr);
......
......@@ -18,7 +18,8 @@ ContextClosest::ContextClosest()
_diffNames(false),
_tieMode(ALL_TIES),
_strandedDistMode(REF_DIST),
_multiDbMode(EACH_DB)
_multiDbMode(EACH_DB),
_numClosestHitsWanted(1)
{
// closest requires sorted input
setSortedInput(true);
......@@ -66,7 +67,13 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-id") == 0) {
if (!handle_id()) return false;
}
else if (strcmp(_argv[_i], "-N") == 0) {
else if (strcmp(_argv[_i], "-fu") == 0) {
if (!handle_fu()) return false;
}
else if (strcmp(_argv[_i], "-fd") == 0) {
if (!handle_fd()) return false;
}
else if (strcmp(_argv[_i], "-N") == 0) {
if (!handle_N()) return false;
}
else if (strcmp(_argv[_i], "-t") == 0) {
......@@ -75,6 +82,9 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-mdb") == 0) {
if (!handle_mdb()) return false;
}
else if (strcmp(_argv[_i], "-k") == 0) {
if (!handle_k()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
......@@ -106,6 +116,25 @@ bool ContextClosest::isValidState(){
return false;
}
if ((_forceUpstream || _forceDownstream) && ! _haveStrandedDistMode) {
_errorMsg = "\n*****\n*****ERROR: When requesting -fu or -fd, you also need to specify -D.\n*****\n";
return false;
}
if (_ignoreUpstream && _forceUpstream) {
_errorMsg = "\n*****\n*****ERROR: Can't both ignore upstream and force upstream.\n*****\n";
return false;
}
if (_ignoreDownstream && _forceDownstream) {
_errorMsg = "\n*****\n*****ERROR: Can't both ignore downstream and force downstream.\n*****\n";
return false;
}
if (_sortOutput && _reportDistance) {
_errorMsg = "\n*****\n*****ERROR: -sortout (sorted output) is not valid with distance reporting.\n*****\n";
return false;
}
return true;
}
......@@ -162,6 +191,18 @@ bool ContextClosest::handle_id() {
return true;
}
bool ContextClosest::handle_fu() {
_forceUpstream = true;
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextClosest::handle_fd() {
_forceDownstream = true;
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextClosest::handle_N() {
_diffNames = true;
markUsed(_i - _skipFirstArgs);
......@@ -220,3 +261,21 @@ bool ContextClosest::handle_mdb()
_errorMsg = "*****ERROR: Request \"each\" or \"last\" for Multiple Database Mode (-mdb)";
return false;
}
bool ContextClosest::handle_k()
{
if ((_i+1) < _argc) {
if (isNumeric(_argv[_i+1])) {
_numClosestHitsWanted = atoi(_argv[_i+1]);
if (_numClosestHitsWanted > 0) {
markUsed(_i - _skipFirstArgs);
_i++;
markUsed(_i - _skipFirstArgs);
return true;
}
}
}
_errorMsg = "\n***** ERROR: -k option must be followed by a postive integer value *****";
return false;
}
......@@ -23,10 +23,13 @@ public:
bool ignoreOverlaps() const { return _ignoreOverlaps; }
bool ignoreUpstream() const { return _ignoreUpstream; }
bool ignoreDownstream() const { return _ignoreDownstream; }
bool forceUpstream() const { return _forceUpstream; }
bool forceDownstream() const { return _forceDownstream; }
bool reportDistance() const { return _reportDistance; }
bool signDistance() const { return _signDistance; }
bool hasStrandedDistMode() const { return _haveStrandedDistMode; }
bool diffNames() const { return _diffNames; }
int getNumClosestHitsWanted() const { return _numClosestHitsWanted; }
typedef enum { FIRST_TIE, LAST_TIE, ALL_TIES} tieModeType;
tieModeType getTieMode() const { return _tieMode; }
......@@ -42,6 +45,8 @@ private:
bool _ignoreOverlaps;
bool _ignoreUpstream;
bool _ignoreDownstream;
bool _forceUpstream;
bool _forceDownstream;
bool _reportDistance;
bool _signDistance;
bool _haveStrandedDistMode;
......@@ -49,16 +54,19 @@ private:
tieModeType _tieMode;
strandedDistanceModeType _strandedDistMode;
multiDbModeType _multiDbMode;
int _numClosestHitsWanted;
bool handle_d();
bool handle_D();
bool handle_io();
bool handle_iu();
bool handle_id();
bool handle_fu();
bool handle_fd();
bool handle_N();
bool handle_t();
bool handle_mdb();
bool handle_k();
};
......
......@@ -258,17 +258,11 @@ bool Record::hasChrInChromName() const {
(str[2] == 'r' || str[2] == 'R'));
}
bool Record::hasLeadingZeroInChromName() const {
bool Record::hasLeadingZeroInChromName(bool chrKnown) const {
const char *str = _chrName.c_str();
size_t i=0;
while (i < _chrName.size() && !isdigit(str[i])) {
if (str[i] == '_') return false; //ignore
//leading zero after underscore, as some of the
//random and hap chroms are names that way.
i++;
}
return (i < _chrName.size() && str[i] == '0');
// only check to see if the digit zero follows the occurance of
// "chr", case insensitive.
return (_chrName.size() >= 4 && str[3] == '0' && (chrKnown || hasChrInChromName()));
}
void Record::print(FILE *fp, bool newline) const {
......
......@@ -137,7 +137,7 @@ public:
// virtual static bool isNumericField(int fieldNum) const = 0;
bool hasChrInChromName() const;
bool hasLeadingZeroInChromName() const;
bool hasLeadingZeroInChromName(bool chrKnown = false) const;
protected:
......
This diff is collapsed.
......@@ -9,9 +9,74 @@
#define CLOSESWEEP_H_
#include "NewChromsweep.h"
#include <list>
#include <set>
class ContextClosest;
class distanceTuple {
public:
distanceTuple() : _dist(0), _rec(NULL), _isNeg(false) {}
distanceTuple(int dist, const Record *rec, bool isNeg = false) : _dist(dist), _rec(rec), _isNeg(isNeg) {}
// bool operator < (const distanceTuple & other) const { return (_dist < other._dist); }
int _dist;
const Record *_rec;
bool _isNeg;
};
class DistanceTupleSortAscFunctor {
public:
bool operator()(const distanceTuple & d1, const distanceTuple & d2) const {
return ((d1._dist < d2._dist) ? true : (d1._dist == d2._dist ? (d1._isNeg && !d2._isNeg) : false)) ; }
};
class RecDistList {
public:
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
RecDistList(int maxSize);
~RecDistList();
bool empty() const { return _empty; }
void clear();
int uniqueSize() const { return _currNumIdxs; }
size_t totalSize() const { return _totalRecs; }
bool addRec(int dist, const Record *, chromDirType chromDir);
bool exists(int dist) const {
int dummyVal = 0;
return find(dist, dummyVal);
}
typedef pair<chromDirType, const Record *> elemPairType;
typedef vector<elemPairType>elemsType;
typedef pair<int, int> indexType;
int getMaxDist() const { return _empty ? 0 : _distIndex[_currNumIdxs-1].first; }
typedef int constIterType; //used to be a map iter, trying not to change interface too much.
constIterType begin() const { return 0; }
constIterType end() const { return _currNumIdxs; }
int currDist(constIterType iter) const { return _distIndex[iter].first; }
size_t currNumElems(constIterType iter) const { return allElems(iter)->size(); }
const elemsType *allElems(constIterType iter) const { return _allRecs[_distIndex[iter].second]; }
int getMaxLeftEndPos() const;
private:
void insert(int dist, const Record *, chromDirType chromDir);
//if true, pos will be the idx the distance is at.
//if false, pos will be the idx to insert at.
bool find(int dist, int &pos) const;
int _kVal; //max unique allowed
bool _empty;
int _currNumIdxs;
int _totalRecs;
vector<elemsType *> _allRecs;
indexType * _distIndex;
};
class CloseSweep : public NewChromSweep {
public:
CloseSweep(ContextClosest *context);
......@@ -21,13 +86,10 @@ public:
private:
ContextClosest *_context;
typedef vector<const Record * > distRecVecType;
vector<distRecVecType *> _minUpstreamRecs;
vector<int> _minUpstreamDist;
vector<distRecVecType *> _minDownstreamRecs;
vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs;
int _kClosest; // how many closest hits we want to each query.
vector<RecDistList *> _minUpstreamRecs;
vector<RecDistList *> _minDownstreamRecs;
vector<RecDistList *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _maxPrevLeftClosestEndPosReverse;
......@@ -48,9 +110,18 @@ private:
void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
void setLeftClosestEndPos(int dbIdx, const Record *rec);
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
typedef enum { UPSTREAM, INTERSECT, DOWNSTREAM } streamDirType;
void setLeftClosestEndPos(int dbIdx);
bool beforeLeftClosestEndPos(int dbIdx, const Record *rec);
void clearClosestEndPos(int dbIdx);
bool canStopScan(const Record *cacheRec, bool ignored, streamDirType streamDir);
int addRecsToRetList(const RecDistList::elemsType *recs, int currDist, RecordKeyVector &retList);
void addSingleRec(const Record *rec, int currDist, int &hitsUsed, RecordKeyVector &retList);
rateOvlpType tryToAddRecord(const Record *cacheRec, int dist, int dbIdx, bool &stopScanning, chromDirType chromDir, streamDirType streamDir);
bool purgePointException(int dbIdx);
};
......
chr1 10 20 a1 1 -
chr1 8 9 b1 1 +
chr1 21 22 b2 1 -
chr1 40 60
chr1 120 140
chr1 10 15
chr1 25 28
chr1 30 40
chr1 35 40
chr1 38 40
chr1 50 60
chr1 70 80
chr1 95 105 2.1 20 +
chr1 98 108 2.2 20 +
chr1 105 115 2.3 20 +
chr1 120 130 2.4 20 +
chr1 140 160 2.5 20 +
chr1 140 155 2.6 20 +
chr1 140 150 2.7 20 +
chr1 170 180 2.8 20 +
chr1 190 200 2.9 20 +
chr1 10 20 3.1 20 -
chr1 30 40 3.2 20 +
chr1 45 60 3.3 20 -
chr1 50 60 3.4 20 -
chr1 55 60 3.5 20 -
chr1 70 80 3.6 20 +
chr1 75 80 3.65 20 +
chr1 90 105 3.7 20 +
chr1 95 110 3.8 20 -
chr1 110 115 3.9 20 +
chr1 120 130 3.10 20 -
chr1 130 140 3.11 20 +
chr1 135 140 3.115 20 +
chr1 150 165 3.12 20 -
chr1 150 160 3.13 20 -
chr1 150 155 3.14 20 -
chr1 170 180 3.15 20 +
chr1 190 200 3.16 20 -
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment