Commit 29fe57fc authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Added closest -fu and -fd, updated help

parent f9fbaac7
......@@ -74,10 +74,19 @@ void closest_help(void) {
cerr << "\t-iu\t" << "Ignore features in B that are upstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl << endl;
cerr << "\t-id\t" << "Ignore features in B that are downstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl << endl;
cerr << "\t-fu\t" << "Choose first from features in B that are upstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"upstream\"." << endl << endl;
cerr << "\t-fd\t" << "Choose first from features in B that are downstream of features in A." << endl;
cerr << "\t\tThis option requires -D and follows its orientation" << endl;
cerr << "\t\trules for determining what is \"downstream\"." << endl << endl;
cerr << "\t-t\t" << "How ties for closest feature are handled. This occurs when two" << endl;
cerr << "\t\tfeatures in B have exactly the same \"closeness\" with A." << endl;
......@@ -92,6 +101,9 @@ void closest_help(void) {
cerr << "\t\t- \"all\" Report closest records among all databases." << endl << endl;
cerr << "\t-k\t" << "Report the k closest hits. Default is 1. If tieMode = \"all\", " << endl;
cerr << "\t\t- all ties will still be reported." << endl << endl;
cerr << "\t-N\t" << "Require that the query and the closest hit have different names." << endl;
cerr << "\t\tFor BED, the 4th column is compared." << endl << endl;
......
......@@ -82,6 +82,9 @@ void map_help(void) {
cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Both input files must be sorted by chrom, then start." << endl << endl;
......
......@@ -69,6 +69,8 @@ void merge_help(void) {
cerr << "\t-c\t" << "Specify columns from the input file to operate upon (see -o option, below)." << endl;
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
......
......@@ -9,6 +9,7 @@
#include <sstream>
#include <iomanip>
//#include "FormatGuess.h"
#include <map>
#include "PushBackStreamBuf.h"
#include "InflateStreamBuf.h"
......@@ -29,14 +30,29 @@ int nek_sandbox1_main2(int argc,char** argv);
int nek_sandbox1_main(int argc,char** argv)
{
multimap<int, char> myMap;
myMap.insert(pair<int, char>(1, 'a'));
myMap.insert(pair<int, char>(2, 'b'));
myMap.insert(pair<int, char>(3, 'c'));
myMap.insert(pair<int, char>(3, 'x'));
myMap.insert(pair<int, char>(3, 'y'));
myMap.insert(pair<int, char>(3, 'z'));
myMap.insert(pair<int, char>(4, 'd'));
cout << "Multimap contains:" << endl;
for (multimap<int, char>::iterator iter = myMap.begin(); iter != myMap.end(); iter++) {
cout << iter-> first << ":" << iter->second << endl;
}
cout << "Size is " << myMap.size() << "." << endl;
// for (int i=0; i < 4000; i++) {
// cout << "# This is line " << i << " of a file with a large header." << endl;
// }
// return 0;
if (argc < 2) {
cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
}
// if (argc < 2) {
// cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
// }
// ifstream inFileStream(argv[1]);
// static const int BUF_SIZE = 8192;
// BamTools::Internal::BgzfStream bgStream;
......
......@@ -52,6 +52,7 @@ ContextBase::ContextBase()
_seed(0),
_forwardOnly(false),
_reverseOnly(false),
_nameCheckDisabled(false),
_hasColumnOpsMethods(false),
_keyListOps(NULL),
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
......@@ -197,6 +198,9 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-sortout") == 0) {
if (!handle_sortout()) return false;
}
else if (strcmp(_argv[_i], "-nonamecheck") == 0) {
if (!handle_sortout()) return false;
}
}
return true;
......@@ -534,6 +538,13 @@ bool ContextBase::handle_sortout()
return true;
}
bool ContextBase::handle_nonamecheck()
{
setNameCheckDisabled(true);
markUsed(_i - _skipFirstArgs);
return true;
}
void ContextBase::setColumnOpsMethods(bool val)
{
if (val && !_hasColumnOpsMethods) {
......@@ -603,6 +614,8 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
void ContextBase::testNameConventions(const Record *record) {
if (getNameCheckDisabled()) return;
int fileIdx = record->getFileIdx();
//
......@@ -635,7 +648,7 @@ void ContextBase::testNameConventions(const Record *record) {
//
bool zeroVal = record->hasLeadingZeroInChromName();
bool zeroVal = record->hasLeadingZeroInChromName(hasChr);
testChrVal = fileHasLeadingZeroInChromNames(fileIdx);
if (testChrVal == UNTESTED) {
_fileHasLeadingZeroInChromNames[fileIdx] = zeroVal ? YES : NO;
......
......@@ -116,6 +116,8 @@ public:
virtual bool getUseFullBamTags() const { return _useFullBamTags; }
virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; }
virtual bool getNameCheckDisabled() const { return _nameCheckDisabled; }
virtual void setNameCheckDisabled(bool val) { _nameCheckDisabled = val; }
// METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER
// OF OUTPUT RECORDS.
......@@ -207,6 +209,7 @@ protected:
int _seed;
bool _forwardOnly;
bool _reverseOnly;
bool _nameCheckDisabled;
//Members for column operations
bool _hasColumnOpsMethods;
......@@ -265,6 +268,7 @@ protected:
virtual bool handle_null();
virtual bool handle_delim();
virtual bool handle_sortout();
virtual bool handle_nonamecheck();
bool handle_prec();
bool parseIoBufSize(QuickString bufStr);
......
......@@ -18,7 +18,8 @@ ContextClosest::ContextClosest()
_diffNames(false),
_tieMode(ALL_TIES),
_strandedDistMode(REF_DIST),
_multiDbMode(EACH_DB)
_multiDbMode(EACH_DB),
_numClosestHitsWanted(1)
{
// closest requires sorted input
setSortedInput(true);
......@@ -66,7 +67,13 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-id") == 0) {
if (!handle_id()) return false;
}
else if (strcmp(_argv[_i], "-N") == 0) {
else if (strcmp(_argv[_i], "-fu") == 0) {
if (!handle_fu()) return false;
}
else if (strcmp(_argv[_i], "-fd") == 0) {
if (!handle_fd()) return false;
}
else if (strcmp(_argv[_i], "-N") == 0) {
if (!handle_N()) return false;
}
else if (strcmp(_argv[_i], "-t") == 0) {
......@@ -75,6 +82,9 @@ bool ContextClosest::parseCmdArgs(int argc, char **argv, int skipFirstArgs){
else if (strcmp(_argv[_i], "-mdb") == 0) {
if (!handle_mdb()) return false;
}
else if (strcmp(_argv[_i], "-k") == 0) {
if (!handle_k()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
......@@ -106,6 +116,25 @@ bool ContextClosest::isValidState(){
return false;
}
if ((_forceUpstream || _forceDownstream) && ! _haveStrandedDistMode) {
_errorMsg = "\n*****\n*****ERROR: When requesting -fu or -fd, you also need to specify -D.\n*****\n";
return false;
}
if (_ignoreUpstream && _forceUpstream) {
_errorMsg = "\n*****\n*****ERROR: Can't both ignore upstream and force upstream.\n*****\n";
return false;
}
if (_ignoreDownstream && _forceDownstream) {
_errorMsg = "\n*****\n*****ERROR: Can't both ignore downstream and force downstream.\n*****\n";
return false;
}
if (_sortOutput && _reportDistance) {
_errorMsg = "\n*****\n*****ERROR: -sortout (sorted output) is not valid with distance reporting.\n*****\n";
return false;
}
return true;
}
......@@ -162,6 +191,18 @@ bool ContextClosest::handle_id() {
return true;
}
bool ContextClosest::handle_fu() {
_forceUpstream = true;
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextClosest::handle_fd() {
_forceDownstream = true;
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextClosest::handle_N() {
_diffNames = true;
markUsed(_i - _skipFirstArgs);
......@@ -220,3 +261,21 @@ bool ContextClosest::handle_mdb()
_errorMsg = "*****ERROR: Request \"each\" or \"last\" for Multiple Database Mode (-mdb)";
return false;
}
bool ContextClosest::handle_k()
{
if ((_i+1) < _argc) {
if (isNumeric(_argv[_i+1])) {
_numClosestHitsWanted = atoi(_argv[_i+1]);
if (_numClosestHitsWanted > 0) {
markUsed(_i - _skipFirstArgs);
_i++;
markUsed(_i - _skipFirstArgs);
return true;
}
}
}
_errorMsg = "\n***** ERROR: -k option must be followed by a postive integer value *****";
return false;
}
......@@ -23,10 +23,13 @@ public:
bool ignoreOverlaps() const { return _ignoreOverlaps; }
bool ignoreUpstream() const { return _ignoreUpstream; }
bool ignoreDownstream() const { return _ignoreDownstream; }
bool forceUpstream() const { return _forceUpstream; }
bool forceDownstream() const { return _forceDownstream; }
bool reportDistance() const { return _reportDistance; }
bool signDistance() const { return _signDistance; }
bool hasStrandedDistMode() const { return _haveStrandedDistMode; }
bool diffNames() const { return _diffNames; }
int getNumClosestHitsWanted() const { return _numClosestHitsWanted; }
typedef enum { FIRST_TIE, LAST_TIE, ALL_TIES} tieModeType;
tieModeType getTieMode() const { return _tieMode; }
......@@ -42,6 +45,8 @@ private:
bool _ignoreOverlaps;
bool _ignoreUpstream;
bool _ignoreDownstream;
bool _forceUpstream;
bool _forceDownstream;
bool _reportDistance;
bool _signDistance;
bool _haveStrandedDistMode;
......@@ -49,16 +54,19 @@ private:
tieModeType _tieMode;
strandedDistanceModeType _strandedDistMode;
multiDbModeType _multiDbMode;
int _numClosestHitsWanted;
bool handle_d();
bool handle_D();
bool handle_io();
bool handle_iu();
bool handle_id();
bool handle_fu();
bool handle_fd();
bool handle_N();
bool handle_t();
bool handle_mdb();
bool handle_k();
};
......
......@@ -258,17 +258,11 @@ bool Record::hasChrInChromName() const {
(str[2] == 'r' || str[2] == 'R'));
}
bool Record::hasLeadingZeroInChromName() const {
bool Record::hasLeadingZeroInChromName(bool chrKnown) const {
const char *str = _chrName.c_str();
size_t i=0;
while (i < _chrName.size() && !isdigit(str[i])) {
if (str[i] == '_') return false; //ignore
//leading zero after underscore, as some of the
//random and hap chroms are names that way.
i++;
}
return (i < _chrName.size() && str[i] == '0');
// only check to see if the digit zero follows the occurance of
// "chr", case insensitive.
return (_chrName.size() >= 4 && str[3] == '0' && (chrKnown || hasChrInChromName()));
}
void Record::print(FILE *fp, bool newline) const {
......
......@@ -137,7 +137,7 @@ public:
// virtual static bool isNumericField(int fieldNum) const = 0;
bool hasChrInChromName() const;
bool hasLeadingZeroInChromName() const;
bool hasLeadingZeroInChromName(bool chrKnown = false) const;
protected:
......
This diff is collapsed.
......@@ -9,9 +9,66 @@
#define CLOSESWEEP_H_
#include "NewChromsweep.h"
#include <list>
#include <set>
class ContextClosest;
class distanceTuple {
public:
distanceTuple() : _dist(0), _rec(NULL), _isNeg(false) {}
distanceTuple(int dist, const Record *rec, bool isNeg = false) : _dist(dist), _rec(rec), _isNeg(isNeg) {}
// bool operator < (const distanceTuple & other) const { return (_dist < other._dist); }
int _dist;
const Record *_rec;
bool _isNeg;
};
class DistanceTupleSortAscFunctor {
public:
bool operator()(const distanceTuple & d1, const distanceTuple & d2) const {
return ((d1._dist < d2._dist) ? true : (d1._dist == d2._dist ? (d1._isNeg && !d2._isNeg) : false)) ; }
};
class RecDistList {
public:
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
RecDistList(int maxSize) : _maxUniqueAllowed(maxSize) {}
~RecDistList() { clear(); }
bool empty() const { return _recs.empty(); }
void clear();
int uniqueSize() const { return _recs.size(); }
size_t totalSize() const { return _totalRecs; }
bool addRec(int dist, const Record *, chromDirType chromDir);
bool exists(int dist) const { return (_recs.find(dist) != _recs.end()); }
int furtherestDistance() const { return _recs.rbegin()->first; }
typedef vector<pair<chromDirType, const Record *> >elemsType;
typedef map<int, elemsType *> distRecsType;
typedef distRecsType::iterator iterType;
typedef distRecsType::reverse_iterator revIterType;
typedef distRecsType::const_iterator constIterType;
typedef distRecsType::const_reverse_iterator constRevIterType;
int getMaxDist() const { return _recs.empty() ? 0 : _recs.rbegin()->first; }
constIterType begin() const { return _recs.begin(); }
constIterType end() const { return _recs.end(); }
int currDist(constIterType iter) const { return iter->first; }
size_t currNumElems(constIterType iter) const { return iter->second->size(); }
const Record *firstElem(constIterType iter) const { return (iter->second->at(0)).second; }
const Record *lastElem(constIterType iter) const { return (iter->second->at(iter->second->size()-1)).second; }
const elemsType *allElems(constIterType iter) const { return iter->second; }
int getMaxLeftEndPos() const;
private:
void insert(int dist, const Record *, chromDirType chromDir);
distRecsType _recs;
int _maxUniqueAllowed;
int _totalRecs;
};
class CloseSweep : public NewChromSweep {
public:
CloseSweep(ContextClosest *context);
......@@ -21,13 +78,10 @@ public:
private:
ContextClosest *_context;
typedef vector<const Record * > distRecVecType;
vector<distRecVecType *> _minUpstreamRecs;
vector<int> _minUpstreamDist;
vector<distRecVecType *> _minDownstreamRecs;
vector<int> _minDownstreamDist;
vector<distRecVecType *> _overlapRecs;
int _kClosest; // how many closest hits we want to each query.
vector<RecDistList *> _minUpstreamRecs;
vector<RecDistList *> _minDownstreamRecs;
vector<RecDistList *> _overlapRecs;
vector<int> _maxPrevLeftClosestEndPos;
vector<int> _maxPrevLeftClosestEndPosReverse;
......@@ -48,9 +102,18 @@ private:
void finalizeSelections(int dbIdx, RecordKeyVector &retList);
void checkMultiDbs(RecordKeyVector &retList);
void setLeftClosestEndPos(int dbIdx, const Record *rec);
typedef enum { LEFT, OVERLAP, RIGHT } chromDirType;
typedef enum { UPSTREAM, INTERSECT, DOWNSTREAM } streamDirType;
void setLeftClosestEndPos(int dbIdx);
bool beforeLeftClosestEndPos(int dbIdx, const Record *rec);
void clearClosestEndPos(int dbIdx);
bool canStopScan(const Record *cacheRec, bool ignored, streamDirType streamDir);
int addRecsToRetList(const RecDistList::elemsType *recs, int currDist, RecordKeyVector &retList);
void addSingleRec(const Record *rec, int currDist, int &hitsUsed, RecordKeyVector &retList);
rateOvlpType tryToAddRecord(const Record *cacheRec, int dist, int dbIdx, bool &stopScanning, chromDirType chromDir, streamDirType streamDir);
bool purgePointException(int dbIdx);
};
......
......@@ -129,7 +129,7 @@ void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
if (_currQueryRec->sameChrom(cacheRec) && !_currQueryRec->after(cacheRec)) {
if (intersects(_currQueryRec, cacheRec)) {
retList.push_back(cacheRec);
} else break; // cacheRec is after the query rec, stop scanning.
} else if (cacheRec->after(_currQueryRec)) break; // cacheRec is after the query rec, stop scanning.
cacheIter = _caches[dbIdx].next();
}
else {
......@@ -150,6 +150,7 @@ void NewChromSweep::clearCache(int dbIdx)
}
void NewChromSweep::masterScan(RecordKeyVector &retList) {
for (int i=0; i < _numDBs; i++) {
if (dbFinished(i) || chromChange(i, retList, true)) {
continue;
......
......@@ -666,3 +666,8 @@ rm exp obs
cd sortAndNaming
bash test-sort-and-naming.sh
cd ..
cd kclosest
bash test-kclosest.sh
cd ..
......@@ -502,7 +502,25 @@ $BT intersect -a a.bed -b b.bed -f 1.00001 2>&1 > /dev/null | cat - | head -2 |
check exp obs
rm exp obs
##################################################################bug167_strandSweep.bed
# Test that -s works with chromsweep
##################################################################
echo " intersect.t41...\c"
echo \
"22" > exp
$BT intersect -a bug167_strandSweep.bed -b bug167_strandSweep.bed -sorted -s -wa -wb | wc -l > obs
check exp obs
rm exp obs
##################################################################bug167_strandSweep.bed
# Test that -S works with chromsweep
##################################################################
echo " intersect.t42...\c"
echo \
"20" > exp
$BT intersect -a bug167_strandSweep.bed -b bug167_strandSweep.bed -sorted -S -wa -wb | wc -l > obs
check exp obs
rm exp obs
rm one_block.bam two_blocks.bam three_blocks.bam
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment