diff --git a/src/intersectFile/Makefile b/src/intersectFile/Makefile index e265b3348d1b1883c8f46e15b2902e3f2e5f8933..8c81049e431a1fbb94ec92be94593c9c1ca3470d 100644 --- a/src/intersectFile/Makefile +++ b/src/intersectFile/Makefile @@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/RecordOutputMgr/ \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/BinTree \ diff --git a/src/mapFile/mapFile.cpp b/src/mapFile/mapFile.cpp index 7cff53109b6fff4d5184deb117d8a879ad39cf82..8dbf24ad54a2e93715fd453092e778efcdf24b0c 100644 --- a/src/mapFile/mapFile.cpp +++ b/src/mapFile/mapFile.cpp @@ -21,14 +21,11 @@ const int PRECISION = 21; FileMap::FileMap(ContextMap *context) : _context(context), _blockMgr(NULL), - _recordOutputMgr(NULL), - _colOps(_context->getColOps()) + _recordOutputMgr(NULL) { _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal()); _recordOutputMgr = new RecordOutputMgr(); _recordOutputMgr->init(_context); - _keyListOps.setNullValue(_context->getNullValue()); - _keyListOps.setDelimStr(_context->getDelim()); } FileMap::~FileMap(void) { @@ -46,174 +43,15 @@ bool FileMap::mapFiles() } RecordKeyList hitSet; while (sweep.next(hitSet)) { - _outputValues.clear(); if (_context->getObeySplits()) { RecordKeyList keySet(hitSet.getKey()); RecordKeyList resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); - calculateOutput(resultSet); - _recordOutputMgr->printRecord(resultSet.getKey(), _outputValues); + _recordOutputMgr->printRecord(resultSet.getKey(), _context->getColumnOpsVal(resultSet)); } else { - calculateOutput(hitSet); - _recordOutputMgr->printRecord(hitSet.getKey(), _outputValues); + _recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet)); } } return true; } -void FileMap::calculateOutput(RecordKeyList &hits) -{ - //loop through all requested columns, and for each one, call the method needed - //for the operation specified. - _keyListOps.setKeyList(&hits); - - double val = 0.0; - for (int i=0; i < (int)_colOps.size(); i++) { - int col = _colOps[i].first; - KeyListOps::OP_TYPES opCode = _colOps[i].second; - - _keyListOps.setColumn(col); - switch (opCode) { - case KeyListOps::SUM: - val = _keyListOps.getSum(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::MEAN: - val = _keyListOps.getMean(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::STDDEV: - val = _keyListOps.getStddev(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::SAMPLE_STDDEV: - val = _keyListOps.getSampleStddev(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::MEDIAN: - val = _keyListOps.getMedian(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::MODE: - _outputValues.append(_keyListOps.getMode()); - break; - - case KeyListOps::ANTIMODE: - _outputValues.append(_keyListOps.getAntiMode()); - break; - - case KeyListOps::MIN: - val = _keyListOps.getMin(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::MAX: - val = _keyListOps.getMax(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::ABSMIN: - val = _keyListOps.getAbsMin(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::ABSMAX: - val = _keyListOps.getAbsMax(); - if (isnan(val)) { - _outputValues.append(_context->getNullValue()); - } else { - _outputValues.append(val); - } - break; - - case KeyListOps::COUNT: - _outputValues.append(_keyListOps.getCount()); - break; - - case KeyListOps::DISTINCT: - _outputValues.append(_keyListOps.getDistinct()); - break; - - case KeyListOps::COUNT_DISTINCT: - _outputValues.append(_keyListOps.getCountDistinct()); - break; - - case KeyListOps::DISTINCT_ONLY: - _outputValues.append(_keyListOps.getDistinctOnly()); - break; - - case KeyListOps::COLLAPSE: - _outputValues.append(_keyListOps.getCollapse()); - break; - - case KeyListOps::CONCAT: - _outputValues.append(_keyListOps.getConcat()); - break; - - case KeyListOps::FREQ_ASC: - _outputValues.append(_keyListOps.getFreqAsc()); - break; - - case KeyListOps::FREQ_DESC: - _outputValues.append(_keyListOps.getFreqDesc()); - break; - - case KeyListOps::FIRST: - _outputValues.append(_keyListOps.getFirst()); - break; - - case KeyListOps::LAST: - _outputValues.append(_keyListOps.getLast()); - break; - - case KeyListOps::INVALID: - default: - // Any unrecognized operation should have been handled already in the context validation. - // It's thus unnecessary to handle it here, but throw an error to help us know if future - // refactoring or code changes accidentally bypass the validation phase. - cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl; - break; - } - //if this isn't the last column, add a tab. - if (i < (int)_colOps.size() -1) { - _outputValues.append('\t'); - } - } -} diff --git a/src/mapFile/mapFile.h b/src/mapFile/mapFile.h index e2143ef0fb110c2bb3fbde3500b3a5ee25403826..fbb431ac930bf68d7475f03ac8850438dd6d8302 100644 --- a/src/mapFile/mapFile.h +++ b/src/mapFile/mapFile.h @@ -38,11 +38,6 @@ private: ContextMap *_context; BlockMgr *_blockMgr; RecordOutputMgr *_recordOutputMgr; - KeyListOps _keyListOps; - const ContextMap::colOpsType & _colOps; - QuickString _outputValues; // placeholder for the results of mapping B to each a in A. - - void calculateOutput(RecordKeyList &hits); }; #endif /* MAPFILE_H */ diff --git a/src/nekSandbox1/Makefile b/src/nekSandbox1/Makefile index fbe6d8616676f86a02102b981f1d4e5c897ebf71..df8aba7262492b13bb9b9636577d01afa77d55ba 100644 --- a/src/nekSandbox1/Makefile +++ b/src/nekSandbox1/Makefile @@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders \ -I$(UTILITIES_DIR)/FileRecordTools/Records \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/general \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/GenomeFile/ \ diff --git a/src/regressTest/Makefile b/src/regressTest/Makefile index e9ceebf3a43bc67ff0f1672139b174d66fb1f037..8ffeeab244a81930b3cf6cbe6bfbe62ece72bcaa 100644 --- a/src/regressTest/Makefile +++ b/src/regressTest/Makefile @@ -18,6 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders \ -I$(UTILITIES_DIR)/FileRecordTools/Records \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/general # ---------------------------------- diff --git a/src/sampleFile/Makefile b/src/sampleFile/Makefile index 2042291e68735b4316062d64a04e7f35d9847b2a..9ccbe5a9ef96b7ef4ed2d424fbc396794f2efc6c 100644 --- a/src/sampleFile/Makefile +++ b/src/sampleFile/Makefile @@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/RecordOutputMgr/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile index de04c816782689fc36ab33275d89bfa8f8d2a61e..c29b5ebadf2788b012c46e67330504e2edb52416 100644 --- a/src/utils/BinTree/Makefile +++ b/src/utils/BinTree/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 1f0c7a172e9d7af67377176dc400766bba43e276..adbc47afb4105c611661fa95c11a72aa990c25ed 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -52,11 +52,16 @@ ContextBase::ContextBase() _hasConstantSeed(false), _seed(0), _forwardOnly(false), - _reverseOnly(false) + _reverseOnly(false), + _hasColumnOpsMethods(false) { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; _programNames["map"] = MAP; + + if (hasColumnOpsMethods()) { + _keyListOps = new KeyListOps(); + } } ContextBase::~ContextBase() @@ -70,6 +75,11 @@ ContextBase::~ContextBase() delete _files[i]; _files[i] = NULL; } + if (hasColumnOpsMethods()) { + delete _keyListOps; + _keyListOps = NULL; + } + } bool ContextBase::determineOutputType() { @@ -167,6 +177,19 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if (strcmp(_argv[_i], "-seed") == 0) { if (!handle_seed()) return false; } + else if (strcmp(_argv[_i], "-o") == 0) { + if (!handle_o()) return false; + } + else if (strcmp(_argv[_i], "-c") == 0) { + if (!handle_c()) return false; + } + else if (strcmp(_argv[_i], "-null") == 0) { + if (!handle_null()) return false; + } + else if (strcmp(_argv[_i], "-delim") == 0) { + if (!handle_delim()) return false; + } + } return true; } @@ -182,6 +205,12 @@ bool ContextBase::isValidState() if (!determineOutputType()) { return false; } + if (hasColumnOpsMethods()) { + FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0); + if (!_keyListOps->isValidColumnOps(dbFile)) { + return false; + } + } return true; } @@ -354,3 +383,85 @@ bool ContextBase::handle_ubam() markUsed(_i - _skipFirstArgs); return true; } + + +// Methods specific to column operations. +// for col ops, -c is the string of columns upon which to operate +bool ContextBase::handle_c() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setColumns(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + + +// for col ops, -o is the string of operations to apply to the columns (-c) +bool ContextBase::handle_o() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setOperations(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + + +// for col ops, -null is a NULL vakue assigned +// when no overlaps are detected. +bool ContextBase::handle_null() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setNullValue(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + +//for col ops, delimStr will appear between each item in +//a collapsed but delimited list. +bool ContextBase::handle_delim() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setDelimStr(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + +void ContextBase::setColumnOpsMethods(bool val) +{ + _hasColumnOpsMethods = val; + if (val) { + _keyListOps = new KeyListOps(); + } +} + +const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { + if (!hasColumnOpsMethods()) { + return _nullStr; + } + return _keyListOps->getOpVals(keyList); +} + diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index 7846f6218f02e184248ceeb228f6dddb5206e48a..b4bf12279f71db6a934a4e42a8970771a4a528e8 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -24,6 +24,7 @@ #include "NewGenomeFile.h" #include "api/BamReader.h" #include "api/BamAux.h" +#include "KeyListOps.h" class ContextBase { @@ -144,6 +145,13 @@ public: //methods. virtual bool hasIntersectMethods() const { return false; } + // determine whether column operations like those used in map + // are available. + void setColumnOpsMethods(bool val); + virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; } + const QuickString &getColumnOpsVal(RecordKeyList &keyList) const; + //methods applicable only to column operations. + protected: PROGRAM_TYPE _program; @@ -204,6 +212,10 @@ protected: bool _forwardOnly; bool _reverseOnly; + bool _hasColumnOpsMethods; + KeyListOps *_keyListOps; + QuickString _nullStr; //placeholder return value when col ops aren't valid. + void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); @@ -227,6 +239,11 @@ protected: virtual bool handle_split(); virtual bool handle_sorted(); virtual bool handle_ubam(); + + virtual bool handle_c(); + virtual bool handle_o(); + virtual bool handle_null(); + virtual bool handle_delim(); }; #endif /* CONTEXTBASE_H_ */ diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp index 8b2027241934986a3abfe083cfa41b37e35c2b79..e3f82417cda79b1ab81e89d72847613b6c151c53 100644 --- a/src/utils/Contexts/ContextMap.cpp +++ b/src/utils/Contexts/ContextMap.cpp @@ -8,18 +8,11 @@ #include "ContextMap.h" ContextMap::ContextMap() -: _delimStr(",") { // map requires sorted input setSortedInput(true); setLeftJoin(true); - - // default to BED score column - setColumns("5"); - // default to "sum" - setOperations("sum"); - // default to "." as a NULL value - setNullValue('.'); + setColumnOpsMethods(true); } ContextMap::~ContextMap() @@ -45,131 +38,22 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { if (isUsed(_i - _skipFirstArgs)) { continue; } - else if (strcmp(_argv[_i], "-o") == 0) { - if (!handle_o()) return false; - } - else if (strcmp(_argv[_i], "-c") == 0) { - if (!handle_c()) return false; - } - else if (strcmp(_argv[_i], "-null") == 0) { - if (!handle_null()) return false; - } - else if (strcmp(_argv[_i], "-delim") == 0) { - if (!handle_delim()) return false; - } + if (strcmp(_argv[_i], "-c") == 0) { + //bypass intersect's use of the -c option, because -c + //means writeCount for intersect, but means columns for map. + if (!ContextBase::handle_c()) return false; + } } return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); } - - -bool ContextMap::isValidState() -{ - if (!ContextIntersect::isValidState()) { - return false; - } - - if (getDatabaseFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { - //throw Error - cerr << endl << "*****" << endl - << "***** ERROR: BAM database file not currently supported for the map tool." - << endl; - exit(1); - } - - - //get the strings from context containing the comma-delimited lists of columns - //and operations. Split both of these into vectors. Get the operation code - //for each operation string. Finally, make a vector of pairs, where the first - //member of each pair is a column number, and the second member is the code for the - //operation to perform on that column. - - vector<QuickString> columnsVec; - vector<QuickString> opsVec; - int numCols = Tokenize(_columns, columnsVec, ','); - int numOps = Tokenize(_operations, opsVec, ','); - - if (numOps < 1 || numCols < 1) { - cerr << endl << "*****" << endl - << "***** ERROR: There must be at least one column and at least one operation named." << endl; - return false; - } - if (numOps > 1 && numCols != numOps) { - cerr << endl << "*****" << endl - << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations. " << endl; - cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl; - cerr << "\tor an operation for each column." << endl; - return false; - } - KeyListOps keyListOps; - for (int i=0; i < (int)columnsVec.size(); i++) { - int col = str2chrPos(columnsVec[i]); - - //check that the column number is valid - if (col < 1 || col > getDatabaseFile()->getNumFields()) { - cerr << endl << "*****" << endl << "***** ERROR: Requested column " << col << ", but database file " - << getDatabaseFileName() << " only has fields 1 - " << getDatabaseFile()->getNumFields() << "." << endl; - return false; - } - const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0]; - KeyListOps::OP_TYPES opCode = keyListOps.getOpCode(operation); - if (opCode == KeyListOps::INVALID) { - cerr << endl << "*****" << endl - << "***** ERROR: " << operation << " is not a valid operation. " << endl; - return false; - } - _colOps.push_back(pair<int, KeyListOps::OP_TYPES>(col, opCode)); - } - return true; -} - - -// for map, -c is the string of columns upon which to operate -bool ContextMap::handle_c() -{ - if ((_i+1) < _argc) { - setColumns(_argv[_i + 1]); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} - - -// for map, -o is the string of operations to apply to the columns (-c) -bool ContextMap::handle_o() -{ - if ((_i+1) < _argc) { - setOperations(_argv[_i + 1]); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} - - -// for map, -null is a NULL vakue assigned -// when no overlaps are detected. -bool ContextMap::handle_null() -{ - if ((_i+1) < _argc) { - setNullValue(_argv[_i + 1]); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} - -bool ContextMap::handle_delim() -{ - if ((_i+1) < _argc) { - _delimStr = _argv[_i + 1]; - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} +// +// +//bool ContextMap::isValidState() +//{ +// if (!ContextIntersect::isValidState()) { +// return false; +// } +//} +// +// diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index 460f93b29100cf4ad2d6fbc2bd5e9e1805956938..9b7280e5425991a3f47588c7c3c78cf9b9fd8745 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -15,37 +15,14 @@ class ContextMap : public ContextIntersect { public: ContextMap(); virtual ~ContextMap(); - virtual bool isValidState(); - +// virtual bool isValidState(); +// virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); - - const QuickString &getColumns() const { return _columns; } - void setColumns(const QuickString &columns) { _columns = columns; } - - const QuickString & getOperations() const { return _operations; } - void setOperations(const QuickString & operation) { _operations = operation; } - - const QuickString & getNullValue() const { return _nullValue; } - void setNullValue(const QuickString & nullValue) { _nullValue = nullValue; } - - const QuickString &getDelim() const { return _delimStr; } +// virtual bool hasIntersectMethods() const { return true; } - - typedef vector<pair<int, KeyListOps::OP_TYPES> > colOpsType; - const colOpsType &getColOps() const { return _colOps; } +// private: - QuickString _operations; - QuickString _columns; - QuickString _nullValue; - KeyListOps _keyListOps; - colOpsType _colOps; - QuickString _delimStr; - - virtual bool handle_c(); - virtual bool handle_o(); - virtual bool handle_null(); - virtual bool handle_delim(); }; diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp index 4c5cd8dc2fb4b3716ed1fc3de598428be747eb4e..f939fefbe6d117e88f224f8e5b14c2562f4be4c7 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.cpp +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -172,5 +172,10 @@ const QuickString &BamRecord::getField(int fieldNum) const return Bed6Interval::getField(fieldNum); } +bool BamRecord::isNumericField(int fieldNum) { + + //TBD: As with getField, this isn't defined for BAM. + return (fieldNum > 6 ? false : Bed6Interval::isNumericField(fieldNum)); +} diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h index b74dbc2c92b05e83ba64e7eca1e1ddf6e59dd525..022ecb4d94da5f96133c487e774de7ff4377be00 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.h +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -40,6 +40,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 12; } + static bool isNumericField(int fieldNum); protected: BamTools::BamAlignment _bamAlignment; diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.cpp b/src/utils/FileRecordTools/Records/Bed12Interval.cpp index 867a69ec4c23e50d2f1c3588f9622a78ce992176..0a5a092feef3ba1d02d599f7d50243c949aeb4f6 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed12Interval.cpp @@ -146,3 +146,29 @@ const QuickString &Bed12Interval::getField(int fieldNum) const } } +bool Bed12Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 7: + return true; + break; + case 8: + return true; + break; + case 9: + return false; + break; + case 10: + return true; + break; + case 11: + return false; + break; + case 12: + return false; + break; + default: + return Bed6Interval::isNumericField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.h b/src/utils/FileRecordTools/Records/Bed12Interval.h index 711800c3d7560d0834f5b823d48ffa47f3ec3535..ffa89f9044a0d8df0d25a3f4fadb90e2b27afaa0 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.h +++ b/src/utils/FileRecordTools/Records/Bed12Interval.h @@ -54,6 +54,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 12; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp index 3f896be5a80614cccc8b058eb04b746f2ec19b84..e31e43eafa5e5bb2a1b7eeadc9940a5332fe17b5 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -79,3 +79,23 @@ const QuickString &Bed3Interval::getField(int fieldNum) const break; } } + +bool Bed3Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 1: + return false; //chrom + break; + case 2: + return true; //startPos + break; + case 3: + return true; //endPos + break; + default: + cerr << endl << "*****" << endl + << "*****ERROR: requested invalid column " << fieldNum << ". Exiting." << endl + << endl << "*****" << endl; + exit(1); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h index 9f1ff1183a975fa7defda698abee05831bd5b771..93377a0a171df9cc8bffae94b51f53d667b313c7 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.h +++ b/src/utils/FileRecordTools/Records/Bed3Interval.h @@ -32,6 +32,8 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 3; } + static bool isNumericField(int fieldNum); + protected: virtual ~Bed3Interval(); diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp index c1ef81a3c51d91fd563c351d7ae751e80892cfe6..27ca9f7f845a81c9686368db9f68a873ade7551d 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -60,3 +60,8 @@ const QuickString &Bed4Interval::getField(int fieldNum) const } } +bool Bed4Interval::isNumericField(int fieldNum) { + return (fieldNum == 4 ? false : Bed3Interval::isNumericField(fieldNum)); +} + + diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h index f42817c495ed4780dd57aa82d3c4a93718840e5b..b03844645165650742db79a6337814a826ecd05d 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.h +++ b/src/utils/FileRecordTools/Records/Bed4Interval.h @@ -28,6 +28,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 4; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp index 7307fb667ee5bbbf40fe0576fd9654bec0c9ad0f..130a788d945c523a5bdbb011c349c28cbe1f5edb 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp @@ -70,3 +70,16 @@ const QuickString &Bed5Interval::getField(int fieldNum) const break; } } + +bool Bed5Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return false; + break; + case 5: + return true; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + } +} diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h index bc913d1df6f4b22c355d0ace56d5e0143feced12..2064d3546da9674bda43adfb844e212ce5d32876 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.h +++ b/src/utils/FileRecordTools/Records/Bed5Interval.h @@ -27,6 +27,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 5; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp index 8371553a435941c0b713d64a55d198e4760a75ac..5bc783c748e09fc0d52fe54d3a52b628e06b9744 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp @@ -81,3 +81,20 @@ const QuickString &Bed6Interval::getField(int fieldNum) const break; } } + +bool Bed6Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return false; + break; + case 5: + return true; + break; + case 6: + return false; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h index 9ad9f80b523ae5f5aef43aa3fd61e29cdc09e27f..023683fe6b48191bddcb9cd68dd5d102cd79df2f 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.h +++ b/src/utils/FileRecordTools/Records/Bed6Interval.h @@ -27,6 +27,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 6; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp index e0808573c0a6cb35de787421a4a60e260d08ffa7..9cfda4803d4e5d3e876ea5bc26227d7ede8a1faa 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp @@ -60,3 +60,14 @@ const QuickString &BedGraphInterval::getField(int fieldNum) const } } +bool BedGraphInterval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return true; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h index 1bdf619a807885347ace0a55fbcaf302c4ff95a2..5db6feafa09666b8a981b4dc755ed8998b24d113 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.h +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h @@ -28,6 +28,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 4; } + static bool isNumericField(int fieldNum); protected: virtual ~BedGraphInterval(); diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp index fc8be368e09d12ed8ae31446c83313296fb8d2a5..5819b863c1da839ab31d6fb29f61752a6d73eb02 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp @@ -117,3 +117,18 @@ const QuickString &BedPlusInterval::getField(int fieldNum) const } return Bed6Interval::getField(fieldNum); } + +bool BedPlusInterval::isNumericField(int fieldNum) { + + // + // TBD: There is no currently no good way to guarantee / enforce whether + // fields after the 6th are numeric, so for now we'll give the user the + // benefit of the doubt on those. + // + if (fieldNum > startOtherIdx) { + return true; + } else { + return Bed6Interval::isNumericField(fieldNum); + } +} + diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h index 4b98b4f3d0a745ea80709f68bfd0bfa7a7e23d09..077ed9367ac5b99a707d04f0387fb7ea0a09ccb7 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.h +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h @@ -38,6 +38,8 @@ public: virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setNumPrintFields(int num) { _numPrintFields = num; } virtual int getNumPrintFields() const { return _numPrintFields; } + static bool isNumericField(int fieldNum); + protected: virtual ~BedPlusInterval(); diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp index a91ce159eb819309959d0391f70d69e8217b6808..21cea1dab87ea07e05e5b6331ecca4f9abe56cb5 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.cpp +++ b/src/utils/FileRecordTools/Records/GffRecord.cpp @@ -156,4 +156,40 @@ const QuickString &GffRecord::getField(int fieldNum) const } } +bool GffRecord::isNumericField(int fieldNum) { + switch (fieldNum) { + case 1: + return false; + break; + case 2: + return false; + break; + case 3: + return false; + break; + case 4: + return true; + break; + case 5: + return true; + break; + case 6: + return true; + break; + case 7: + return false; + break; + case 8: + return false; + break; + case 9: + return false; + break; + default: + return Bed6Interval::isNumericField(fieldNum); + break; + } + +} + diff --git a/src/utils/FileRecordTools/Records/GffRecord.h b/src/utils/FileRecordTools/Records/GffRecord.h index b84d96a71384750e850de0ffe4d03bb3e8272dfd..e675542f37e6939897e0e5adb1f537ef642761d4 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.h +++ b/src/utils/FileRecordTools/Records/GffRecord.h @@ -34,6 +34,7 @@ public: //Note: using the assignment operator in a GffRecord can potentially be a performance hit, //if the number of fields frequently differ between this object and the one being copied. const GffRecord &operator=(const GffRecord &other); + static bool isNumericField(int fieldNum); protected: virtual ~GffRecord(); diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 2beb4dca13cad05913be21e5870c66657a2dfdc7..89544ed21f23e91d7db73ab1e6e2158edc695ae9 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -187,9 +187,9 @@ void Record::undoZeroLength() ostream &operator << (ostream &out, const Record &record) { - QuickString errBuf; - record.print(errBuf); - out << errBuf; + QuickString outBuf; + record.print(outBuf); + out << outBuf; return out; } diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 2c303d90c02272ea1c7a3956d29e8fac1abd0398..d8071c1ed223d7172bd7faa33e6ff6f514975616 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -129,6 +129,8 @@ public: virtual bool sameChromIntersects(const Record *otherRecord, bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) const; +// virtual static bool isNumericField(int fieldNum) const = 0; + protected: virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record. diff --git a/src/utils/GenomeFile/Makefile b/src/utils/GenomeFile/Makefile index afaeccd4af5e138a17fa8cd0fd437141840935d2..fd17d29941e5dc51eadd02f66f6a142fd9160266 100644 --- a/src/utils/GenomeFile/Makefile +++ b/src/utils/GenomeFile/Makefile @@ -6,6 +6,7 @@ UTILITIES_DIR = ../ # ------------------- INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include/ # ---------------------------------- diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp index 05a60402e867a641c616bb2b7d86cf599089d759..657635000e5313c200567d25ea36a29623bd4056 100644 --- a/src/utils/KeyListOps/KeyListOps.cpp +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -1,37 +1,14 @@ /* * KeyListOps.cpp * - * Created on: Feb 6, 2014 + * Created on: Feb 24, 2014 * Author: nek3d */ - #include "KeyListOps.h" -#include <cfloat> -#include <cmath> -#include <algorithm> - -KeyListOps::KeyListOps() -: _keyList(&_nullKeyList), - _column(1), - _nullVal("."), - _delimStr(","), - _iter(_nullKeyList.begin()) -{ - init(); +#include "FileRecordMgr.h" +#include <cmath> //for isnan -} - -KeyListOps::KeyListOps(RecordKeyList *keyList, int column) -: _keyList(keyList), - _column(column), - _nullVal("."), - _delimStr(","), - _iter(keyList->begin()) -{ - init(); -} - -void KeyListOps::init() { +KeyListOps::KeyListOps() { _opCodes["sum"] = SUM; _opCodes["mean"] = MEAN; _opCodes["stddev"] = STDDEV; @@ -53,11 +30,44 @@ void KeyListOps::init() { _opCodes["freq_desc"] = FREQ_DESC; _opCodes["first"] = FIRST; _opCodes["last"] = LAST; -} + _isNumericOp[SUM] = true; + _isNumericOp[MEAN] = true; + _isNumericOp[STDDEV] = true; + _isNumericOp[MEDIAN] = true; + _isNumericOp[MODE] = false; + _isNumericOp[ANTIMODE] = false; + _isNumericOp[MIN] = true; + _isNumericOp[MAX] = true; + _isNumericOp[ABSMIN] = true; + _isNumericOp[COUNT] = false; + _isNumericOp[DISTINCT] = false; + _isNumericOp[COUNT_DISTINCT] = false; + _isNumericOp[DISTINCT_ONLY] = false; + _isNumericOp[COLLAPSE] = false; + _isNumericOp[CONCAT] = false; + _isNumericOp[FREQ_ASC] = false; + _isNumericOp[FREQ_DESC] = false; + _isNumericOp[FIRST] = false; + _isNumericOp[LAST] = false; + + _methods.setDelimStr(","); + _methods.setNullValue("."); + + // default to BED score column + _columns = "5"; + // default to "sum" + _operations = "sum"; + +} -KeyListOps::~KeyListOps() { +bool KeyListOps::isNumericOp(OP_TYPES op) const { + map<OP_TYPES, bool>::const_iterator iter = _isNumericOp.find(op); + return (iter == _isNumericOp.end() ? false : iter->second); +} +bool KeyListOps::isNumericOp(const QuickString &op) const { + return isNumericOp(getOpCode(op)); } KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const { @@ -70,336 +80,285 @@ KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const { return iter->second; } -// return the total of the values in the vector -double KeyListOps::getSum() { - if (empty()) return NAN; - double theSum = 0.0; - for (begin(); !end(); next()) { - theSum += getColValNum(); - } - return theSum; -} +bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { -// return the average value in the vector -double KeyListOps::getMean() { - if (empty()) return NAN; + if (dbFile->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + //throw Error + cerr << endl << "*****" << endl + << "***** ERROR: BAM database file not currently supported for column operations." + << endl; + exit(1); + } - return getSum() / (float)getCount(); -} + //get the strings from context containing the comma-delimited lists of columns + //and operations. Split both of these into vectors. Get the operation code + //for each operation string. Finally, make a vector of pairs, where the first + //member of each pair is a column number, and the second member is the code for the + //operation to perform on that column. - // return the standard deviation -double KeyListOps::getStddev() { - if (empty()) return NAN; + vector<QuickString> columnsVec; + vector<QuickString> opsVec; + int numCols = Tokenize(_columns, columnsVec, ','); + int numOps = Tokenize(_operations, opsVec, ','); - double avg = getMean(); - double squareDiffSum = 0.0; - for (begin(); !end(); next()) { - double val = getColValNum(); - double diff = val - avg; - squareDiffSum += diff * diff; + if (numOps < 1 || numCols < 1) { + cerr << endl << "*****" << endl + << "***** ERROR: There must be at least one column and at least one operation named." << endl; + return false; } - return squareDiffSum / (float)getCount(); -} -// return the standard deviation -double KeyListOps::getSampleStddev() { - if (empty()) return NAN; - - double avg = getMean(); - double squareDiffSum = 0.0; - for (begin(); !end(); next()) { - double val = getColValNum(); - double diff = val - avg; - squareDiffSum += diff * diff; - } - return squareDiffSum / ((float)getCount() - 1.0); -} - -// return the median value in the vector -double KeyListOps::getMedian() { - if (empty()) return NAN; - - //get sorted vector. if even number of elems, return middle val. - //if odd, average of two. - toArray(true, ASC); - size_t count = getCount(); - if (count % 2) { - //odd number of elements. Take middle one. - return _numArray[count/2]; - } else { - //even numnber of elements. Take average of middle 2. - double sum = _numArray[count/2 -1] + _numArray[count/2]; - return sum / 2.0; + if (numOps > 1 && numCols != numOps) { + cerr << endl << "*****" << endl + << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl; + cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl; + cerr << "\tor an operation for each column." << endl; + return false; } -} - -// return the most common value in the vector -const QuickString &KeyListOps::getMode() { - if (empty()) return _nullVal; - - makeFreqMap(); - - //now pass through the freq map and keep track of which key has the highest occurance. - freqMapType::iterator maxIter = _freqMap.begin(); - int maxVal = 0; - for (; _freqIter != _freqMap.end(); _freqIter++) { - if (_freqIter->second > maxVal) { - maxIter = _freqIter; - maxVal = _freqIter->second; + for (int i=0; i < (int)columnsVec.size(); i++) { + int col = str2chrPos(columnsVec[i]); + + //check that the column number is valid + if (col < 1 || col > dbFile->getNumFields()) { + cerr << endl << "*****" << endl << "***** ERROR: Requested column " << col << ", but database file " + << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl; + return false; } - } - _retStr = maxIter->first; - return _retStr; -} -// return the least common value in the vector -const QuickString &KeyListOps::getAntiMode() { - if (empty()) return _nullVal; - - makeFreqMap(); - - //now pass through the freq map and keep track of which key has the highest occurance. - freqMapType::iterator minIter = _freqMap.begin(); - int minVal = INT_MAX; - for (; _freqIter != _freqMap.end(); _freqIter++) { - if (_freqIter->second < minVal) { - minIter = _freqIter; - minVal = _freqIter->second; + const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0]; + OP_TYPES opCode = getOpCode(operation); + if (opCode == INVALID) { + cerr << endl << "*****" << endl + << "***** ERROR: " << operation << " is not a valid operation. " << endl; + return false; } + _colOps.push_back(pair<int, OP_TYPES>(col, opCode)); } - _retStr = minIter->first; - return _retStr; -} -// return the minimum element of the vector -double KeyListOps::getMin() { - if (empty()) return NAN; - - double minVal = DBL_MAX; - for (begin(); !end(); next()) { - double currVal = getColValNum(); - minVal = (currVal < minVal) ? currVal : minVal; - } - return minVal; -} - -// return the maximum element of the vector -double KeyListOps::getMax() { - if (empty()) return NAN; - - double maxVal = DBL_MIN; - for (begin(); !end(); next()) { - double currVal = getColValNum(); - maxVal = (currVal > maxVal) ? currVal : maxVal; - } - return maxVal; -} - -// return the minimum absolute value of the vector -double KeyListOps::getAbsMin() { - if (empty()) return NAN; - - double minVal = DBL_MAX; - for (begin(); !end(); next()) { - double currVal = abs(getColValNum()); - minVal = (currVal < minVal) ? currVal : minVal; - } - return minVal; -} -// return the maximum absolute value of the vector -double KeyListOps::getAbsMax() { - if (empty()) return NAN; - - double maxVal = DBL_MIN; - for (begin(); !end(); next()) { - double currVal = abs(getColValNum()); - maxVal = (currVal > maxVal) ? currVal : maxVal; - } - return maxVal; -} -// return the count of element in the vector -uint32_t KeyListOps::getCount() { - return _keyList->size(); -} -// return a delimited list of the unique elements -const QuickString &KeyListOps::getDistinct() { - if (empty()) return _nullVal; - // separated list of unique values. If something repeats, only report once. - makeFreqMap(); - _retStr.clear(); - for (; _freqIter != _freqMap.end(); _freqIter++) { - if (_freqIter != _freqMap.begin()) _retStr += _delimStr; - _retStr.append(_freqIter->first); - } - return _retStr; -} - -const QuickString &KeyListOps::getDistinctOnly() { - if (empty()) return _nullVal; - - //separated list of only unique values. If item repeats, discard. - makeFreqMap(); - _retStr.clear(); - for (; _freqIter != _freqMap.end(); _freqIter++) { - if (_freqIter->second != 1) continue; - if (_freqIter != _freqMap.begin()) _retStr += _delimStr; - _retStr.append(_freqIter->first); - } - return _retStr; -} - -// return a the count of _unique_ elements in the vector -uint32_t KeyListOps::getCountDistinct() { - if (empty()) return 0; - - makeFreqMap(); - return _freqMap.size(); -} -// return a delimiter-separated list of elements -const QuickString &KeyListOps::getCollapse(const QuickString &delimiter) { - if (empty()) return _nullVal; - - //just put all items in one big separated list. - _retStr.clear(); - int i=0; - for (begin(); !end(); next()) { - if (i > 0) _retStr += _delimStr; - _retStr.append(getColVal()); - i++; - } - return _retStr; - -} -// return a concatenation of all elements in the vector -const QuickString &KeyListOps::getConcat() { - if (empty()) return _nullVal; - - //like collapse but w/o commas. Just a true concat of all vals. - //just swap out the delimChar with '' and call collapse, then - //restore the delimChar. - QuickString oldDelimStr(_delimStr); - _delimStr = ""; - getCollapse(); //this will store it's results in the _retStr method. - _delimStr = oldDelimStr; - return _retStr; -} - -// return a histogram of values and their freqs. in desc. order of frequency -const QuickString &KeyListOps::getFreqDesc() { - if (empty()) return _nullVal; - - //for each uniq val, report # occurances, in desc order. - makeFreqMap(); - //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map. - histDescType hist; - for (; _freqIter != _freqMap.end(); _freqIter++) { - hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); - } - //now iterate through the reverse map we just made and output it's pairs in val:key format. - _retStr.clear(); - for (histDescType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { - if (histIter != hist.begin()) _retStr += _delimStr; - _retStr.append(histIter->second); - _retStr += ":"; - _retStr.append(histIter->first); - } - return _retStr; -} -// return a histogram of values and their freqs. in asc. order of frequency -const QuickString &KeyListOps::getFreqAsc() { - if (empty()) return _nullVal; - - //for each uniq val, report # occurances, in asc order. - makeFreqMap(); - //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map. - histAscType hist; - for (; _freqIter != _freqMap.end(); _freqIter++) { - hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); -// hist[*(_freqIter->second)] = _freqIter->first; - } - //now iterate through the reverse map we just made and output it's pairs in val:key format. - _retStr.clear(); - for (histAscType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { - if (histIter != hist.begin()) _retStr += _delimStr; - _retStr.append(histIter->second); - _retStr += ":"; - _retStr.append(histIter->first); - } - return _retStr; -} -// return the first value in the list -const QuickString &KeyListOps::getFirst() { - if (empty()) return _nullVal; - - //just the first item. - begin(); - return getColVal(); -} -// return the last value in the list -const QuickString &KeyListOps::getLast() { - if (empty()) return _nullVal; - - //just the last item. - begin(); - for (size_t i = 0; i < getCount() -1; i++) { - next(); - } - return getColVal(); -} -const QuickString &KeyListOps::getColVal() { - return _iter->value()->getField(_column); -} - -double KeyListOps::getColValNum() { - return atof(_iter->value()->getField(_column).c_str()); -} -void KeyListOps::toArray(bool useNum, SORT_TYPE sortVal) { - - //TBD: optimize performance with better memory management. - if (useNum) { - _numArray.resize(_keyList->size()); - int i=0; - for (begin(); !end(); next()) { - _numArray[i] = getColValNum(); - i++; - } - } else { - _qsArray.resize(_keyList->size()); - int i=0; - for (begin(); !end(); next()) { - _qsArray[i] = getColVal(); - i++; + //The final step we need to do is check that for each column/operation pair, + //if the operation is numeric, see if the database's record type supports + //numeric operations for that column. For instance, we can allow the mean + //of column 4 for a BedGraph file, because that's numeric, but not for Bed4, + //because that isn't. + + for (int i = 0; i < (int)_colOps.size(); i++) { + int col = _colOps[i].first; + OP_TYPES opCode = _colOps[i].second; + FileRecordTypeChecker::RECORD_TYPE recordType = dbFile->getRecordType(); + + if (isNumericOp(opCode)) { + bool isValidNumOp = false; + switch(recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + isValidNumOp = Bed3Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + isValidNumOp = Bed4Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + isValidNumOp = Bed5Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + isValidNumOp = BedGraphInterval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + isValidNumOp = Bed6Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + isValidNumOp = BedPlusInterval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED12_RECORD_TYPE: + isValidNumOp = Bed12Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BAM_RECORD_TYPE: + isValidNumOp = BamRecord::isNumericField(col); + break; + + case FileRecordTypeChecker::VCF_RECORD_TYPE: + isValidNumOp = VcfRecord::isNumericField(col); + break; + + case FileRecordTypeChecker::GFF_RECORD_TYPE: + isValidNumOp = GffRecord::isNumericField(col); + break; + + default: + break; + } + if (!isValidNumOp) { + cerr << endl << "*****" << endl << "***** ERROR: Column " << col << " is not a numeric field for database file " + << dbFile->getFileName() << "." << endl; + return false; + } } } - if (sortVal != UNSORTED) { - sortArray(useNum, sortVal == ASC); - } + + return true; } -void KeyListOps::sortArray(bool useNum, bool ascOrder) +const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) { - if (useNum) { - if (ascOrder) { - sort(_numArray.begin(), _numArray.end(), less<double>()); - } else { - sort(_numArray.begin(), _numArray.end(), greater<double>()); + //loop through all requested columns, and for each one, call the method needed + //for the operation specified. + _methods.setKeyList(&hits); + _outVals.clear(); + double val = 0.0; + for (int i=0; i < (int)_colOps.size(); i++) { + int col = _colOps[i].first; + OP_TYPES opCode = _colOps[i].second; + + _methods.setColumn(col); + switch (opCode) { + case SUM: + val = _methods.getSum(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MEAN: + val = _methods.getMean(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case STDDEV: + val = _methods.getStddev(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case SAMPLE_STDDEV: + val = _methods.getSampleStddev(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MEDIAN: + val = _methods.getMedian(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MODE: + _outVals.append(_methods.getMode()); + break; + + case ANTIMODE: + _outVals.append(_methods.getAntiMode()); + break; + + case MIN: + val = _methods.getMin(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MAX: + val = _methods.getMax(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case ABSMIN: + val = _methods.getAbsMin(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case ABSMAX: + val = _methods.getAbsMax(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case COUNT: + _outVals.append(_methods.getCount()); + break; + + case DISTINCT: + _outVals.append(_methods.getDistinct()); + break; + + case COUNT_DISTINCT: + _outVals.append(_methods.getCountDistinct()); + break; + + case DISTINCT_ONLY: + _outVals.append(_methods.getDistinctOnly()); + break; + + case COLLAPSE: + _outVals.append(_methods.getCollapse()); + break; + + case CONCAT: + _outVals.append(_methods.getConcat()); + break; + + case FREQ_ASC: + _outVals.append(_methods.getFreqAsc()); + break; + + case FREQ_DESC: + _outVals.append(_methods.getFreqDesc()); + break; + + case FIRST: + _outVals.append(_methods.getFirst()); + break; + + case LAST: + _outVals.append(_methods.getLast()); + break; + + case INVALID: + default: + // Any unrecognized operation should have been handled already in the context validation. + // It's thus unnecessary to handle it here, but throw an error to help us know if future + // refactoring or code changes accidentally bypass the validation phase. + cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl; + break; } - } else { - if (ascOrder) { - sort(_qsArray.begin(), _qsArray.end(), less<QuickString>()); - } else { - sort(_qsArray.begin(), _qsArray.end(), greater<QuickString>()); + //if this isn't the last column, add a tab. + if (i < (int)_colOps.size() -1) { + _outVals.append('\t'); } } + return _outVals; } -void KeyListOps::makeFreqMap() { - _freqMap.clear(); - //make a map of values to their number of times occuring. - for (begin(); !end(); next()) { - _freqMap[getColVal()]++; - } - _freqIter = _freqMap.begin(); -} diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h index e294f535f54355b26b6d2ffcac7e7f0f4f78c38a..3c26d2c20b5ac49d4c608fc6ae7dd98653dfb932 100644 --- a/src/utils/KeyListOps/KeyListOps.h +++ b/src/utils/KeyListOps/KeyListOps.h @@ -1,117 +1,54 @@ /* * KeyListOps.h * - * Created on: Feb 6, 2014 + * Created on: Feb 24, 2014 * Author: nek3d */ #ifndef KEYLISTOPS_H_ #define KEYLISTOPS_H_ -using namespace std; +#include "KeyListOpsMethods.h" -#include <map> -#include <utility> //for pair -#include "QuickString.h" -#include <stdint.h> -#include "RecordKeyList.h" +class FileRecordMgr; class KeyListOps { public: - KeyListOps(); - KeyListOps(RecordKeyList *keyList, int column = 1); - ~KeyListOps(); + KeyListOps(); - void setKeyList(RecordKeyList *keyList) { _keyList = keyList; } - void setColumn(int col) { _column = col; } - void setNullValue(const QuickString & nullVal) { _nullVal = nullVal; } - void setDelimStr(const QuickString &delimStr) { _delimStr = delimStr; } + void setColumns(const QuickString &columns) { _columns = columns; } + void setOperations(const QuickString & operation) { _operations = operation; } + void setNullValue(const QuickString & nullValue) { _methods.setNullValue(nullValue); } + void setDelimStr(const QuickString & delimStr) { _methods.setDelimStr(delimStr); } + void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); } - typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, + typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, DISTINCT_ONLY, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES; - OP_TYPES getOpCode(const QuickString &operation) const; - // return the total of the values in the vector - double getSum(); - // return the average value in the vector - double getMean(); - // return the standard deviation - double getStddev(); - // return the sample standard deviation - double getSampleStddev(); - // return the median value in the vector - double getMedian(); - // return the most common value in the vector - const QuickString &getMode(); - // return the least common value in the vector - const QuickString &getAntiMode(); - // return the minimum element of the vector - double getMin(); - // return the maximum element of the vector - double getMax(); - // return the minimum absolute value of the vector - double getAbsMin(); - // return the maximum absolute value of the vector - double getAbsMax(); - // return the count of element in the vector - uint32_t getCount(); - // return a the count of _unique_ elements in the vector - uint32_t getCountDistinct(); - // return only those elements that occur once - const QuickString &getDistinctOnly(); - // return a delimiter-separated list of elements - const QuickString & getCollapse(const QuickString & delimiter = ","); - // return a concatenation of all elements in the vector - const QuickString & getConcat(); - // return a comma-separated list of the _unique_ elements - const QuickString & getDistinct(); - // return a histogram of values and their freqs. in desc. order of frequency - const QuickString & getFreqDesc(); - // return a histogram of values and their freqs. in asc. order of frequency - const QuickString & getFreqAsc(); - // return the first value in the list - const QuickString & getFirst(); - // return the last value in the list - const QuickString & getLast(); - -private: - RecordKeyList *_keyList; - int _column; - QuickString _nullVal; - QuickString _delimStr; - QuickString _retStr; + bool isValidColumnOps(FileRecordMgr *dbFile); - map<QuickString, OP_TYPES> _opCodes; - RecordKeyList _nullKeyList; //this has to exist just so we can initialize _iter, below. - RecordKeyList::const_iterator_type _iter; + const QuickString &getOpVals(RecordKeyList &hits); - // Some methods need to put values into a vector, mostly for sorting. - vector<double> _numArray; - vector<QuickString> _qsArray; +private: + void init(); - typedef map<QuickString, int> freqMapType; - freqMapType _freqMap; - freqMapType::iterator _freqIter; + QuickString _operations; + QuickString _columns; - typedef enum { UNSORTED, ASC, DESC} SORT_TYPE; + KeyListOpsMethods _methods; + map<QuickString, OP_TYPES> _opCodes; + map<OP_TYPES, bool> _isNumericOp; - typedef multimap<int, QuickString, less<int> > histAscType; - typedef multimap<int, QuickString, greater<int> > histDescType; - void init(); - const QuickString &getColVal(); - double getColValNum(); - bool empty() { return _keyList->empty(); } - void begin() { _iter = _keyList->begin(); } - bool end() { return _iter == _keyList->end(); } - void next() { _iter = _keyList->next(); } - void toArray(bool useNum, SORT_TYPE sortVal = UNSORTED); - void sortArray(bool useNum, bool ascOrder); - void makeFreqMap(); + typedef vector<pair<int, OP_TYPES> > colOpsType; + colOpsType _colOps; + QuickString _outVals; + OP_TYPES getOpCode(const QuickString &operation) const; + bool isNumericOp(OP_TYPES op) const; + bool isNumericOp(const QuickString &op) const; }; - #endif /* KEYLISTOPS_H_ */ diff --git a/src/utils/KeyListOps/KeyListOpsMethods.cpp b/src/utils/KeyListOps/KeyListOpsMethods.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0b0013582ee059f3f564a2788a455b4527ddef46 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOpsMethods.cpp @@ -0,0 +1,368 @@ +/* + * KeyListOpsMethods.cpp + * + * Created on: Feb 6, 2014 + * Author: nek3d + */ + +#include "KeyListOpsMethods.h" +#include <cfloat> +#include <cmath> +#include <algorithm> + +KeyListOpsMethods::KeyListOpsMethods() +: _keyList(&_nullKeyList), + _column(1), + _nullVal("."), + _delimStr(","), + _iter(_nullKeyList.begin()) +{ +} + +KeyListOpsMethods::KeyListOpsMethods(RecordKeyList *keyList, int column) +: _keyList(keyList), + _column(column), + _nullVal("."), + _delimStr(","), + _iter(keyList->begin()) +{ +} + + +KeyListOpsMethods::~KeyListOpsMethods() { + +} + +// return the total of the values in the vector +double KeyListOpsMethods::getSum() { + if (empty()) return NAN; + + double theSum = 0.0; + for (begin(); !end(); next()) { + theSum += getColValNum(); + } + return theSum; +} + +// return the average value in the vector +double KeyListOpsMethods::getMean() { + if (empty()) return NAN; + + return getSum() / (float)getCount(); +} + + + // return the standard deviation +double KeyListOpsMethods::getStddev() { + if (empty()) return NAN; + + double avg = getMean(); + double squareDiffSum = 0.0; + for (begin(); !end(); next()) { + double val = getColValNum(); + double diff = val - avg; + squareDiffSum += diff * diff; + } + return squareDiffSum / (float)getCount(); +} +// return the standard deviation +double KeyListOpsMethods::getSampleStddev() { + if (empty()) return NAN; + + double avg = getMean(); + double squareDiffSum = 0.0; + for (begin(); !end(); next()) { + double val = getColValNum(); + double diff = val - avg; + squareDiffSum += diff * diff; + } + return squareDiffSum / ((float)getCount() - 1.0); +} + +// return the median value in the vector +double KeyListOpsMethods::getMedian() { + if (empty()) return NAN; + + //get sorted vector. if even number of elems, return middle val. + //if odd, average of two. + toArray(true, ASC); + size_t count = getCount(); + if (count % 2) { + //odd number of elements. Take middle one. + return _numArray[count/2]; + } else { + //even numnber of elements. Take average of middle 2. + double sum = _numArray[count/2 -1] + _numArray[count/2]; + return sum / 2.0; + } +} + +// return the most common value in the vector +const QuickString &KeyListOpsMethods::getMode() { + if (empty()) return _nullVal; + + makeFreqMap(); + + //now pass through the freq map and keep track of which key has the highest occurance. + freqMapType::iterator maxIter = _freqMap.begin(); + int maxVal = 0; + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second > maxVal) { + maxIter = _freqIter; + maxVal = _freqIter->second; + } + } + _retStr = maxIter->first; + return _retStr; +} +// return the least common value in the vector +const QuickString &KeyListOpsMethods::getAntiMode() { + if (empty()) return _nullVal; + + makeFreqMap(); + + //now pass through the freq map and keep track of which key has the highest occurance. + freqMapType::iterator minIter = _freqMap.begin(); + int minVal = INT_MAX; + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second < minVal) { + minIter = _freqIter; + minVal = _freqIter->second; + } + } + _retStr = minIter->first; + return _retStr; +} +// return the minimum element of the vector +double KeyListOpsMethods::getMin() { + if (empty()) return NAN; + + double minVal = DBL_MAX; + for (begin(); !end(); next()) { + double currVal = getColValNum(); + minVal = (currVal < minVal) ? currVal : minVal; + } + return minVal; +} + +// return the maximum element of the vector +double KeyListOpsMethods::getMax() { + if (empty()) return NAN; + + double maxVal = DBL_MIN; + for (begin(); !end(); next()) { + double currVal = getColValNum(); + maxVal = (currVal > maxVal) ? currVal : maxVal; + } + return maxVal; +} + +// return the minimum absolute value of the vector +double KeyListOpsMethods::getAbsMin() { + if (empty()) return NAN; + + double minVal = DBL_MAX; + for (begin(); !end(); next()) { + double currVal = abs(getColValNum()); + minVal = (currVal < minVal) ? currVal : minVal; + } + return minVal; +} +// return the maximum absolute value of the vector +double KeyListOpsMethods::getAbsMax() { + if (empty()) return NAN; + + double maxVal = DBL_MIN; + for (begin(); !end(); next()) { + double currVal = abs(getColValNum()); + maxVal = (currVal > maxVal) ? currVal : maxVal; + } + return maxVal; +} +// return the count of element in the vector +uint32_t KeyListOpsMethods::getCount() { + return _keyList->size(); +} +// return a delimited list of the unique elements +const QuickString &KeyListOpsMethods::getDistinct() { + if (empty()) return _nullVal; + // separated list of unique values. If something repeats, only report once. + makeFreqMap(); + _retStr.clear(); + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter != _freqMap.begin()) _retStr += _delimStr; + _retStr.append(_freqIter->first); + } + return _retStr; +} + +const QuickString &KeyListOpsMethods::getDistinctOnly() { + if (empty()) return _nullVal; + + //separated list of only unique values. If item repeats, discard. + makeFreqMap(); + _retStr.clear(); + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second != 1) continue; + if (_freqIter != _freqMap.begin()) _retStr += _delimStr; + _retStr.append(_freqIter->first); + } + return _retStr; +} + +// return a the count of _unique_ elements in the vector +uint32_t KeyListOpsMethods::getCountDistinct() { + if (empty()) return 0; + + makeFreqMap(); + return _freqMap.size(); +} +// return a delimiter-separated list of elements +const QuickString &KeyListOpsMethods::getCollapse(const QuickString &delimiter) { + if (empty()) return _nullVal; + + //just put all items in one big separated list. + _retStr.clear(); + int i=0; + for (begin(); !end(); next()) { + if (i > 0) _retStr += _delimStr; + _retStr.append(getColVal()); + i++; + } + return _retStr; + +} +// return a concatenation of all elements in the vector +const QuickString &KeyListOpsMethods::getConcat() { + if (empty()) return _nullVal; + + //like collapse but w/o commas. Just a true concat of all vals. + //just swap out the delimChar with '' and call collapse, then + //restore the delimChar. + QuickString oldDelimStr(_delimStr); + _delimStr = ""; + getCollapse(); //this will store it's results in the _retStr method. + _delimStr = oldDelimStr; + return _retStr; +} + +// return a histogram of values and their freqs. in desc. order of frequency +const QuickString &KeyListOpsMethods::getFreqDesc() { + if (empty()) return _nullVal; + + //for each uniq val, report # occurances, in desc order. + makeFreqMap(); + //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map. + histDescType hist; + for (; _freqIter != _freqMap.end(); _freqIter++) { + hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); + } + //now iterate through the reverse map we just made and output it's pairs in val:key format. + _retStr.clear(); + for (histDescType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { + if (histIter != hist.begin()) _retStr += _delimStr; + _retStr.append(histIter->second); + _retStr += ":"; + _retStr.append(histIter->first); + } + return _retStr; +} +// return a histogram of values and their freqs. in asc. order of frequency +const QuickString &KeyListOpsMethods::getFreqAsc() { + if (empty()) return _nullVal; + + //for each uniq val, report # occurances, in asc order. + makeFreqMap(); + //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map. + histAscType hist; + for (; _freqIter != _freqMap.end(); _freqIter++) { + hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); +// hist[*(_freqIter->second)] = _freqIter->first; + } + //now iterate through the reverse map we just made and output it's pairs in val:key format. + _retStr.clear(); + for (histAscType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { + if (histIter != hist.begin()) _retStr += _delimStr; + _retStr.append(histIter->second); + _retStr += ":"; + _retStr.append(histIter->first); + } + return _retStr; +} +// return the first value in the list +const QuickString &KeyListOpsMethods::getFirst() { + if (empty()) return _nullVal; + + //just the first item. + begin(); + return getColVal(); +} +// return the last value in the list +const QuickString &KeyListOpsMethods::getLast() { + if (empty()) return _nullVal; + + //just the last item. + begin(); + for (size_t i = 0; i < getCount() -1; i++) { + next(); + } + return getColVal(); +} + +const QuickString &KeyListOpsMethods::getColVal() { + return _iter->value()->getField(_column); +} + +double KeyListOpsMethods::getColValNum() { + return atof(_iter->value()->getField(_column).c_str()); +} + +void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) { + + //TBD: optimize performance with better memory management. + if (useNum) { + _numArray.resize(_keyList->size()); + int i=0; + for (begin(); !end(); next()) { + _numArray[i] = getColValNum(); + i++; + } + } else { + _qsArray.resize(_keyList->size()); + int i=0; + for (begin(); !end(); next()) { + _qsArray[i] = getColVal(); + i++; + } + } + if (sortVal != UNSORTED) { + sortArray(useNum, sortVal == ASC); + } +} + +void KeyListOpsMethods::sortArray(bool useNum, bool ascOrder) +{ + if (useNum) { + if (ascOrder) { + sort(_numArray.begin(), _numArray.end(), less<double>()); + } else { + sort(_numArray.begin(), _numArray.end(), greater<double>()); + } + } else { + if (ascOrder) { + sort(_qsArray.begin(), _qsArray.end(), less<QuickString>()); + } else { + sort(_qsArray.begin(), _qsArray.end(), greater<QuickString>()); + } + } +} + +void KeyListOpsMethods::makeFreqMap() { + _freqMap.clear(); + + //make a map of values to their number of times occuring. + for (begin(); !end(); next()) { + _freqMap[getColVal()]++; + } + _freqIter = _freqMap.begin(); +} diff --git a/src/utils/KeyListOps/KeyListOpsMethods.h b/src/utils/KeyListOps/KeyListOpsMethods.h new file mode 100644 index 0000000000000000000000000000000000000000..0cac9c87a196cd3107dd953e7710e637205d2125 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOpsMethods.h @@ -0,0 +1,113 @@ +/* + * KeyListOpsMethods.h + * + * Created on: Feb 6, 2014 + * Author: nek3d + */ + +#ifndef KEYLISTOPSMETHODS_H_ +#define KEYLISTOPSMETHODS_H_ + +using namespace std; + +#include <map> +#include <utility> //for pair +#include "QuickString.h" +#include <stdint.h> +#include "RecordKeyList.h" + +class KeyListOpsMethods { +public: + KeyListOpsMethods(); + KeyListOpsMethods(RecordKeyList *keyList, int column = 1); + ~KeyListOpsMethods(); + + + void setKeyList(RecordKeyList *keyList) { _keyList = keyList; } + void setColumn(int col) { _column = col; } + void setNullValue(const QuickString & nullVal) { _nullVal = nullVal; } + const QuickString &getNullValue() const { return _nullVal; } + void setDelimStr(const QuickString &delimStr) { _delimStr = delimStr; } + const QuickString &getDelimStr() const { return _delimStr; } + + // return the total of the values in the vector + double getSum(); + // return the average value in the vector + double getMean(); + // return the standard deviation + double getStddev(); + // return the sample standard deviation + double getSampleStddev(); + // return the median value in the vector + double getMedian(); + // return the most common value in the vector + const QuickString &getMode(); + // return the least common value in the vector + const QuickString &getAntiMode(); + // return the minimum element of the vector + double getMin(); + // return the maximum element of the vector + double getMax(); + // return the minimum absolute value of the vector + double getAbsMin(); + // return the maximum absolute value of the vector + double getAbsMax(); + // return the count of element in the vector + uint32_t getCount(); + // return a the count of _unique_ elements in the vector + uint32_t getCountDistinct(); + // return only those elements that occur once + const QuickString &getDistinctOnly(); + // return a delimiter-separated list of elements + const QuickString & getCollapse(const QuickString & delimiter = ","); + // return a concatenation of all elements in the vector + const QuickString & getConcat(); + // return a comma-separated list of the _unique_ elements + const QuickString & getDistinct(); + // return a histogram of values and their freqs. in desc. order of frequency + const QuickString & getFreqDesc(); + // return a histogram of values and their freqs. in asc. order of frequency + const QuickString & getFreqAsc(); + // return the first value in the list + const QuickString & getFirst(); + // return the last value in the list + const QuickString & getLast(); + +private: + RecordKeyList *_keyList; + int _column; + QuickString _nullVal; + QuickString _delimStr; + QuickString _retStr; + + RecordKeyList _nullKeyList; //this has to exist just so we can initialize _iter, below. + RecordKeyList::const_iterator_type _iter; + + // Some methods need to put values into a vector, mostly for sorting. + vector<double> _numArray; + vector<QuickString> _qsArray; + + typedef map<QuickString, int> freqMapType; + freqMapType _freqMap; + freqMapType::iterator _freqIter; + + typedef enum { UNSORTED, ASC, DESC} SORT_TYPE; + + typedef multimap<int, QuickString, less<int> > histAscType; + typedef multimap<int, QuickString, greater<int> > histDescType; + void init(); + const QuickString &getColVal(); + double getColValNum(); + bool empty() { return _keyList->empty(); } + void begin() { _iter = _keyList->begin(); } + bool end() { return _iter == _keyList->end(); } + void next() { _iter = _keyList->next(); } + void toArray(bool useNum, SORT_TYPE sortVal = UNSORTED); + void sortArray(bool useNum, bool ascOrder); + void makeFreqMap(); + + +}; + + +#endif /* KEYLISTOPSMETHODS_H_ */ diff --git a/src/utils/KeyListOps/Makefile b/src/utils/KeyListOps/Makefile index 1797c83985ea50a7bec3979819b27cbd11d0b742..0b0ac991d175b7bcf24999bed1131a6a2aed2b4a 100644 --- a/src/utils/KeyListOps/Makefile +++ b/src/utils/KeyListOps/Makefile @@ -6,7 +6,6 @@ UTILITIES_DIR = ../../utils/ # ------------------- INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/fileType/ \ - -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ @@ -19,21 +18,26 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= KeyListOps.cpp KeyListOps.h -OBJECTS= KeyListOps.o +SOURCES= KeyListOps.cpp KeyListOps.h KeyListOpsMethods.cpp KeyListOpsMethods.h +OBJECTS= KeyListOps.o KeyListOpsMethods.o _EXT_OBJECTS= EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +all: $(BUILT_OBJECTS) + +.PHONY: all + $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(INCLUDES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/KeyListOps.o + @rm -f $(OBJ_DIR)/KeyListOps.o $(OBJ_DIR)/KeyListOpsMethods.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile index 8f4d93105b30488d40a10599c2d1bd0b083576cf..34fc5d1234a527b470e7b606faa87c0adaf9f0e8 100644 --- a/src/utils/NewChromsweep/Makefile +++ b/src/utils/NewChromsweep/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/RecordOutputMgr/Makefile b/src/utils/RecordOutputMgr/Makefile index 2d196ec11518d10b6f57877c77bc35b7b9063446..346a5c7caf02bb7855164d671221801308e97d9e 100644 --- a/src/utils/RecordOutputMgr/Makefile +++ b/src/utils/RecordOutputMgr/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/test/map/test-map.sh b/test/map/test-map.sh index f62d364207c2612a368a30a513f0e5c9ff1462c6..d9ecbaca890ad1c25b567b97b8c42a0c9cb26604 100644 --- a/test/map/test-map.sh +++ b/test/map/test-map.sh @@ -656,7 +656,7 @@ rm obs exp # Test that Bam database is not allowed ############################################################ echo " map.t44...\c" -echo -e "\n*****\n***** ERROR: BAM database file not currently supported for the map tool." > exp +echo -e "\n*****\n***** ERROR: BAM database file not currently supported for column operations." > exp $BT map -a ivls.bed -b values.bam 2> obs check obs exp rm obs exp @@ -671,3 +671,122 @@ echo "chr1 0 50 three_blocks_match 15 + 0 0 0 3 10,10,10, 0,20,40, ." > exp $BT map -o sum -a three_blocks_match.bed -b three_blocks_nomatch.bed -split > obs check obs exp rm obs exp + + + + + + +########################################################### +# +# +# Tests for multiple columns and operations +# +# +############################################################ + + +########################################################### +# Test that error is given when ops outnumber columns +############################################################ +echo " map.t46...\c" +echo \ +" +***** +***** ERROR: There are 1 columns given, but there are 2 operations." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that error is given when columns outnumber ops, +# if there are two or more ops. +############################################################ +echo " map.t47...\c" +echo \ +" +***** +***** ERROR: There are 3 columns given, but there are 2 operations." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -c 5,1,2 -o count,sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that numeric ops for non-numeric columns aren't allowed +############################################################ +echo " map.t48...\c" +echo \ +" +***** +***** ERROR: Column 1 is not a numeric field for database file values.bed." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -c 1 -o sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that multiple columns are allowed with a +# single operation +############################################################ +echo " map.t49...\c" +echo \ +"chr1 0 100 65 9 +chr1 100 200 1 7 +chr2 0 100 . . +chr2 100 200 . . +chr3 0 100 6 7 +chr3 100 200 8 23" > exp +../../bin/bedtools map -a ivls.bed -b values4.bed -c 5,7 -o sum > obs +check obs exp +rm obs exp + + +########################################################### +# Test that multiple columns are allowed with an +# equal number of ops that aren't all the same +############################################################ +echo " map.t50...\c" +echo \ +"chr1 0 100 13.5 65 9 +chr1 100 200 120 1 7 +chr2 0 100 . . . +chr2 100 200 . . . +chr3 0 100 10 6 7 +chr3 100 200 120 8 23" > exp +../../bin/bedtools map -a ivls.bed -b values4.bed -c 2,5,7 -o mean,sum,sum > obs +check obs exp +rm obs exp + + +########################################################### +# Test stddev +############################################################ +echo " map.t51...\c" +echo \ +"chr1 0 100 12.9167 +chr1 100 200 0 +chr2 0 100 . +chr2 100 200 . +chr3 0 100 76.2222 +chr3 100 200 0.25" > exp +../../bin/bedtools map -a ivls.bed -b values4.bed -c 7 -o stddev > obs +check obs exp +rm obs exp + +########################################################### +# Test sample_stddev +############################################################ +echo " map.t52...\c" +echo \ +"chr1 0 100 15.5 +chr1 100 200 . +chr2 0 100 . +chr2 100 200 . +chr3 0 100 114.333 +chr3 100 200 0.5" > exp +../../bin/bedtools map -a ivls.bed -b values4.bed -c 7 -o sample_stddev > obs +check obs exp +rm obs exp +