From 2268ee4fd792e109f2af5477c5e1b6208b5cba20 Mon Sep 17 00:00:00 2001 From: nkindlon <nek3d@virginia.edu> Date: Thu, 27 Feb 2014 23:45:48 -0500 Subject: [PATCH] Refactored KeyListOps, Context, mapFile for KeyListOps re-usability by other tools. --- src/intersectFile/Makefile | 1 + src/mapFile/Makefile | 1 + src/mapFile/mapFile.cpp | 67 +--- src/mapFile/mapFile.h | 85 +--- src/mapFile/mapMain.cpp | 138 ------- src/nekSandbox1/Makefile | 1 + src/regressTest/Makefile | 1 + src/sampleFile/Makefile | 1 + src/utils/BinTree/Makefile | 1 + src/utils/Contexts/ContextBase.cpp | 120 +++++- src/utils/Contexts/ContextBase.h | 21 +- src/utils/Contexts/ContextIntersect.h | 2 + src/utils/Contexts/ContextMap.cpp | 93 +---- src/utils/Contexts/ContextMap.h | 20 +- src/utils/Contexts/Makefile | 1 + .../FileRecordTools/Records/BamRecord.cpp | 5 + src/utils/FileRecordTools/Records/BamRecord.h | 1 + .../FileRecordTools/Records/Bed12Interval.cpp | 26 ++ .../FileRecordTools/Records/Bed12Interval.h | 1 + .../FileRecordTools/Records/Bed3Interval.cpp | 20 + .../FileRecordTools/Records/Bed3Interval.h | 2 + .../FileRecordTools/Records/Bed4Interval.cpp | 5 + .../FileRecordTools/Records/Bed4Interval.h | 1 + .../FileRecordTools/Records/Bed5Interval.cpp | 13 + .../FileRecordTools/Records/Bed5Interval.h | 1 + .../FileRecordTools/Records/Bed6Interval.cpp | 17 + .../FileRecordTools/Records/Bed6Interval.h | 1 + .../Records/BedGraphInterval.cpp | 11 + .../Records/BedGraphInterval.h | 1 + .../Records/BedPlusInterval.cpp | 15 + .../FileRecordTools/Records/BedPlusInterval.h | 2 + .../FileRecordTools/Records/GffRecord.cpp | 36 ++ src/utils/FileRecordTools/Records/GffRecord.h | 1 + src/utils/FileRecordTools/Records/Record.cpp | 6 +- src/utils/FileRecordTools/Records/Record.h | 2 + src/utils/GenomeFile/Makefile | 1 + src/utils/KeyListOps/KeyListOps.cpp | 364 +++++++++++++++++ src/utils/KeyListOps/KeyListOps.h | 54 +++ src/utils/KeyListOps/KeyListOpsMethods.cpp | 368 ++++++++++++++++++ 
src/utils/KeyListOps/KeyListOpsMethods.h | 113 ++++++ src/utils/KeyListOps/Makefile | 43 ++ src/utils/NewChromsweep/Makefile | 1 + src/utils/RecordOutputMgr/Makefile | 1 + src/utils/general/Makefile | 2 +- src/utils/general/QuickString.cpp | 65 ++++ src/utils/general/QuickString.h | 19 + test/map/test-map.sh | 97 ++++- 47 files changed, 1434 insertions(+), 414 deletions(-) create mode 100644 src/utils/KeyListOps/KeyListOps.cpp create mode 100644 src/utils/KeyListOps/KeyListOps.h create mode 100644 src/utils/KeyListOps/KeyListOpsMethods.cpp create mode 100644 src/utils/KeyListOps/KeyListOpsMethods.h create mode 100644 src/utils/KeyListOps/Makefile diff --git a/src/intersectFile/Makefile b/src/intersectFile/Makefile index e265b334..8c81049e 100644 --- a/src/intersectFile/Makefile +++ b/src/intersectFile/Makefile @@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/RecordOutputMgr/ \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/BinTree \ diff --git a/src/mapFile/Makefile b/src/mapFile/Makefile index 17bb42df..86282427 100644 --- a/src/mapFile/Makefile +++ b/src/mapFile/Makefile @@ -29,6 +29,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ -I$(UTILITIES_DIR)/RecordOutputMgr/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/VectorOps \ -I$(UTILITIES_DIR)/BinTree \ diff --git a/src/mapFile/mapFile.cpp b/src/mapFile/mapFile.cpp index 88dcc26f..8dbf24ad 100644 --- a/src/mapFile/mapFile.cpp +++ b/src/mapFile/mapFile.cpp @@ -47,74 +47,11 @@ bool FileMap::mapFiles() RecordKeyList keySet(hitSet.getKey()); RecordKeyList resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); - SummarizeHits(resultSet); - 
_recordOutputMgr->printRecord(resultSet.getKey(), _output); + _recordOutputMgr->printRecord(resultSet.getKey(), _context->getColumnOpsVal(resultSet)); } else { - SummarizeHits(hitSet); - _recordOutputMgr->printRecord(hitSet.getKey(), _output); + _recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet)); } } return true; } -void FileMap::ExtractColumnFromHits(RecordKeyList &hits) { - _column_vec.clear(); - RecordKeyList::const_iterator_type iter = hits.begin(); - for (; iter != hits.end(); iter = hits.next()) - { - _column_vec.push_back(iter->value()->getField(_context->getColumn()).str()); - } -} - -void FileMap::SummarizeHits(RecordKeyList &hits) { - - const QuickString & operation = _context->getColumnOperation(); - _output.clear(); - - if (hits.size() == 0) { - if (operation == "count" || operation == "count_distinct") - _output.append("0"); - else - _output.append(_context->getNullValue().str()); - return; - } - - _tmp_output.str(""); - _tmp_output.clear(); - - ExtractColumnFromHits(hits); - - VectorOps vo(_column_vec); - if (operation == "sum") - _tmp_output << setprecision (PRECISION) << vo.GetSum(); - else if (operation == "mean") - _tmp_output << setprecision (PRECISION) << vo.GetMean(); - else if (operation == "median") - _tmp_output << setprecision (PRECISION) << vo.GetMedian(); - else if (operation == "min") - _tmp_output << setprecision (PRECISION) << vo.GetMin(); - else if (operation == "max") - _tmp_output << setprecision (PRECISION) << vo.GetMax(); - else if (operation == "absmin") - _tmp_output << setprecision (PRECISION) << vo.GetAbsMin(); - else if (operation == "absmax") - _tmp_output << setprecision (PRECISION) << vo.GetAbsMax(); - else if (operation == "mode") - _tmp_output << vo.GetMode(); - else if (operation == "antimode") - _tmp_output << vo.GetAntiMode(); - else if (operation == "count") - _tmp_output << setprecision (PRECISION) << vo.GetCount(); - else if (operation == "count_distinct") - _tmp_output << 
setprecision (PRECISION) << vo.GetCountDistinct(); - else if (operation == "collapse") - _tmp_output << vo.GetCollapse(); - else if (operation == "distinct") - _tmp_output << vo.GetDistinct(); - else { - cerr << "ERROR: " << operation << " is an unrecognized operation\n"; - exit(1); - } - _output.append(_tmp_output.str()); - -} diff --git a/src/mapFile/mapFile.h b/src/mapFile/mapFile.h index cb1da082..fbb431ac 100644 --- a/src/mapFile/mapFile.h +++ b/src/mapFile/mapFile.h @@ -18,10 +18,11 @@ using namespace std; #include <iomanip> #include "VectorOps.h" #include "RecordKeyList.h" +#include "KeyListOps.h" +#include "ContextMap.h" using namespace std; -class ContextMap; class BlockMgr; class RecordOutputMgr; @@ -35,90 +36,8 @@ public: private: ContextMap *_context; - Record *_queryRec; - Record *_databaseRec; BlockMgr *_blockMgr; RecordOutputMgr *_recordOutputMgr; - - vector<string> _column_vec; // vector to hold current column's worth of data - - ostringstream _tmp_output; - QuickString _output; // placeholder for the results of mapping B to each a in A. 
- //------------------------------------------------ - // private methods - //------------------------------------------------ - void Map(); - void SummarizeHits(RecordKeyList &hits); - void ExtractColumnFromHits(RecordKeyList &hits); - }; #endif /* MAPFILE_H */ - - -/* -#include "bedFile.h" -#include "chromsweep.h" -#include "VectorOps.h" -#include "api/BamReader.h" -#include "api/BamWriter.h" -#include "api/BamAux.h" -#include "BamAncillary.h" -using namespace BamTools; - - -#include <vector> -#include <iostream> -#include <algorithm> -#include <numeric> -#include <fstream> -#include <iomanip> -#include <stdlib.h> -using namespace std; - - - -class BedMap { - -public: - - // constructor - BedMap(string bedAFile, string bedBFile, int column, string operation, - float overlapFraction, bool sameStrand, - bool diffStrand, bool reciprocal, - bool choseNullValue, string nullValue, - bool printHeader); - - // destructor - ~BedMap(void); - -private: - - //------------------------------------------------ - // private attributes - //------------------------------------------------ - string _bedAFile; - string _bedBFile; - int _column; - string _operation; - bool _sameStrand; - bool _diffStrand; - bool _reciprocal; - float _overlapFraction; - string _nullValue; - bool _printHeader; - - // instance of a bed file class. - BedFile *_bedA, *_bedB; - - vector<string> _column_vec; // vector to hold current column's worth of data - - //------------------------------------------------ - // private methods - //------------------------------------------------ - void Map(); - string MapHits(const BED &a, const vector<BED> &hits); - void ExtractColumnFromHits(const vector<BED> &hits); -}; -*/ -//#endif /* MAPFILE_H */ diff --git a/src/mapFile/mapMain.cpp b/src/mapFile/mapMain.cpp index a9eeb369..f08e56b3 100644 --- a/src/mapFile/mapMain.cpp +++ b/src/mapFile/mapMain.cpp @@ -38,144 +38,6 @@ int map_main(int argc, char* argv[]) { return retVal ? 
0 : 1; } - -/* -int map_main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - - // input files - string bedAFile; - string bedBFile; - int column = 5; - string operation = "sum"; - string nullValue = "."; - - // input arguments - float overlapFraction = 1E-9; - - bool haveBedA = false; - bool haveBedB = false; - bool haveColumn = false; - bool haveOperation = false; - bool haveFraction = false; - bool reciprocalFraction = false; - bool sameStrand = false; - bool diffStrand = false; - bool printHeader = false; - bool choseNullValue = false; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) map_help(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-a", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedA = true; - bedAFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-b", 2, parameterLength)) { - if ((i+1) < argc) { - haveBedB = true; - bedBFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-c", 2, parameterLength)) { - if ((i+1) < argc) { - haveColumn = true; - column = atoi(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-o", 2, parameterLength)) { - if ((i+1) < argc) { - haveOperation = true; - operation = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - if ((i+1) < argc) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - } - else if(PARAMETER_CHECK("-r", 2, parameterLength)) { - reciprocalFraction = true; - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - sameStrand = true; - } - else if (PARAMETER_CHECK("-S", 2, parameterLength)) 
{ - diffStrand = true; - } - else if (PARAMETER_CHECK("-null", 5, parameterLength)) { - nullValue = argv[i + 1]; - choseNullValue = true; - i++; - } - else if(PARAMETER_CHECK("-header", 7, parameterLength)) { - printHeader = true; - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBedA || !haveBedB) { - cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; - showHelp = true; - } - - if (reciprocalFraction && !haveFraction) { - cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; - showHelp = true; - } - - if (sameStrand && diffStrand) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - BedMap *bm = new BedMap(bedAFile, bedBFile, column, operation, - overlapFraction, sameStrand, - diffStrand, reciprocalFraction, - choseNullValue, nullValue, - printHeader); - delete bm; - return 0; - } - else { - map_help(); - return 0; - } -} -*/ - void map_help(void) { cerr << "\nTool: bedtools map (aka mapBed)" << endl; diff --git a/src/nekSandbox1/Makefile b/src/nekSandbox1/Makefile index fbe6d861..df8aba72 100644 --- a/src/nekSandbox1/Makefile +++ b/src/nekSandbox1/Makefile @@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders \ -I$(UTILITIES_DIR)/FileRecordTools/Records \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/general \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/GenomeFile/ \ diff --git a/src/regressTest/Makefile b/src/regressTest/Makefile index e9ceebf3..8ffeeab2 100644 --- a/src/regressTest/Makefile +++ b/src/regressTest/Makefile @@ -18,6 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ 
-I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders \ -I$(UTILITIES_DIR)/FileRecordTools/Records \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/general # ---------------------------------- diff --git a/src/sampleFile/Makefile b/src/sampleFile/Makefile index 2042291e..9ccbe5a9 100644 --- a/src/sampleFile/Makefile +++ b/src/sampleFile/Makefile @@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/RecordOutputMgr/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile index de04c816..c29b5eba 100644 --- a/src/utils/BinTree/Makefile +++ b/src/utils/BinTree/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index cd30b203..adbc47af 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -52,20 +52,16 @@ ContextBase::ContextBase() _hasConstantSeed(false), _seed(0), _forwardOnly(false), - _reverseOnly(false) + _reverseOnly(false), + _hasColumnOpsMethods(false) { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; _programNames["map"] = MAP; - _validScoreOps.insert("sum"); - _validScoreOps.insert("max"); - _validScoreOps.insert("min"); - _validScoreOps.insert("mean"); - _validScoreOps.insert("mode"); - _validScoreOps.insert("median"); - _validScoreOps.insert("antimode"); - _validScoreOps.insert("collapse"); + if (hasColumnOpsMethods()) { + _keyListOps = new KeyListOps(); + } } 
ContextBase::~ContextBase() @@ -79,6 +75,11 @@ ContextBase::~ContextBase() delete _files[i]; _files[i] = NULL; } + if (hasColumnOpsMethods()) { + delete _keyListOps; + _keyListOps = NULL; + } + } bool ContextBase::determineOutputType() { @@ -176,6 +177,19 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if (strcmp(_argv[_i], "-seed") == 0) { if (!handle_seed()) return false; } + else if (strcmp(_argv[_i], "-o") == 0) { + if (!handle_o()) return false; + } + else if (strcmp(_argv[_i], "-c") == 0) { + if (!handle_c()) return false; + } + else if (strcmp(_argv[_i], "-null") == 0) { + if (!handle_null()) return false; + } + else if (strcmp(_argv[_i], "-delim") == 0) { + if (!handle_delim()) return false; + } + } return true; } @@ -191,6 +205,12 @@ bool ContextBase::isValidState() if (!determineOutputType()) { return false; } + if (hasColumnOpsMethods()) { + FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0); + if (!_keyListOps->isValidColumnOps(dbFile)) { + return false; + } + } return true; } @@ -363,3 +383,85 @@ bool ContextBase::handle_ubam() markUsed(_i - _skipFirstArgs); return true; } + + +// Methods specific to column operations. +// for col ops, -c is the string of columns upon which to operate +bool ContextBase::handle_c() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setColumns(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + + +// for col ops, -o is the string of operations to apply to the columns (-c) +bool ContextBase::handle_o() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setOperations(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + + +// for col ops, -null is a NULL vakue assigned +// when no overlaps are detected. 
+bool ContextBase::handle_null() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setNullValue(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + +//for col ops, delimStr will appear between each item in +//a collapsed but delimited list. +bool ContextBase::handle_delim() +{ + if (!hasColumnOpsMethods()) { + return false; + } + if ((_i+1) < _argc) { + _keyListOps->setDelimStr(_argv[_i + 1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } + return true; +} + +void ContextBase::setColumnOpsMethods(bool val) +{ + _hasColumnOpsMethods = val; + if (val) { + _keyListOps = new KeyListOps(); + } +} + +const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { + if (!hasColumnOpsMethods()) { + return _nullStr; + } + return _keyListOps->getOpVals(keyList); +} + diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index 872193fd..b4bf1227 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -24,6 +24,7 @@ #include "NewGenomeFile.h" #include "api/BamReader.h" #include "api/BamAux.h" +#include "KeyListOps.h" class ContextBase { @@ -144,6 +145,13 @@ public: //methods. virtual bool hasIntersectMethods() const { return false; } + // determine whether column operations like those used in map + // are available. + void setColumnOpsMethods(bool val); + virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; } + const QuickString &getColumnOpsVal(RecordKeyList &keyList) const; + //methods applicable only to column operations. 
+ protected: PROGRAM_TYPE _program; @@ -191,15 +199,11 @@ protected: int _bamHeaderAndRefIdx; int _maxNumDatabaseFields; bool _useFullBamTags; - QuickString _columnOperation; - int _column; - QuickString _nullValue; bool _reportCount; int _maxDistance; bool _reportNames; bool _reportScores; QuickString _scoreOp; - set<QuickString> _validScoreOps; int _numOutputRecords; @@ -208,6 +212,10 @@ protected: bool _forwardOnly; bool _reverseOnly; + bool _hasColumnOpsMethods; + KeyListOps *_keyListOps; + QuickString _nullStr; //placeholder return value when col ops aren't valid. + void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); @@ -231,6 +239,11 @@ protected: virtual bool handle_split(); virtual bool handle_sorted(); virtual bool handle_ubam(); + + virtual bool handle_c(); + virtual bool handle_o(); + virtual bool handle_null(); + virtual bool handle_delim(); }; #endif /* CONTEXTBASE_H_ */ diff --git a/src/utils/Contexts/ContextIntersect.h b/src/utils/Contexts/ContextIntersect.h index 0144a121..b066e946 100644 --- a/src/utils/Contexts/ContextIntersect.h +++ b/src/utils/Contexts/ContextIntersect.h @@ -21,6 +21,8 @@ public: //NOTE: Query and database files will only be marked as such by either the //parseCmdArgs method, or by explicitly setting them. 
+ FileRecordMgr *getQueryFile() { return getFile(_queryFileIdx); } + FileRecordMgr *getDatabaseFile() { return getFile(_databaseFileIdx); } int getQueryFileIdx() const { return _queryFileIdx; } void setQueryFileIdx(int idx) { _queryFileIdx = idx; } int getDatabaseFileIdx() const { return _databaseFileIdx; } diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp index d94d0888..e3f82417 100644 --- a/src/utils/Contexts/ContextMap.cpp +++ b/src/utils/Contexts/ContextMap.cpp @@ -12,13 +12,7 @@ ContextMap::ContextMap() // map requires sorted input setSortedInput(true); setLeftJoin(true); - - // default to BED score column - setColumn(5); - // default to "sum" - setColumnOperation("sum"); - // default to "." as a NULL value - setNullValue('.'); + setColumnOpsMethods(true); } ContextMap::~ContextMap() @@ -44,75 +38,22 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { if (isUsed(_i - _skipFirstArgs)) { continue; } - else if (strcmp(_argv[_i], "-o") == 0) { - if (!handle_o()) return false; - } - else if (strcmp(_argv[_i], "-c") == 0) { - if (!handle_c()) return false; - } - else if (strcmp(_argv[_i], "-null") == 0) { - if (!handle_null()) return false; - } - } - return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); -} - + if (strcmp(_argv[_i], "-c") == 0) { + //bypass intersect's use of the -c option, because -c + //means writeCount for intersect, but means columns for map. + if (!ContextBase::handle_c()) return false; + } -bool ContextMap::isValidState() -{ - if (!ContextIntersect::isValidState()) { - return false; } - - if (getDatabaseFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { - //throw Error - cerr << endl << "*****" - << endl - << "***** ERROR: BAM database file not currently supported for the map tool." - << endl; - exit(1); - } - // TODO - // enforce any specific checks for Map. 
- return true; -} - - -// for map, -c is the column upon which to operate -bool ContextMap::handle_c() -{ - if ((_i+1) < _argc) { - setColumn(atoi(_argv[_i + 1])); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} - - -// for map, -o is the operation to apply to the column (-c) -bool ContextMap::handle_o() -{ - if ((_i+1) < _argc) { - setColumnOperation(_argv[_i + 1]); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; -} - - -// for map, -null is a NULL vakue assigned -// when no overlaps are detected. -bool ContextMap::handle_null() -{ - if ((_i+1) < _argc) { - setNullValue(_argv[_i + 1]); - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); - } - return true; + return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); } +// +// +//bool ContextMap::isValidState() +//{ +// if (!ContextIntersect::isValidState()) { +// return false; +// } +//} +// +// diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index b8ee57fd..9b7280e5 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -9,30 +9,20 @@ #define CONTEXTMAP_H_ #include "ContextIntersect.h" +#include "KeyListOps.h" class ContextMap : public ContextIntersect { public: ContextMap(); virtual ~ContextMap(); - virtual bool isValidState(); - +// virtual bool isValidState(); +// virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); - - int getColumn() const { return _column; } - void setColumn(int column) { _column = column; } - - const QuickString & getColumnOperation() const { return _columnOperation; } - void setColumnOperation(const QuickString & operation) { _columnOperation = operation; } - - const QuickString & getNullValue() const { return _nullValue; } - void setNullValue(const QuickString & nullValue) { _nullValue = nullValue; } - +// virtual bool hasIntersectMethods() const { return true; } +// private: - virtual bool 
handle_c(); - virtual bool handle_o(); - virtual bool handle_null(); }; diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index 7ddc3c6c..4b2ed429 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -9,6 +9,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp index 4c5cd8dc..f939fefb 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.cpp +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -172,5 +172,10 @@ const QuickString &BamRecord::getField(int fieldNum) const return Bed6Interval::getField(fieldNum); } +bool BamRecord::isNumericField(int fieldNum) { + + //TBD: As with getField, this isn't defined for BAM. + return (fieldNum > 6 ? 
false : Bed6Interval::isNumericField(fieldNum)); +} diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h index b74dbc2c..022ecb4d 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.h +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -40,6 +40,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 12; } + static bool isNumericField(int fieldNum); protected: BamTools::BamAlignment _bamAlignment; diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.cpp b/src/utils/FileRecordTools/Records/Bed12Interval.cpp index 867a69ec..0a5a092f 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed12Interval.cpp @@ -146,3 +146,29 @@ const QuickString &Bed12Interval::getField(int fieldNum) const } } +bool Bed12Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 7: + return true; + break; + case 8: + return true; + break; + case 9: + return false; + break; + case 10: + return true; + break; + case 11: + return false; + break; + case 12: + return false; + break; + default: + return Bed6Interval::isNumericField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.h b/src/utils/FileRecordTools/Records/Bed12Interval.h index 711800c3..ffa89f90 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.h +++ b/src/utils/FileRecordTools/Records/Bed12Interval.h @@ -54,6 +54,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 12; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp index 3f896be5..e31e43ea 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -79,3 +79,23 @@ const QuickString 
&Bed3Interval::getField(int fieldNum) const break; } } + +bool Bed3Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 1: + return false; //chrom + break; + case 2: + return true; //startPos + break; + case 3: + return true; //endPos + break; + default: + cerr << endl << "*****" << endl + << "*****ERROR: requested invalid column " << fieldNum << ". Exiting." << endl + << endl << "*****" << endl; + exit(1); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h index 9f1ff118..93377a0a 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.h +++ b/src/utils/FileRecordTools/Records/Bed3Interval.h @@ -32,6 +32,8 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 3; } + static bool isNumericField(int fieldNum); + protected: virtual ~Bed3Interval(); diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp index c1ef81a3..27ca9f7f 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -60,3 +60,8 @@ const QuickString &Bed4Interval::getField(int fieldNum) const } } +bool Bed4Interval::isNumericField(int fieldNum) { + return (fieldNum == 4 ? 
false : Bed3Interval::isNumericField(fieldNum)); +} + + diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h index f42817c4..b0384464 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.h +++ b/src/utils/FileRecordTools/Records/Bed4Interval.h @@ -28,6 +28,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 4; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp index 7307fb66..130a788d 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp @@ -70,3 +70,16 @@ const QuickString &Bed5Interval::getField(int fieldNum) const break; } } + +bool Bed5Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return false; + break; + case 5: + return true; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + } +} diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h index bc913d1d..2064d354 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.h +++ b/src/utils/FileRecordTools/Records/Bed5Interval.h @@ -27,6 +27,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 5; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp index 8371553a..5bc783c7 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp @@ -81,3 +81,20 @@ const QuickString &Bed6Interval::getField(int fieldNum) const break; } } + +bool Bed6Interval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return false; + break; + case 5: + return true; + 
break; + case 6: + return false; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h index 9ad9f80b..023683fe 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.h +++ b/src/utils/FileRecordTools/Records/Bed6Interval.h @@ -27,6 +27,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 6; } + static bool isNumericField(int fieldNum); protected: diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp index e0808573..9cfda480 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp @@ -60,3 +60,14 @@ const QuickString &BedGraphInterval::getField(int fieldNum) const } } +bool BedGraphInterval::isNumericField(int fieldNum) { + switch (fieldNum) { + case 4: + return true; + break; + default: + return Bed3Interval::isNumericField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h index 1bdf619a..5db6feaf 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.h +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h @@ -28,6 +28,7 @@ public: virtual const QuickString &getField(int fieldNum) const; virtual int getNumFields() const { return 4; } + static bool isNumericField(int fieldNum); protected: virtual ~BedGraphInterval(); diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp index fc8be368..5819b863 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp @@ -117,3 +117,18 @@ const QuickString &BedPlusInterval::getField(int fieldNum) const } return 
Bed6Interval::getField(fieldNum); } + +bool BedPlusInterval::isNumericField(int fieldNum) { + + // + // TBD: There is currently no good way to guarantee / enforce whether + // fields after the 6th are numeric, so for now we'll give the user the + // benefit of the doubt on those. + // + if (fieldNum > startOtherIdx) { + return true; + } else { + return Bed6Interval::isNumericField(fieldNum); + } +} + diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h index 4b98b4f3..077ed936 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.h +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h @@ -38,6 +38,8 @@ public: virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setNumPrintFields(int num) { _numPrintFields = num; } virtual int getNumPrintFields() const { return _numPrintFields; } + static bool isNumericField(int fieldNum); + protected: virtual ~BedPlusInterval(); diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp index a91ce159..21cea1da 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.cpp +++ b/src/utils/FileRecordTools/Records/GffRecord.cpp @@ -156,4 +156,40 @@ const QuickString &GffRecord::getField(int fieldNum) const } } +bool GffRecord::isNumericField(int fieldNum) { + switch (fieldNum) { + case 1: + return false; + break; + case 2: + return false; + break; + case 3: + return false; + break; + case 4: + return true; + break; + case 5: + return true; + break; + case 6: + return true; + break; + case 7: + return false; + break; + case 8: + return false; + break; + case 9: + return false; + break; + default: + return Bed6Interval::isNumericField(fieldNum); + break; + } + +} + + diff --git a/src/utils/FileRecordTools/Records/GffRecord.h b/src/utils/FileRecordTools/Records/GffRecord.h index b84d96a7..e675542f 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.h +++
b/src/utils/FileRecordTools/Records/GffRecord.h @@ -34,6 +34,7 @@ public: //Note: using the assignment operator in a GffRecord can potentially be a performance hit, //if the number of fields frequently differ between this object and the one being copied. const GffRecord &operator=(const GffRecord &other); + static bool isNumericField(int fieldNum); protected: virtual ~GffRecord(); diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 2beb4dca..89544ed2 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -187,9 +187,9 @@ void Record::undoZeroLength() ostream &operator << (ostream &out, const Record &record) { - QuickString errBuf; - record.print(errBuf); - out << errBuf; + QuickString outBuf; + record.print(outBuf); + out << outBuf; return out; } diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 2c303d90..d8071c1e 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -129,6 +129,8 @@ public: virtual bool sameChromIntersects(const Record *otherRecord, bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) const; +// virtual static bool isNumericField(int fieldNum) const = 0; + protected: virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record. 
diff --git a/src/utils/GenomeFile/Makefile b/src/utils/GenomeFile/Makefile index afaeccd4..fd17d299 100644 --- a/src/utils/GenomeFile/Makefile +++ b/src/utils/GenomeFile/Makefile @@ -6,6 +6,7 @@ UTILITIES_DIR = ../ # ------------------- INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include/ # ---------------------------------- diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp new file mode 100644 index 00000000..65763500 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -0,0 +1,367 @@ +/* + * KeyListOps.cpp + * + * Created on: Feb 24, 2014 + * Author: nek3d + */ +#include "KeyListOps.h" +#include "FileRecordMgr.h" +#include <cmath> //for isnan + +KeyListOps::KeyListOps() { + _opCodes["sum"] = SUM; + _opCodes["mean"] = MEAN; + _opCodes["stddev"] = STDDEV; + _opCodes["sample_stddev"] = SAMPLE_STDDEV; + _opCodes["median"] = MEDIAN; + _opCodes["mode"] = MODE; + _opCodes["antimode"] = ANTIMODE; + _opCodes["min"] = MIN; + _opCodes["max"] = MAX; + _opCodes["absmin"] = ABSMIN; + _opCodes["absmax"] = ABSMAX; + _opCodes["count"] = COUNT; + _opCodes["distinct"] = DISTINCT; + _opCodes["count_distinct"] = COUNT_DISTINCT; + _opCodes["distinct_only"] = DISTINCT_ONLY; + _opCodes["collapse"] = COLLAPSE; + _opCodes["concat"] = CONCAT; + _opCodes["freq_asc"] = FREQ_ASC; + _opCodes["freq_desc"] = FREQ_DESC; + _opCodes["first"] = FIRST; + _opCodes["last"] = LAST; + + //ops absent from this map are treated as non-numeric by isNumericOp(), so every numeric op must be listed. + _isNumericOp[SUM] = true; + _isNumericOp[MEAN] = true; + _isNumericOp[STDDEV] = true; + _isNumericOp[SAMPLE_STDDEV] = true; + _isNumericOp[MEDIAN] = true; + _isNumericOp[MODE] = false; + _isNumericOp[ANTIMODE] = false; + _isNumericOp[MIN] = true; + _isNumericOp[MAX] = true; + _isNumericOp[ABSMIN] = true; + _isNumericOp[ABSMAX] = true; + _isNumericOp[COUNT] = false; + _isNumericOp[DISTINCT] = false; + _isNumericOp[COUNT_DISTINCT] = false; + _isNumericOp[DISTINCT_ONLY] = false; + _isNumericOp[COLLAPSE] = false; + _isNumericOp[CONCAT] = false; +
_isNumericOp[FREQ_ASC] = false; + _isNumericOp[FREQ_DESC] = false; + _isNumericOp[FIRST] = false; + _isNumericOp[LAST] = false; + + _methods.setDelimStr(","); + _methods.setNullValue("."); + + // default to BED score column + _columns = "5"; + // default to "sum" + _operations = "sum"; + +} + +bool KeyListOps::isNumericOp(OP_TYPES op) const { + map<OP_TYPES, bool>::const_iterator iter = _isNumericOp.find(op); + return (iter == _isNumericOp.end() ? false : iter->second); +} + +bool KeyListOps::isNumericOp(const QuickString &op) const { + return isNumericOp(getOpCode(op)); +} + +KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const { + //If the operation does not exist, return INVALID. + //otherwise, return code for given operation. + map<QuickString, OP_TYPES>::const_iterator iter = _opCodes.find(operation); + if (iter == _opCodes.end()) { + return INVALID; + } + return iter->second; +} + + +bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { + + if (dbFile->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + //throw Error + cerr << endl << "*****" << endl + << "***** ERROR: BAM database file not currently supported for column operations." + << endl; + exit(1); + } + + + //get the strings from context containing the comma-delimited lists of columns + //and operations. Split both of these into vectors. Get the operation code + //for each operation string. Finally, make a vector of pairs, where the first + //member of each pair is a column number, and the second member is the code for the + //operation to perform on that column. + + vector<QuickString> columnsVec; + vector<QuickString> opsVec; + int numCols = Tokenize(_columns, columnsVec, ','); + int numOps = Tokenize(_operations, opsVec, ','); + + if (numOps < 1 || numCols < 1) { + cerr << endl << "*****" << endl + << "***** ERROR: There must be at least one column and at least one operation named." 
<< endl; + return false; + } + if (numOps > 1 && numCols != numOps) { + cerr << endl << "*****" << endl + << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl; + cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl; + cerr << "\tor an operation for each column." << endl; + return false; + } + for (int i=0; i < (int)columnsVec.size(); i++) { + int col = str2chrPos(columnsVec[i]); + + //check that the column number is valid + if (col < 1 || col > dbFile->getNumFields()) { + cerr << endl << "*****" << endl << "***** ERROR: Requested column " << col << ", but database file " + << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl; + return false; + } + const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0]; + OP_TYPES opCode = getOpCode(operation); + if (opCode == INVALID) { + cerr << endl << "*****" << endl + << "***** ERROR: " << operation << " is not a valid operation. " << endl; + return false; + } + _colOps.push_back(pair<int, OP_TYPES>(col, opCode)); + } + + + //The final step we need to do is check that for each column/operation pair, + //if the operation is numeric, see if the database's record type supports + //numeric operations for that column. For instance, we can allow the mean + //of column 4 for a BedGraph file, because that's numeric, but not for Bed4, + //because that isn't. 
+ + for (int i = 0; i < (int)_colOps.size(); i++) { + int col = _colOps[i].first; + OP_TYPES opCode = _colOps[i].second; + FileRecordTypeChecker::RECORD_TYPE recordType = dbFile->getRecordType(); + + if (isNumericOp(opCode)) { + bool isValidNumOp = false; + switch(recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + isValidNumOp = Bed3Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + isValidNumOp = Bed4Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + isValidNumOp = Bed5Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + isValidNumOp = BedGraphInterval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + isValidNumOp = Bed6Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + isValidNumOp = BedPlusInterval::isNumericField(col); + break; + + case FileRecordTypeChecker::BED12_RECORD_TYPE: + isValidNumOp = Bed12Interval::isNumericField(col); + break; + + case FileRecordTypeChecker::BAM_RECORD_TYPE: + isValidNumOp = BamRecord::isNumericField(col); + break; + + case FileRecordTypeChecker::VCF_RECORD_TYPE: + isValidNumOp = VcfRecord::isNumericField(col); + break; + + case FileRecordTypeChecker::GFF_RECORD_TYPE: + isValidNumOp = GffRecord::isNumericField(col); + break; + + default: + break; + } + if (!isValidNumOp) { + cerr << endl << "*****" << endl << "***** ERROR: Column " << col << " is not a numeric field for database file " + << dbFile->getFileName() << "." << endl; + return false; + } + } + } + + return true; +} + +const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) +{ + //loop through all requested columns, and for each one, call the method needed + //for the operation specified. 
+ _methods.setKeyList(&hits); + _outVals.clear(); + double val = 0.0; + for (int i=0; i < (int)_colOps.size(); i++) { + int col = _colOps[i].first; + OP_TYPES opCode = _colOps[i].second; + + _methods.setColumn(col); + switch (opCode) { + case SUM: + val = _methods.getSum(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MEAN: + val = _methods.getMean(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case STDDEV: + val = _methods.getStddev(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case SAMPLE_STDDEV: + val = _methods.getSampleStddev(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MEDIAN: + val = _methods.getMedian(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MODE: + _outVals.append(_methods.getMode()); + break; + + case ANTIMODE: + _outVals.append(_methods.getAntiMode()); + break; + + case MIN: + val = _methods.getMin(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case MAX: + val = _methods.getMax(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case ABSMIN: + val = _methods.getAbsMin(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case ABSMAX: + val = _methods.getAbsMax(); + if (isnan(val)) { + _outVals.append(_methods.getNullValue()); + } else { + _outVals.append(val); + } + break; + + case COUNT: + _outVals.append(_methods.getCount()); + break; + + case DISTINCT: + _outVals.append(_methods.getDistinct()); + break; + + case COUNT_DISTINCT: + _outVals.append(_methods.getCountDistinct()); 
+ break; + + case DISTINCT_ONLY: + _outVals.append(_methods.getDistinctOnly()); + break; + + case COLLAPSE: + _outVals.append(_methods.getCollapse()); + break; + + case CONCAT: + _outVals.append(_methods.getConcat()); + break; + + case FREQ_ASC: + _outVals.append(_methods.getFreqAsc()); + break; + + case FREQ_DESC: + _outVals.append(_methods.getFreqDesc()); + break; + + case FIRST: + _outVals.append(_methods.getFirst()); + break; + + case LAST: + _outVals.append(_methods.getLast()); + break; + + case INVALID: + default: + // Any unrecognized operation should have been handled already in the context validation. + // It's thus unnecessary to handle it here, but throw an error to help us know if future + // refactoring or code changes accidentally bypass the validation phase. + cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl; + break; + } + //if this isn't the last column, add a tab. + if (i < (int)_colOps.size() -1) { + _outVals.append('\t'); + } + } + return _outVals; +} + + diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h new file mode 100644 index 00000000..3c26d2c2 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOps.h @@ -0,0 +1,54 @@ +/* + * KeyListOps.h + * + * Created on: Feb 24, 2014 + * Author: nek3d + */ + +#ifndef KEYLISTOPS_H_ +#define KEYLISTOPS_H_ + +#include "KeyListOpsMethods.h" + +class FileRecordMgr; + +class KeyListOps { +public: + + KeyListOps(); + + void setColumns(const QuickString &columns) { _columns = columns; } + void setOperations(const QuickString & operation) { _operations = operation; } + void setNullValue(const QuickString & nullValue) { _methods.setNullValue(nullValue); } + void setDelimStr(const QuickString & delimStr) { _methods.setDelimStr(delimStr); } + + void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); } + + typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, 
+ DISTINCT_ONLY, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES; + + bool isValidColumnOps(FileRecordMgr *dbFile); + + const QuickString &getOpVals(RecordKeyList &hits); + +private: + void init(); + + QuickString _operations; + QuickString _columns; + + KeyListOpsMethods _methods; + map<QuickString, OP_TYPES> _opCodes; + map<OP_TYPES, bool> _isNumericOp; + + typedef vector<pair<int, OP_TYPES> > colOpsType; + colOpsType _colOps; + QuickString _outVals; + + OP_TYPES getOpCode(const QuickString &operation) const; + bool isNumericOp(OP_TYPES op) const; + bool isNumericOp(const QuickString &op) const; + +}; + +#endif /* KEYLISTOPS_H_ */ diff --git a/src/utils/KeyListOps/KeyListOpsMethods.cpp b/src/utils/KeyListOps/KeyListOpsMethods.cpp new file mode 100644 index 00000000..0b001358 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOpsMethods.cpp @@ -0,0 +1,368 @@ +/* + * KeyListOpsMethods.cpp + * + * Created on: Feb 6, 2014 + * Author: nek3d + */ + +#include "KeyListOpsMethods.h" +#include <cfloat> +#include <cmath> +#include <algorithm> + +KeyListOpsMethods::KeyListOpsMethods() +: _keyList(&_nullKeyList), + _column(1), + _nullVal("."), + _delimStr(","), + _iter(_nullKeyList.begin()) +{ +} + +KeyListOpsMethods::KeyListOpsMethods(RecordKeyList *keyList, int column) +: _keyList(keyList), + _column(column), + _nullVal("."), + _delimStr(","), + _iter(keyList->begin()) +{ +} + + +KeyListOpsMethods::~KeyListOpsMethods() { + +} + +// return the total of the values in the vector +double KeyListOpsMethods::getSum() { + if (empty()) return NAN; + + double theSum = 0.0; + for (begin(); !end(); next()) { + theSum += getColValNum(); + } + return theSum; +} + +// return the average value in the vector +double KeyListOpsMethods::getMean() { + if (empty()) return NAN; + + return getSum() / (float)getCount(); +} + + + // return the standard deviation +double KeyListOpsMethods::getStddev() { + if (empty()) return NAN; + + double avg = getMean(); + double 
squareDiffSum = 0.0; + for (begin(); !end(); next()) { + double val = getColValNum(); + double diff = val - avg; + squareDiffSum += diff * diff; + } + return sqrt(squareDiffSum / (float)getCount()); +} +// return the sample standard deviation (n - 1 in the denominator; NaN when there is a single element) +double KeyListOpsMethods::getSampleStddev() { + if (empty()) return NAN; + + double avg = getMean(); + double squareDiffSum = 0.0; + for (begin(); !end(); next()) { + double val = getColValNum(); + double diff = val - avg; + squareDiffSum += diff * diff; + } + return sqrt(squareDiffSum / ((float)getCount() - 1.0)); +} + +// return the median value in the vector +double KeyListOpsMethods::getMedian() { + if (empty()) return NAN; + + //get sorted vector. if odd number of elems, return middle val. + //if even, average of middle two. + toArray(true, ASC); + size_t count = getCount(); + if (count % 2) { + //odd number of elements. Take middle one. + return _numArray[count/2]; + } else { + //even number of elements. Take average of middle 2. + double sum = _numArray[count/2 -1] + _numArray[count/2]; + return sum / 2.0; + } +} + +// return the most common value in the vector +const QuickString &KeyListOpsMethods::getMode() { + if (empty()) return _nullVal; + + makeFreqMap(); + + //now pass through the freq map and keep track of which key has the highest occurrence. + freqMapType::iterator maxIter = _freqMap.begin(); + int maxVal = 0; + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second > maxVal) { + maxIter = _freqIter; + maxVal = _freqIter->second; + } + } + _retStr = maxIter->first; + return _retStr; +} +// return the least common value in the vector +const QuickString &KeyListOpsMethods::getAntiMode() { + if (empty()) return _nullVal; + + makeFreqMap(); + + //now pass through the freq map and keep track of which key has the lowest occurrence.
+ freqMapType::iterator minIter = _freqMap.begin(); + int minVal = INT_MAX; + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second < minVal) { + minIter = _freqIter; + minVal = _freqIter->second; + } + } + _retStr = minIter->first; + return _retStr; +} +// return the minimum element of the vector +double KeyListOpsMethods::getMin() { + if (empty()) return NAN; + + double minVal = DBL_MAX; + for (begin(); !end(); next()) { + double currVal = getColValNum(); + minVal = (currVal < minVal) ? currVal : minVal; + } + return minVal; +} + +// return the maximum element of the vector (seeded with -DBL_MAX: DBL_MIN is the smallest positive double, not the most negative) +double KeyListOpsMethods::getMax() { + if (empty()) return NAN; + + double maxVal = -DBL_MAX; + for (begin(); !end(); next()) { + double currVal = getColValNum(); + maxVal = (currVal > maxVal) ? currVal : maxVal; + } + return maxVal; +} + +// return the minimum absolute value of the vector +double KeyListOpsMethods::getAbsMin() { + if (empty()) return NAN; + + double minVal = DBL_MAX; + for (begin(); !end(); next()) { + double currVal = abs(getColValNum()); + minVal = (currVal < minVal) ? currVal : minVal; + } + return minVal; +} +// return the maximum absolute value of the vector (seeded with 0.0, the smallest possible absolute value) +double KeyListOpsMethods::getAbsMax() { + if (empty()) return NAN; + + double maxVal = 0.0; + for (begin(); !end(); next()) { + double currVal = abs(getColValNum()); + maxVal = (currVal > maxVal) ? currVal : maxVal; + } + return maxVal; +} +// return the count of elements in the vector +uint32_t KeyListOpsMethods::getCount() { + return _keyList->size(); +} +// return a delimited list of the unique elements +const QuickString &KeyListOpsMethods::getDistinct() { + if (empty()) return _nullVal; + // separated list of unique values. If something repeats, only report once.
+ makeFreqMap(); + _retStr.clear(); + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter != _freqMap.begin()) _retStr += _delimStr; + _retStr.append(_freqIter->first); + } + return _retStr; +} + +const QuickString &KeyListOpsMethods::getDistinctOnly() { + if (empty()) return _nullVal; + + //separated list of only unique values. If item repeats, discard. Delimit on output length, not map position: the first map entry may itself be a discarded repeat. + makeFreqMap(); + _retStr.clear(); + for (; _freqIter != _freqMap.end(); _freqIter++) { + if (_freqIter->second != 1) continue; + if (_retStr.size() > 0) _retStr += _delimStr; + _retStr.append(_freqIter->first); + } + return _retStr; +} + +// return the count of _unique_ elements in the vector +uint32_t KeyListOpsMethods::getCountDistinct() { + if (empty()) return 0; + + makeFreqMap(); + return _freqMap.size(); +} +// return a delimiter-separated list of elements +const QuickString &KeyListOpsMethods::getCollapse(const QuickString &delimiter) { + if (empty()) return _nullVal; + + //just put all items in one big separated list. + _retStr.clear(); + int i=0; + for (begin(); !end(); next()) { + if (i > 0) _retStr += _delimStr; + _retStr.append(getColVal()); + i++; + } + return _retStr; + +} +// return a concatenation of all elements in the vector +const QuickString &KeyListOpsMethods::getConcat() { + if (empty()) return _nullVal; + + //like collapse but w/o commas. Just a true concat of all vals. + //just swap out the delimChar with '' and call collapse, then + //restore the delimChar. + QuickString oldDelimStr(_delimStr); + _delimStr = ""; + getCollapse(); //this will store its results in the _retStr member. + _delimStr = oldDelimStr; + return _retStr; +} + +// return a histogram of values and their freqs. in desc. order of frequency +const QuickString &KeyListOpsMethods::getFreqDesc() { + if (empty()) return _nullVal; + + //for each uniq val, report # occurances, in desc order. + makeFreqMap(); + //put freq map into multimap where key is the freq and val is the item.
In other words, basically a reverse freq map. + histDescType hist; + for (; _freqIter != _freqMap.end(); _freqIter++) { + hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); + } + //now iterate through the reverse map we just made and output it's pairs in val:key format. + _retStr.clear(); + for (histDescType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { + if (histIter != hist.begin()) _retStr += _delimStr; + _retStr.append(histIter->second); + _retStr += ":"; + _retStr.append(histIter->first); + } + return _retStr; +} +// return a histogram of values and their freqs. in asc. order of frequency +const QuickString &KeyListOpsMethods::getFreqAsc() { + if (empty()) return _nullVal; + + //for each uniq val, report # occurances, in asc order. + makeFreqMap(); + //put freq map into multimap where key is the freq and val is the item. In other words, basically a reverse freq map. + histAscType hist; + for (; _freqIter != _freqMap.end(); _freqIter++) { + hist.insert(pair<int, QuickString>(_freqIter->second, _freqIter->first)); +// hist[*(_freqIter->second)] = _freqIter->first; + } + //now iterate through the reverse map we just made and output it's pairs in val:key format. + _retStr.clear(); + for (histAscType::iterator histIter = hist.begin(); histIter != hist.end(); histIter++) { + if (histIter != hist.begin()) _retStr += _delimStr; + _retStr.append(histIter->second); + _retStr += ":"; + _retStr.append(histIter->first); + } + return _retStr; +} +// return the first value in the list +const QuickString &KeyListOpsMethods::getFirst() { + if (empty()) return _nullVal; + + //just the first item. + begin(); + return getColVal(); +} +// return the last value in the list +const QuickString &KeyListOpsMethods::getLast() { + if (empty()) return _nullVal; + + //just the last item. 
+ begin(); + for (size_t i = 0; i < getCount() -1; i++) { + next(); + } + return getColVal(); +} + +const QuickString &KeyListOpsMethods::getColVal() { + return _iter->value()->getField(_column); +} + +double KeyListOpsMethods::getColValNum() { + return atof(_iter->value()->getField(_column).c_str()); +} + +void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) { + + //TBD: optimize performance with better memory management. + if (useNum) { + _numArray.resize(_keyList->size()); + int i=0; + for (begin(); !end(); next()) { + _numArray[i] = getColValNum(); + i++; + } + } else { + _qsArray.resize(_keyList->size()); + int i=0; + for (begin(); !end(); next()) { + _qsArray[i] = getColVal(); + i++; + } + } + if (sortVal != UNSORTED) { + sortArray(useNum, sortVal == ASC); + } +} + +void KeyListOpsMethods::sortArray(bool useNum, bool ascOrder) +{ + if (useNum) { + if (ascOrder) { + sort(_numArray.begin(), _numArray.end(), less<double>()); + } else { + sort(_numArray.begin(), _numArray.end(), greater<double>()); + } + } else { + if (ascOrder) { + sort(_qsArray.begin(), _qsArray.end(), less<QuickString>()); + } else { + sort(_qsArray.begin(), _qsArray.end(), greater<QuickString>()); + } + } +} + +void KeyListOpsMethods::makeFreqMap() { + _freqMap.clear(); + + //make a map of values to their number of times occuring. 
+ for (begin(); !end(); next()) { + _freqMap[getColVal()]++; + } + _freqIter = _freqMap.begin(); +} diff --git a/src/utils/KeyListOps/KeyListOpsMethods.h b/src/utils/KeyListOps/KeyListOpsMethods.h new file mode 100644 index 00000000..0cac9c87 --- /dev/null +++ b/src/utils/KeyListOps/KeyListOpsMethods.h @@ -0,0 +1,113 @@ +/* + * KeyListOpsMethods.h + * + * Created on: Feb 6, 2014 + * Author: nek3d + */ + +#ifndef KEYLISTOPSMETHODS_H_ +#define KEYLISTOPSMETHODS_H_ + +using namespace std; + +#include <map> +#include <utility> //for pair +#include "QuickString.h" +#include <stdint.h> +#include "RecordKeyList.h" + +class KeyListOpsMethods { +public: + KeyListOpsMethods(); + KeyListOpsMethods(RecordKeyList *keyList, int column = 1); + ~KeyListOpsMethods(); + + + void setKeyList(RecordKeyList *keyList) { _keyList = keyList; } + void setColumn(int col) { _column = col; } + void setNullValue(const QuickString & nullVal) { _nullVal = nullVal; } + const QuickString &getNullValue() const { return _nullVal; } + void setDelimStr(const QuickString &delimStr) { _delimStr = delimStr; } + const QuickString &getDelimStr() const { return _delimStr; } + + // return the total of the values in the vector + double getSum(); + // return the average value in the vector + double getMean(); + // return the standard deviation + double getStddev(); + // return the sample standard deviation + double getSampleStddev(); + // return the median value in the vector + double getMedian(); + // return the most common value in the vector + const QuickString &getMode(); + // return the least common value in the vector + const QuickString &getAntiMode(); + // return the minimum element of the vector + double getMin(); + // return the maximum element of the vector + double getMax(); + // return the minimum absolute value of the vector + double getAbsMin(); + // return the maximum absolute value of the vector + double getAbsMax(); + // return the count of element in the vector + uint32_t getCount(); + // 
return a the count of _unique_ elements in the vector + uint32_t getCountDistinct(); + // return only those elements that occur once + const QuickString &getDistinctOnly(); + // return a delimiter-separated list of elements + const QuickString & getCollapse(const QuickString & delimiter = ","); + // return a concatenation of all elements in the vector + const QuickString & getConcat(); + // return a comma-separated list of the _unique_ elements + const QuickString & getDistinct(); + // return a histogram of values and their freqs. in desc. order of frequency + const QuickString & getFreqDesc(); + // return a histogram of values and their freqs. in asc. order of frequency + const QuickString & getFreqAsc(); + // return the first value in the list + const QuickString & getFirst(); + // return the last value in the list + const QuickString & getLast(); + +private: + RecordKeyList *_keyList; + int _column; + QuickString _nullVal; + QuickString _delimStr; + QuickString _retStr; + + RecordKeyList _nullKeyList; //this has to exist just so we can initialize _iter, below. + RecordKeyList::const_iterator_type _iter; + + // Some methods need to put values into a vector, mostly for sorting. 
+ vector<double> _numArray; + vector<QuickString> _qsArray; + + typedef map<QuickString, int> freqMapType; + freqMapType _freqMap; + freqMapType::iterator _freqIter; + + typedef enum { UNSORTED, ASC, DESC} SORT_TYPE; + + typedef multimap<int, QuickString, less<int> > histAscType; + typedef multimap<int, QuickString, greater<int> > histDescType; + void init(); + const QuickString &getColVal(); + double getColValNum(); + bool empty() { return _keyList->empty(); } + void begin() { _iter = _keyList->begin(); } + bool end() { return _iter == _keyList->end(); } + void next() { _iter = _keyList->next(); } + void toArray(bool useNum, SORT_TYPE sortVal = UNSORTED); + void sortArray(bool useNum, bool ascOrder); + void makeFreqMap(); + + +}; + + +#endif /* KEYLISTOPSMETHODS_H_ */ diff --git a/src/utils/KeyListOps/Makefile b/src/utils/KeyListOps/Makefile new file mode 100644 index 00000000..0b0ac991 --- /dev/null +++ b/src/utils/KeyListOps/Makefile @@ -0,0 +1,43 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ + + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= KeyListOps.cpp KeyListOps.h KeyListOpsMethods.cpp KeyListOpsMethods.h +OBJECTS= KeyListOps.o KeyListOpsMethods.o +_EXT_OBJECTS= +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) 
$(INCLUDES) + + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/KeyListOps.o $(OBJ_DIR)/KeyListOpsMethods.o + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile index 8f4d9310..34fc5d12 100644 --- a/src/utils/NewChromsweep/Makefile +++ b/src/utils/NewChromsweep/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/RecordOutputMgr/Makefile b/src/utils/RecordOutputMgr/Makefile index 2d196ec1..346a5c7c 100644 --- a/src/utils/RecordOutputMgr/Makefile +++ b/src/utils/RecordOutputMgr/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ -I$(UTILITIES_DIR)/BamTools/include \ -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/version/ diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile index 43dcfba0..0361fab4 100644 --- a/src/utils/general/Makefile +++ b/src/utils/general/Makefile @@ -4,7 +4,7 @@ UTILITIES_DIR = ../../utils/ # ------------------- # define our includes # ------------------- -INCLUDES = +INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files diff --git a/src/utils/general/QuickString.cpp b/src/utils/general/QuickString.cpp index 831f84ab..9e061866 100644 --- a/src/utils/general/QuickString.cpp +++ b/src/utils/general/QuickString.cpp @@ -3,6 +3,7 @@ #include <cstdlib> #include <cstdio> #include "ParseTools.h" +#include "lineFileUtilities.h" 
QuickString::QuickString(size_t capacity) : _buffer(NULL), @@ -82,6 +83,35 @@ QuickString &QuickString::operator = (const QuickString & inBuf){ return *this; } +QuickString &QuickString::operator = (char val) { + clear(); + append(val); + return *this; +} +QuickString &QuickString::operator = (int val) { + clear(); + append(val); + return *this; +} + +QuickString &QuickString::operator = (uint32_t val) { + clear(); + append(val); + return *this; +} + +QuickString &QuickString::operator = (float val) { + clear(); + append(val); + return *this; +} + +QuickString &QuickString::operator = (double val) { + clear(); + append(val); + return *this; +} + QuickString &QuickString::operator += (const QuickString & inBuf) { @@ -107,6 +137,26 @@ QuickString &QuickString::operator += (const char *inBuf) return *this; } +QuickString &QuickString::operator += (int num) { + append(num); + return *this; +} + +QuickString &QuickString::operator += (uint32_t num) { + append(num); + return *this; +} + +QuickString &QuickString::operator += (float num) { + append(num); + return *this; +} + +QuickString &QuickString::operator += (double num) { + append(num); + return *this; +} + bool QuickString::operator == (const QuickString &qs) const { if ( _currSize != qs._currSize) { return false; @@ -194,6 +244,21 @@ void QuickString::append(const char *inBuf, size_t inBufLen) void QuickString::append(int num) { int2str(num, *this, true); } + +void QuickString::append(uint32_t num) { + int2str((int)num, *this, true); +} + +void QuickString::append(float num) { + append(ToString(num)); +} + +void QuickString::append(double num) { + append(ToString(num)); +} + + + QuickString &QuickString::assign(const char *inBuf, size_t inBufLen) { clear(); diff --git a/src/utils/general/QuickString.h b/src/utils/general/QuickString.h index 5fdc0fc4..a76e5ff2 100644 --- a/src/utils/general/QuickString.h +++ b/src/utils/general/QuickString.h @@ -10,6 +10,7 @@ using namespace std; #include <string> +#include 
<stdint.h> #include <climits> #include <ostream> @@ -32,10 +33,19 @@ public: QuickString &operator = (const string &); QuickString &operator = (const char *); QuickString &operator = (const QuickString &); + QuickString &operator = (char); + QuickString &operator = (int); + QuickString &operator = (uint32_t); + QuickString &operator = (float); + QuickString &operator = (double); QuickString &operator += (const QuickString &); QuickString &operator += (const string &); QuickString &operator += (const char *); QuickString &operator += (char); + QuickString &operator += (int); + QuickString &operator += (uint32_t); + QuickString &operator += (float); + QuickString &operator += (double); friend ostream &operator << (ostream &out, const QuickString &str); bool operator == (const QuickString &) const; @@ -52,7 +62,16 @@ public: void append(const QuickString &str) { append(str.c_str(), str.size()); } void append(const char *buf, size_t bufLen); void append(char c); + + //These are not templated because float and double require a stringstream based + //implementation, while the integer append uses a much faster home-brewed algorithm + //for better performance. void append(int num); + void append(uint32_t num); + void append(float num); + void append(double num); + + QuickString &assign(const char *str, size_t n); void resize(size_t n, char c = '\0'); diff --git a/test/map/test-map.sh b/test/map/test-map.sh index 293d84e3..a47b14e0 100644 --- a/test/map/test-map.sh +++ b/test/map/test-map.sh @@ -499,10 +499,8 @@ echo " map.t33..\c" echo \ " ***** -*****ERROR: requested column 15 , but record only has fields 1 - 12. Exiting. - -*****" > exp -$BT map -a ivls.bed -b test.vcf -c 15 -o collapse 2> obs +***** ERROR: Requested column 15, but database file test.vcf only has fields 1 - 12." 
> exp +$BT map -a ivls.bed -b test.vcf -c 15 -o collapse 2>&1 > /dev/null | head -3> obs check obs exp rm obs exp @@ -624,12 +622,9 @@ echo " map.t41..\c" echo \ " ***** -*****ERROR: requested column 41 , but record only has fields 1 - 6. Exiting. - -*****" > exp -$BT map -a ivls.bed -b values5.bed -c 41 -o collapse 2> obs +***** ERROR: Requested column 41, but database file test.vcf only has fields 1 - 12." > exp +$BT map -a ivls.bed -b test.vcf -c 41 -o collapse 2>&1 > /dev/null | head -3> obs check obs exp - rm obs exp ########################################################### @@ -639,12 +634,9 @@ echo " map.t42..\c" echo \ " ***** -*****ERROR: requested column -1 , but record only has fields 1 - 6. Exiting. - -*****" > exp -$BT map -a ivls.bed -b values5.bed -c -1 -o collapse 2> obs +***** ERROR: Requested column -1, but database file test.vcf only has fields 1 - 12." > exp +$BT map -a ivls.bed -b test.vcf -c -1 -o collapse 2>&1 > /dev/null | head -3> obs check obs exp - rm obs exp ########################################################### @@ -654,12 +646,9 @@ echo " map.t43..\c" echo \ " ***** -*****ERROR: requested column 0 , but record only has fields 1 - 6. Exiting. - -*****" > exp -$BT map -a ivls.bed -b values5.bed -c 0 -o collapse 2> obs +***** ERROR: Requested column 0, but database file test.vcf only has fields 1 - 12." > exp +$BT map -a ivls.bed -b test.vcf -c 0 -o collapse 2>&1 > /dev/null | head -3> obs check obs exp - rm obs exp @@ -667,7 +656,7 @@ rm obs exp # Test that Bam database is not allowed ############################################################ echo " map.t44...\c" -echo -e "\n*****\n***** ERROR: BAM database file not currently supported for the map tool." > exp +echo -e "\n*****\n***** ERROR: BAM database file not currently supported for column operations." > exp $BT map -a ivls.bed -b values.bam 2> obs check obs exp rm obs exp @@ -682,3 +671,71 @@ echo "chr1 0 50 three_blocks_match 15 + 0 0 0 3 10,10,10, 0,20,40, ." 
> exp $BT map -o sum -a three_blocks_match.bed -b three_blocks_nomatch.bed -split > obs check obs exp rm obs exp + + + + + + +########################################################### +# +# +# Tests for multiple columns and operations +# +# +############################################################ + + +########################################################### +# Test that error is given when ops outnumber columns +############################################################ +echo " map.t46...\c" +echo \ +" +***** +***** ERROR: There are 1 columns given, but there are 2 operations." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that error is given when columns outnumber ops, +# if there are two or more ops. +############################################################ +echo " map.t47...\c" +echo \ +" +***** +***** ERROR: There are 3 columns given, but there are 2 operations." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -c 5,1,2 -o count,sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that numeric ops for non-numeric columns aren't allowed +############################################################ +echo " map.t48...\c" +echo \ +" +***** +***** ERROR: Column 1 is not a numeric field for database file values.bed." > exp +../../bin/bedtools map -a ivls.bed -b values.bed -c 1 -o sum 2>&1 > /dev/null | head -3 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that multiple columns are allowed with a +# single operation +############################################################ +# +# TBD +# +#echo " map.t49...\c" +#../../bin/bedtools map -a ivls.bed -b values.bed -c 2 -o sum 2>&1 > /dev/null | head -3 > obs +#check obs exp +#rm obs exp + + -- GitLab