Newer
Older
nkindlon
committed
/*
* KeyListOps.cpp
*
* Created on: Feb 24, 2014
* Author: nek3d
*/
#include "KeyListOps.h"
#include "FileRecordMgr.h"
#include <cmath> //for isnan
Neil Kindlon
committed
#include <sstream>
#include <iomanip>
nkindlon
committed
// Default constructor.
//
// Builds the two lookup tables used by the rest of the class:
//   _opCodes     : operation name (as typed by the user) -> OP_TYPES code
//   _isNumericOp : OP_TYPES code -> whether the op yields a numeric result
// and installs the default settings (delimiter ",", null value ".",
// column "5", operation "sum", precision DEFAULT_PRECISION).
KeyListOps::KeyListOps():
    _dbFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE)
{
    // Operation names accepted from the user.
    _opCodes["sum"] = SUM;
    _opCodes["mean"] = MEAN;
    _opCodes["stddev"] = STDDEV;
    _opCodes["sample_stddev"] = SAMPLE_STDDEV;
    _opCodes["median"] = MEDIAN;
    _opCodes["mode"] = MODE;
    _opCodes["antimode"] = ANTIMODE;
    _opCodes["min"] = MIN;
    _opCodes["max"] = MAX;
    _opCodes["absmin"] = ABSMIN;
    _opCodes["absmax"] = ABSMAX;
    _opCodes["count"] = COUNT;
    _opCodes["distinct"] = DISTINCT;
    _opCodes["count_distinct"] = COUNT_DISTINCT;
    _opCodes["distinct_only"] = DISTINCT_ONLY;
    _opCodes["collapse"] = COLLAPSE;
    _opCodes["concat"] = CONCAT;
    _opCodes["freq_asc"] = FREQ_ASC;
    _opCodes["freq_desc"] = FREQ_DESC;
    _opCodes["first"] = FIRST;
    _opCodes["last"] = LAST;

    // Ops whose result is computed as a double (see getOpVals).
    // SAMPLE_STDDEV and ABSMAX were previously absent from this table, so
    // isNumericOp() reported them as non-numeric even though their
    // siblings STDDEV and ABSMIN are numeric.
    _isNumericOp[SUM] = true;
    _isNumericOp[MEAN] = true;
    _isNumericOp[STDDEV] = true;
    _isNumericOp[SAMPLE_STDDEV] = true;
    _isNumericOp[MEDIAN] = true;
    _isNumericOp[MIN] = true;
    _isNumericOp[MAX] = true;
    _isNumericOp[ABSMIN] = true;
    _isNumericOp[ABSMAX] = true;

    // Ops whose result is appended directly, without numeric formatting.
    _isNumericOp[MODE] = false;
    _isNumericOp[ANTIMODE] = false;
    _isNumericOp[COUNT] = false;
    _isNumericOp[DISTINCT] = false;
    _isNumericOp[COUNT_DISTINCT] = false;
    _isNumericOp[DISTINCT_ONLY] = false;
    _isNumericOp[COLLAPSE] = false;
    _isNumericOp[CONCAT] = false;
    _isNumericOp[FREQ_ASC] = false;
    _isNumericOp[FREQ_DESC] = false;
    _isNumericOp[FIRST] = false;
    _isNumericOp[LAST] = false;

    _methods.setDelimStr(",");
    _methods.setNullValue(".");

    // default to BED score column
    _columns = "5";
    // default to "sum"
    _operations = "sum";
    _precision = DEFAULT_PRECISION;
}
// Returns the numeric flag registered for the given operation code.
// Codes that have no entry in _isNumericOp are reported as non-numeric.
bool KeyListOps::isNumericOp(OP_TYPES op) const {
    const map<OP_TYPES, bool>::const_iterator entry = _isNumericOp.find(op);
    if (entry != _isNumericOp.end()) {
        return entry->second;
    }
    return false;
}
bool KeyListOps::isNumericOp(const QuickString &op) const {
return isNumericOp(getOpCode(op));
}
// Looks up the code registered for an operation name.
// Returns INVALID when the name is not present in _opCodes.
KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const {
    const map<QuickString, OP_TYPES>::const_iterator found = _opCodes.find(operation);
    return (found != _opCodes.end()) ? found->second : INVALID;
}
// Validates the requested columns/operations and fills _colOps.
//
// _columns and _operations are comma-delimited lists. Both are tokenized,
// each operation name is translated to its code, and one (column, opCode)
// pair per requested computation is appended to _colOps.
//
// Pairing rules:
//   - one column, many ops  : every op is applied to that column
//   - many columns, one op  : the op is applied to every column
//   - otherwise             : counts must match; paired in order
//
// dbFile supplies the field count and file name used for the column range
// check and its error message.
//
// Returns true when every column/op is valid; otherwise prints a
// diagnostic to cerr and returns false. Pairs appended before a failure
// is detected remain in _colOps.
bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {
    Tokenizer colTokens;
    Tokenizer opsTokens;

    int numCols = colTokens.tokenize(_columns, ',');
    int numOps = opsTokens.tokenize(_operations, ',');

    if (numOps < 1 || numCols < 1) {
        cerr << endl << "*****" << endl
             << "***** ERROR: There must be at least one column and at least one operation named." << endl;
        return false;
    }
    if (numOps > 1 && numCols > 1 && numCols != numOps) {
        cerr << endl << "*****" << endl
             << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
        cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
        cerr << "\ta single column to which all operations will be applied," << endl;
        cerr << "\tor an operation for each column." << endl;
        return false;
    }

    // The number of (column, op) pairs is the longer of the two lists:
    // after the checks above, either the counts are equal or one of them
    // is 1 and gets reused for every element of the other. The loop bound
    // was previously an undeclared identifier.
    const int numPairs = (numCols > numOps) ? numCols : numOps;

    for (int i = 0; i < numPairs; i++) {
        // A single column is reused for every operation.
        int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0));

        //check that the column number is valid
        if (col < 1 || col > dbFile->getNumFields()) {
            cerr << endl << "*****" << endl << "***** ERROR: Requested column " << col << ", but database file "
                 << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
            return false;
        }

        // A single operation is reused for every column.
        const QuickString &operation = opsTokens.getElem(numOps > 1 ? i : 0);
        OP_TYPES opCode = getOpCode(operation);
        if (opCode == INVALID) {
            cerr << endl << "*****" << endl
                 << "***** ERROR: " << operation << " is not a valid operation. " << endl;
            return false;
        }
        _colOps.push_back(pair<int, OP_TYPES>(col, opCode));
    }

    //lastly, if the file is BAM, and they asked for column 2, which is the
    //flags field, then for now we have to throw an error, as the flag field
    //is currently not supported.
    if (_dbFileType == FileRecordTypeChecker::BAM_FILE_TYPE) {
        //also, tell the methods class we're dealing with BAM.
        _methods.setIsBam(true);
        for (size_t i = 0; i < _colOps.size(); i++) {
            if (_colOps[i].first == 2) {
                cerr << endl << "*****" << endl << "***** ERROR: Requested column 2 of a BAM file, which is the Flags field." << endl;
                cerr << "             We currently do not support this, but may in future versions." << endl;
                return false;
            }
        }
    }

    return true;
}
// Computes every requested (column, operation) pair over the given hits
// and returns the results as one tab-separated string.
//
// The returned reference points at the member buffer _outVals, which is
// cleared and refilled on each call.
//
// Numeric operations produce a double; a NaN result is written as the
// configured null value, anything else goes through format(). All other
// operations append whatever the corresponding _methods getter returns.
const QuickString & KeyListOps::getOpVals(RecordKeyVector &hits)
{
    _methods.setKeyList(&hits);
    _outVals.clear();

    const size_t numColOps = _colOps.size();
    for (size_t idx = 0; idx < numColOps; ++idx) {
        const int col = _colOps[idx].first;
        const OP_TYPES opCode = _colOps[idx].second;
        _methods.setColumn(col);

        // Step 1: evaluate the op if it is one of the double-valued ones.
        bool hasNumericResult = true;
        double val = 0.0;
        switch (opCode) {
        case SUM:           val = _methods.getSum();          break;
        case MEAN:          val = _methods.getMean();         break;
        case STDDEV:        val = _methods.getStddev();       break;
        case SAMPLE_STDDEV: val = _methods.getSampleStddev(); break;
        case MEDIAN:        val = _methods.getMedian();       break;
        case MIN:           val = _methods.getMin();          break;
        case MAX:           val = _methods.getMax();          break;
        case ABSMIN:        val = _methods.getAbsMin();       break;
        case ABSMAX:        val = _methods.getAbsMax();       break;
        default:
            hasNumericResult = false;
            break;
        }

        if (hasNumericResult) {
            // Step 2a: shared output path for all numeric ops.
            if (isnan(val)) {
                _outVals.append(_methods.getNullValue());
            } else {
                _outVals.append(format(val));
            }
        } else {
            // Step 2b: ops whose getter result is appended as-is.
            switch (opCode) {
            case MODE:
                _outVals.append(_methods.getMode());
                break;
            case ANTIMODE:
                _outVals.append(_methods.getAntiMode());
                break;
            case COUNT:
                _outVals.append(_methods.getCount());
                break;
            case DISTINCT:
                _outVals.append(_methods.getDistinct());
                break;
            case COUNT_DISTINCT:
                _outVals.append(_methods.getCountDistinct());
                break;
            case DISTINCT_ONLY:
                _outVals.append(_methods.getDistinctOnly());
                break;
            case COLLAPSE:
                _outVals.append(_methods.getCollapse());
                break;
            case CONCAT:
                _outVals.append(_methods.getConcat());
                break;
            case FREQ_ASC:
                _outVals.append(_methods.getFreqAsc());
                break;
            case FREQ_DESC:
                _outVals.append(_methods.getFreqDesc());
                break;
            case FIRST:
                _outVals.append(_methods.getFirst());
                break;
            case LAST:
                _outVals.append(_methods.getLast());
                break;
            case INVALID:
            default:
                // Validation is expected to reject unknown operations before
                // this point; the message is a tripwire for future changes
                // that bypass it. Note that, despite the wording, execution
                // continues and nothing is appended for this column.
                cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl;
                break;
            }
        }

        // Separate results with a tab, but do not leave a trailing one.
        if (idx + 1 < numColOps) {
            _outVals.append('\t');
        }
    }

    // A numeric op hit a non-numeric value somewhere in the requested
    // columns: report it once, then reset the flag for the next call.
    if (_methods.nonNumErrFlagSet()) {
        cerr << _methods.getErrMsg() << endl;
        _methods.resetNonNumErrFlag();
    }

    return _outVals;
}
Neil Kindlon
committed
// Renders a double as text using the configured precision (_precision).
// The result is stored in the member _formatStr and returned by reference,
// so it is overwritten by the next call.
const QuickString &KeyListOps::format(double val)
{
    std::stringstream rendered;
    rendered.precision(_precision);
    rendered << val;
    _formatStr = rendered.str();
    return _formatStr;
}
void KeyListOpsHelp() {
cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl;
cerr << "\t\tValid operations:" << endl;
cerr << "\t\t sum, min, max, absmin, absmax," << endl;
cerr << "\t\t mean, median," << endl;
cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl;
cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl;
cerr << "\t\t count" << endl;
cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
cerr << "\t\tDefault: sum" << endl;
cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl;
cerr << "\t\tIf there is only column, but multiple operations, all operations will be" << endl;
cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl;
cerr << "\t\tmultiple columns, that operation will be applied to all columns." << endl;
cerr << "\t\tOtherwise, the number of columns must match the the number of operations," << endl;
cerr << "\t\tand will be applied in respective order." << endl;
cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl;
cerr << "\t\tthe mean of column 4, and the count of column 6." << endl;
cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl;
cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
cerr << "\t\t- Example: -delim \"|\"" << endl;
cerr << "\t\t- Default: \",\"." << endl << endl;
nkindlon
committed