// KeyListOps.cpp

/*
 * KeyListOps.cpp
 *
 *  Created on: Feb 24, 2014
 *      Author: nek3d
 */
#include "KeyListOps.h"
#include "FileRecordMgr.h"
#include <cmath> //for isnan
#include <sstream>
#include <iomanip>

/*
 * Default constructor.
 *
 * Fills the two lookup tables used by the rest of the class:
 *   - _opCodes:     operation name (as given on the command line) -> op code
 *   - _isNumericOp: op code -> whether the op is evaluated numerically
 * and installs the defaults: "," as delimiter, "." as null value,
 * column 5, operation "sum", and DEFAULT_PRECISION.
 */
KeyListOps::KeyListOps():
_dbFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE)
{
	_opCodes["sum"] = SUM;
	_opCodes["mean"] = MEAN;
	_opCodes["stddev"] = STDDEV;
	_opCodes["sample_stddev"] = SAMPLE_STDDEV;
	_opCodes["median"] = MEDIAN;
	_opCodes["mode"] = MODE;
	_opCodes["antimode"] = ANTIMODE;
	_opCodes["min"] = MIN;
	_opCodes["max"] = MAX;
	_opCodes["absmin"] = ABSMIN;
	_opCodes["absmax"] = ABSMAX;
	_opCodes["count"] = COUNT;
	_opCodes["distinct"] = DISTINCT;
	_opCodes["count_distinct"] = COUNT_DISTINCT;
	_opCodes["distinct_only"] = DISTINCT_ONLY;
	_opCodes["collapse"] = COLLAPSE;
	_opCodes["concat"] = CONCAT;
	_opCodes["freq_asc"] = FREQ_ASC;
	_opCodes["freq_desc"] = FREQ_DESC;
	_opCodes["first"] = FIRST;
	_opCodes["last"] = LAST;

	// isNumericOp() reports any op missing from this table as non-numeric,
	// so every op that getOpVals() evaluates as a double must be listed
	// here explicitly. SAMPLE_STDDEV and ABSMAX were previously missing and
	// were therefore misreported as non-numeric.
	_isNumericOp[SUM] = true;
	_isNumericOp[MEAN] = true;
	_isNumericOp[STDDEV] = true;
	_isNumericOp[SAMPLE_STDDEV] = true;
	_isNumericOp[MEDIAN] = true;
	_isNumericOp[MODE] = false;
	_isNumericOp[ANTIMODE] = false;
	_isNumericOp[MIN] = true;
	_isNumericOp[MAX] = true;
	_isNumericOp[ABSMIN] = true;
	_isNumericOp[ABSMAX] = true;
	_isNumericOp[COUNT] = false;
	_isNumericOp[DISTINCT] = false;
	_isNumericOp[COUNT_DISTINCT] = false;
	_isNumericOp[DISTINCT_ONLY] = false;
	_isNumericOp[COLLAPSE] = false;
	_isNumericOp[CONCAT] = false;
	_isNumericOp[FREQ_ASC] = false;
	_isNumericOp[FREQ_DESC] = false;
	_isNumericOp[FIRST] = false;
	_isNumericOp[LAST] = false;

	_methods.setDelimStr(",");
	_methods.setNullValue(".");

	// default to BED score column
	_columns = "5";
	// default to "sum"
	_operations = "sum";
	_precision = DEFAULT_PRECISION;

}

// Looks up the given op code in _isNumericOp and returns its flag.
// A code with no entry in the table is reported as non-numeric.
bool KeyListOps::isNumericOp(OP_TYPES op) const {
	const map<OP_TYPES, bool>::const_iterator entry = _isNumericOp.find(op);
	if (entry != _isNumericOp.end()) {
		return entry->second;
	}
	return false;
}

bool KeyListOps::isNumericOp(const QuickString &op) const {
	return isNumericOp(getOpCode(op));
}

// Translates an operation name into its op code using the _opCodes table.
// Returns INVALID when the name is not registered.
KeyListOps::OP_TYPES KeyListOps::getOpCode(const QuickString &operation) const {
	const map<QuickString, OP_TYPES>::const_iterator match = _opCodes.find(operation);
	return (match != _opCodes.end()) ? match->second : INVALID;
}


/*
 * Validates the requested column/operation lists against the database file
 * and records the resulting (column, op code) pairs in _colOps.
 *
 * _columns and _operations are comma-delimited lists. Pairing rules:
 *   - one column, many operations : every operation is applied to that column;
 *   - many columns, one operation : that operation is applied to every column;
 *   - otherwise the two lists must have equal length and are paired in order.
 *
 * Returns false (after printing a diagnostic to cerr) when either list is
 * empty, the list lengths are incompatible, a column lies outside
 * 1..dbFile->getNumFields(), an operation name is unknown, or column 2 is
 * requested while _dbFileType is BAM. Returns true otherwise.
 *
 * Pairs are appended to _colOps as they are validated, so pairs accepted
 * before a failure remain in _colOps.
 */
bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) {

	Tokenizer colTokens;
	Tokenizer opsTokens;

	const int numCols = colTokens.tokenize(_columns, ',');
	const int numOps = opsTokens.tokenize(_operations, ',');

	const bool listMissing = (numOps < 1) || (numCols < 1);
	if (listMissing) {
		cerr << endl << "*****" << endl
				<< "***** ERROR: There must be at least one column and at least one operation named." << endl;
		return false;
	}

	const bool multipleCols = (numCols > 1);
	const bool multipleOps = (numOps > 1);
	if (multipleOps && multipleCols && numCols != numOps) {
		cerr << endl << "*****" << endl
				<< "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl;
		cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl;
		cerr << "\ta single column to which all operations will be applied," << endl;
		cerr << "\tor an operation for each column." << endl;
		return false;
	}

	// The longer of the two lists decides how many pairs are produced; a
	// single-element list is reused for every pair.
	const int numPairs = max(numCols, numOps);
	for (int pairIdx = 0; pairIdx < numPairs; ++pairIdx) {
		const int colIdx = multipleCols ? pairIdx : 0;
		const int opIdx = multipleOps ? pairIdx : 0;

		const int col = str2chrPos(colTokens.getElem(colIdx));

		// the column number must address an existing field of the database file
		if (col < 1 || col > dbFile->getNumFields()) {
			cerr << endl << "*****" << endl  << "***** ERROR: Requested column " << col << ", but database file "
					<< dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl;
			return false;
		}

		const QuickString &operation = opsTokens.getElem(opIdx);
		const OP_TYPES opCode = getOpCode(operation);
		if (opCode == INVALID) {
			cerr << endl << "*****" << endl
					<< "***** ERROR: " << operation << " is not a valid operation. " << endl;
			return false;
		}

		_colOps.push_back(pair<int, OP_TYPES>(col, opCode));
	}

	// Everything below only concerns BAM input.
	if (_dbFileType != FileRecordTypeChecker::BAM_FILE_TYPE) {
		return true;
	}

	// tell the methods class we're dealing with BAM.
	_methods.setIsBam(true);

	// Column 2 of a BAM record is the flags field, which is currently not
	// supported, so any request for it is rejected.
	for (size_t pos = 0; pos < _colOps.size(); pos++) {
		if (_colOps[pos].first != 2) {
			continue;
		}
		cerr << endl << "*****" << endl << "***** ERROR: Requested column 2 of a BAM file, which is the Flags field." << endl;
		cerr << "             We currently do not support this, but may in future versions." << endl;
		return false;
	}

	return true;
}

/*
 * Evaluates every (column, operation) pair in _colOps against the given
 * hits and returns the results as one tab-separated string.
 *
 * Numeric operations produce a double; a NaN result is written as the
 * methods object's null value, anything else goes through format().
 * The remaining operations append whatever the corresponding _methods
 * getter returns.
 *
 * The returned reference points at the member buffer _outVals, which is
 * cleared and rebuilt on every call.
 */
const QuickString & KeyListOps::getOpVals(RecordKeyVector &hits)
{
	_methods.setKeyList(&hits);
	_outVals.clear();

	const size_t numColOps = _colOps.size();
	for (size_t idx = 0; idx < numColOps; ++idx) {
		const int col = _colOps[idx].first;
		const OP_TYPES opCode = _colOps[idx].second;

		_methods.setColumn(col);

		// Numeric ops only compute the value inside the switch; the shared
		// NaN check / formatting happens once, right after it.
		double val = 0.0;
		bool hasNumericResult = false;

		switch (opCode) {
		case SUM:
			val = _methods.getSum();
			hasNumericResult = true;
			break;

		case MEAN:
			val = _methods.getMean();
			hasNumericResult = true;
			break;

		case STDDEV:
			val = _methods.getStddev();
			hasNumericResult = true;
			break;

		case SAMPLE_STDDEV:
			val = _methods.getSampleStddev();
			hasNumericResult = true;
			break;

		case MEDIAN:
			val = _methods.getMedian();
			hasNumericResult = true;
			break;

		case MIN:
			val = _methods.getMin();
			hasNumericResult = true;
			break;

		case MAX:
			val = _methods.getMax();
			hasNumericResult = true;
			break;

		case ABSMIN:
			val = _methods.getAbsMin();
			hasNumericResult = true;
			break;

		case ABSMAX:
			val = _methods.getAbsMax();
			hasNumericResult = true;
			break;

		case MODE:
			_outVals.append(_methods.getMode());
			break;

		case ANTIMODE:
			_outVals.append(_methods.getAntiMode());
			break;

		case COUNT:
			_outVals.append(_methods.getCount());
			break;

		case DISTINCT:
			_outVals.append(_methods.getDistinct());
			break;

		case COUNT_DISTINCT:
			_outVals.append(_methods.getCountDistinct());
			break;

		case DISTINCT_ONLY:
			_outVals.append(_methods.getDistinctOnly());
			break;

		case COLLAPSE:
			_outVals.append(_methods.getCollapse());
			break;

		case CONCAT:
			_outVals.append(_methods.getConcat());
			break;

		case FREQ_ASC:
			_outVals.append(_methods.getFreqAsc());
			break;

		case FREQ_DESC:
			_outVals.append(_methods.getFreqDesc());
			break;

		case FIRST:
			_outVals.append(_methods.getFirst());
			break;

		case LAST:
			_outVals.append(_methods.getLast());
			break;

		case INVALID:
		default:
			// Unrecognized operations are expected to be rejected during
			// validation; this message only exists to reveal a code path
			// that bypassed it.
			// NOTE(review): the message says "Exiting..." but execution
			// simply continues with the next column - confirm intent.
			cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl;
			break;
		}

		if (hasNumericResult) {
			if (isnan(val)) {
				_outVals.append(_methods.getNullValue());
			} else {
				_outVals.append(format(val));
			}
		}

		// separate results with a tab, except after the last one.
		if (idx + 1 < numColOps) {
			_outVals.append('\t');
		}
	}

	// A numeric op met a non-numeric value somewhere: report it and clear
	// the flag.
	if (_methods.nonNumErrFlagSet()) {
		cerr << _methods.getErrMsg() << endl;
		_methods.resetNonNumErrFlag();
	}
	return _outVals;
}

// Converts val to text using a string stream with std::setprecision(_precision)
// applied, stores the text in the member buffer _formatStr and returns a
// reference to that buffer (overwritten by the next call).
const QuickString &KeyListOps::format(double val)
{
	std::ostringstream rendered;
	rendered << std::setprecision(_precision);
	rendered << val;
	_formatStr = rendered.str();
	return _formatStr;
}

/*
 * Prints the usage text for the -o (operation) and -delim options to stderr.
 *
 * The operation names listed mirror the names registered in the _opCodes
 * table by the KeyListOps constructor; keep the two in sync when adding or
 * removing an operation.
 */
void KeyListOpsHelp() {

    // Adjacent string literals are concatenated by the compiler, so each
    // paragraph of the help text is written with a single stream insertion.
    std::cerr << "\t-o\t"  "Specify the operation that should be applied to -c.\n"
                 "\t\tValid operations:\n"
                 "\t\t    sum, min, max, absmin, absmax,\n"
                 "\t\t    mean, median, mode, antimode,\n"
                 "\t\t    stddev, sample_stddev,\n"
                 "\t\t    collapse (i.e., print a delimited list (duplicates allowed)), \n"
                 "\t\t    distinct (i.e., print a delimited list (NO duplicates allowed)), \n"
                 "\t\t    distinct_only, concat, freq_asc, freq_desc,\n"
                 "\t\t    first, last,\n"
                 "\t\t    count\n"
                 "\t\t    count_distinct (i.e., a count of the unique values in the column), \n"
                 "\t\tDefault: sum\n"
                 "\t\tMultiple operations can be specified in a comma-delimited list.\n"
                 "\n";

    std::cerr << "\t\tIf there is only one column, but multiple operations, all operations will be\n"
                 "\t\tapplied on that column. Likewise, if there is only one operation, but\n"
                 "\t\tmultiple columns, that operation will be applied to all columns.\n"
                 "\t\tOtherwise, the number of columns must match the number of operations,\n"
                 "\t\tand will be applied in respective order.\n"
                 "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5,\n"
                 "\t\tthe mean of column 4, and the count of column 6.\n"
                 "\t\tThe order of output columns will match the ordering given in the command.\n"
                 "\n"
                 "\n";

    std::cerr << "\t-delim\t"  "Specify a custom delimiter for the collapse operations.\n"
                 "\t\t- Example: -delim \"|\"\n"
                 "\t\t- Default: \",\".\n"
                 "\n";

    // Flush explicitly, as the original std::endl-based version did.
    std::cerr.flush();
}