Commit 77c1bdad authored by nkindlon's avatar nkindlon
Browse files

Created new KeyListOps class to replace vectorOps. Converted map tool to this.

parent 85df2b4a
......@@ -78,6 +78,7 @@ UTIL_SUBDIRS = $(SRC_DIR)/utils/bedFile \
$(SRC_DIR)/utils/gzstream \
$(SRC_DIR)/utils/fileType \
$(SRC_DIR)/utils/bedFilePE \
$(SRC_DIR)/utils/KeyListOps \
$(SRC_DIR)/utils/NewChromsweep \
$(SRC_DIR)/utils/sequenceUtilities \
$(SRC_DIR)/utils/tabFile \
......
......@@ -29,6 +29,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
......
......@@ -21,11 +21,14 @@ const int PRECISION = 21;
FileMap::FileMap(ContextMap *context)
: _context(context),
_blockMgr(NULL),
_recordOutputMgr(NULL)
_recordOutputMgr(NULL),
_colOps(_context->getColOps())
{
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_recordOutputMgr = new RecordOutputMgr();
_recordOutputMgr->init(_context);
_keyListOps.setNullValue(_context->getNullValue());
_keyListOps.setDelimStr(_context->getDelim());
}
FileMap::~FileMap(void) {
......@@ -43,78 +46,174 @@ bool FileMap::mapFiles()
}
RecordKeyList hitSet;
while (sweep.next(hitSet)) {
_outputValues.clear();
if (_context->getObeySplits()) {
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
SummarizeHits(resultSet);
_recordOutputMgr->printRecord(resultSet.getKey(), _output);
calculateOutput(resultSet);
_recordOutputMgr->printRecord(resultSet.getKey(), _outputValues);
} else {
SummarizeHits(hitSet);
_recordOutputMgr->printRecord(hitSet.getKey(), _output);
calculateOutput(hitSet);
_recordOutputMgr->printRecord(hitSet.getKey(), _outputValues);
}
}
return true;
}
void FileMap::ExtractColumnFromHits(RecordKeyList &hits) {
_column_vec.clear();
RecordKeyList::const_iterator_type iter = hits.begin();
for (; iter != hits.end(); iter = hits.next())
{
_column_vec.push_back(iter->value()->getField(_context->getColumn()).str());
}
}
void FileMap::SummarizeHits(RecordKeyList &hits) {
const QuickString & operation = _context->getColumnOperation();
_output.clear();
if (hits.size() == 0) {
if (operation == "count" || operation == "count_distinct")
_output.append("0");
else
_output.append(_context->getNullValue().str());
return;
}
_tmp_output.str("");
_tmp_output.clear();
ExtractColumnFromHits(hits);
VectorOps vo(_column_vec);
if (operation == "sum")
_tmp_output << setprecision (PRECISION) << vo.GetSum();
else if (operation == "mean")
_tmp_output << setprecision (PRECISION) << vo.GetMean();
else if (operation == "median")
_tmp_output << setprecision (PRECISION) << vo.GetMedian();
else if (operation == "min")
_tmp_output << setprecision (PRECISION) << vo.GetMin();
else if (operation == "max")
_tmp_output << setprecision (PRECISION) << vo.GetMax();
else if (operation == "absmin")
_tmp_output << setprecision (PRECISION) << vo.GetAbsMin();
else if (operation == "absmax")
_tmp_output << setprecision (PRECISION) << vo.GetAbsMax();
else if (operation == "mode")
_tmp_output << vo.GetMode();
else if (operation == "antimode")
_tmp_output << vo.GetAntiMode();
else if (operation == "count")
_tmp_output << setprecision (PRECISION) << vo.GetCount();
else if (operation == "count_distinct")
_tmp_output << setprecision (PRECISION) << vo.GetCountDistinct();
else if (operation == "collapse")
_tmp_output << vo.GetCollapse();
else if (operation == "distinct")
_tmp_output << vo.GetDistinct();
else {
cerr << "ERROR: " << operation << " is an unrecognized operation\n";
exit(1);
}
_output.append(_tmp_output.str());
void FileMap::calculateOutput(RecordKeyList &hits)
{
//loop through all requested columns, and for each one, call the method needed
//for the operation specified.
_keyListOps.setKeyList(&hits);
double val = 0.0;
for (int i=0; i < (int)_colOps.size(); i++) {
int col = _colOps[i].first;
KeyListOps::OP_TYPES opCode = _colOps[i].second;
_keyListOps.setColumn(col);
switch (opCode) {
case KeyListOps::SUM:
val = _keyListOps.getSum();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::MEAN:
val = _keyListOps.getMean();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::STDDEV:
val = _keyListOps.getStddev();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::SAMPLE_STDDEV:
val = _keyListOps.getSampleStddev();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::MEDIAN:
val = _keyListOps.getMedian();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::MODE:
_outputValues.append(_keyListOps.getMode());
break;
case KeyListOps::ANTIMODE:
_outputValues.append(_keyListOps.getAntiMode());
break;
case KeyListOps::MIN:
val = _keyListOps.getMin();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::MAX:
val = _keyListOps.getMax();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::ABSMIN:
val = _keyListOps.getAbsMin();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::ABSMAX:
val = _keyListOps.getAbsMax();
if (isnan(val)) {
_outputValues.append(_context->getNullValue());
} else {
_outputValues.append(val);
}
break;
case KeyListOps::COUNT:
_outputValues.append(_keyListOps.getCount());
break;
case KeyListOps::DISTINCT:
_outputValues.append(_keyListOps.getDistinct());
break;
case KeyListOps::COUNT_DISTINCT:
_outputValues.append(_keyListOps.getCountDistinct());
break;
case KeyListOps::DISTINCT_ONLY:
_outputValues.append(_keyListOps.getDistinctOnly());
break;
case KeyListOps::COLLAPSE:
_outputValues.append(_keyListOps.getCollapse());
break;
case KeyListOps::CONCAT:
_outputValues.append(_keyListOps.getConcat());
break;
case KeyListOps::FREQ_ASC:
_outputValues.append(_keyListOps.getFreqAsc());
break;
case KeyListOps::FREQ_DESC:
_outputValues.append(_keyListOps.getFreqDesc());
break;
case KeyListOps::FIRST:
_outputValues.append(_keyListOps.getFirst());
break;
case KeyListOps::LAST:
_outputValues.append(_keyListOps.getLast());
break;
case KeyListOps::INVALID:
default:
// Any unrecognized operation should have been handled already in the context validation.
// It's thus unnecessary to handle it here, but throw an error to help us know if future
// refactoring or code changes accidentally bypass the validation phase.
cerr << "ERROR: Invalid operation given for column " << col << ". Exiting..." << endl;
break;
}
//if this isn't the last column, add a tab.
if (i < (int)_colOps.size() -1) {
_outputValues.append('\t');
}
}
}
......@@ -18,10 +18,11 @@ using namespace std;
#include <iomanip>
#include "VectorOps.h"
#include "RecordKeyList.h"
#include "KeyListOps.h"
#include "ContextMap.h"
using namespace std;
class ContextMap;
class BlockMgr;
class RecordOutputMgr;
......@@ -35,90 +36,13 @@ public:
private:
ContextMap *_context;
Record *_queryRec;
Record *_databaseRec;
BlockMgr *_blockMgr;
RecordOutputMgr *_recordOutputMgr;
KeyListOps _keyListOps;
const ContextMap::colOpsType & _colOps;
QuickString _outputValues; // placeholder for the results of mapping B to each a in A.
vector<string> _column_vec; // vector to hold current column's worth of data
ostringstream _tmp_output;
QuickString _output; // placeholder for the results of mapping B to each a in A.
//------------------------------------------------
// private methods
//------------------------------------------------
void Map();
void SummarizeHits(RecordKeyList &hits);
void ExtractColumnFromHits(RecordKeyList &hits);
void calculateOutput(RecordKeyList &hits);
};
#endif /* MAPFILE_H */
/*
#include "bedFile.h"
#include "chromsweep.h"
#include "VectorOps.h"
#include "api/BamReader.h"
#include "api/BamWriter.h"
#include "api/BamAux.h"
#include "BamAncillary.h"
using namespace BamTools;
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <fstream>
#include <iomanip>
#include <stdlib.h>
using namespace std;
class BedMap {
public:
// constructor
BedMap(string bedAFile, string bedBFile, int column, string operation,
float overlapFraction, bool sameStrand,
bool diffStrand, bool reciprocal,
bool choseNullValue, string nullValue,
bool printHeader);
// destructor
~BedMap(void);
private:
//------------------------------------------------
// private attributes
//------------------------------------------------
string _bedAFile;
string _bedBFile;
int _column;
string _operation;
bool _sameStrand;
bool _diffStrand;
bool _reciprocal;
float _overlapFraction;
string _nullValue;
bool _printHeader;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
vector<string> _column_vec; // vector to hold current column's worth of data
//------------------------------------------------
// private methods
//------------------------------------------------
void Map();
string MapHits(const BED &a, const vector<BED> &hits);
void ExtractColumnFromHits(const vector<BED> &hits);
};
*/
//#endif /* MAPFILE_H */
......@@ -38,144 +38,6 @@ int map_main(int argc, char* argv[]) {
return retVal ? 0 : 1;
}
/*
int map_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
int column = 5;
string operation = "sum";
string nullValue = ".";
// input arguments
float overlapFraction = 1E-9;
bool haveBedA = false;
bool haveBedB = false;
bool haveColumn = false;
bool haveOperation = false;
bool haveFraction = false;
bool reciprocalFraction = false;
bool sameStrand = false;
bool diffStrand = false;
bool printHeader = false;
bool choseNullValue = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) map_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
if ((i+1) < argc) {
haveColumn = true;
column = atoi(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-o", 2, parameterLength)) {
if ((i+1) < argc) {
haveOperation = true;
operation = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
haveFraction = true;
overlapFraction = atof(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
reciprocalFraction = true;
}
else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
sameStrand = true;
}
else if (PARAMETER_CHECK("-S", 2, parameterLength)) {
diffStrand = true;
}
else if (PARAMETER_CHECK("-null", 5, parameterLength)) {
nullValue = argv[i + 1];
choseNullValue = true;
i++;
}
else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
printHeader = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (reciprocalFraction && !haveFraction) {
cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
showHelp = true;
}
if (sameStrand && diffStrand) {
cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedMap *bm = new BedMap(bedAFile, bedBFile, column, operation,
overlapFraction, sameStrand,
diffStrand, reciprocalFraction,
choseNullValue, nullValue,
printHeader);
delete bm;
return 0;
}
else {
map_help();
return 0;
}
}
*/
void map_help(void) {
cerr << "\nTool: bedtools map (aka mapBed)" << endl;
......
......@@ -57,15 +57,6 @@ ContextBase::ContextBase()
_programNames["intersect"] = INTERSECT;
_programNames["sample"] = SAMPLE;
_programNames["map"] = MAP;
_validScoreOps.insert("sum");
_validScoreOps.insert("max");
_validScoreOps.insert("min");
_validScoreOps.insert("mean");
_validScoreOps.insert("mode");
_validScoreOps.insert("median");
_validScoreOps.insert("antimode");
_validScoreOps.insert("collapse");
}
ContextBase::~ContextBase()
......
......@@ -191,15 +191,11 @@ protected:
int _bamHeaderAndRefIdx;
int _maxNumDatabaseFields;
bool _useFullBamTags;
QuickString _columnOperation;
int _column;
QuickString _nullValue;
bool _reportCount;
int _maxDistance;
bool _reportNames;
bool _reportScores;
QuickString _scoreOp;
set<QuickString> _validScoreOps;
int _numOutputRecords;
......
......@@ -21,6 +21,8 @@ public:
//NOTE: Query and database files will only be marked as such by either the
//parseCmdArgs method, or by explicitly setting them.
FileRecordMgr *getQueryFile() { return getFile(_queryFileIdx); }
FileRecordMgr *getDatabaseFile() { return getFile(_databaseFileIdx); }
int getQueryFileIdx() const { return _queryFileIdx; }
void setQueryFileIdx(int idx) { _queryFileIdx = idx; }
int getDatabaseFileIdx() const { return _databaseFileIdx; }
......
......@@ -8,15 +8,16 @@
#include "ContextMap.h"
ContextMap::ContextMap()
: _delimStr(",")
{
// map requires sorted input
setSortedInput(true);
setLeftJoin(true);
// default to BED score column
setColumn(5);
setColumns("5");
// default to "sum"
setColumnOperation("sum");
setOperations("sum");
// default to "." as a NULL value
setNullValue('.');
}
......@@ -53,6 +54,10 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-null") == 0) {
if (!handle_null()) return false;
}
else if (strcmp(_argv[_i], "-delim") == 0) {
if (!handle_delim()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
}
......@@ -66,23 +71,64 @@ bool ContextMap::isValidState()
if (getDatabaseFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
//throw Error
cerr << endl << "*****"
<< endl
cerr << endl << "*****" << endl
<< "***** ERROR: BAM database file not currently supported for the map tool."
<< endl;
exit(1);
}
// TODO
// enforce any specific checks for Map.
//get the strings from context containing the comma-delimited lists of columns
//and operations. Split both of these into vectors. Get the operation code
//for each operation string. Finally, make a vector of pairs, where the first
//member of each pair is a column number, and the second member is the code for the
//operation to perform on that column.
vector<QuickString> columnsVec;
vector<QuickString> opsVec;
int numCols = Tokenize(_columns, columnsVec, ',');
int numOps = Tokenize(_operations, opsVec, ',');
if (numOps < 1 || numCols < 1) {
cerr << endl << "*****" <&