Commit 2268ee4f authored by nkindlon's avatar nkindlon
Browse files

Refactored KeyListOps, Context, mapFile for KeyListOps re-usability by other tools.

parent 85df2b4a
......@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
......
......@@ -29,6 +29,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
......
......@@ -47,74 +47,11 @@ bool FileMap::mapFiles()
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
SummarizeHits(resultSet);
_recordOutputMgr->printRecord(resultSet.getKey(), _output);
_recordOutputMgr->printRecord(resultSet.getKey(), _context->getColumnOpsVal(resultSet));
} else {
SummarizeHits(hitSet);
_recordOutputMgr->printRecord(hitSet.getKey(), _output);
_recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet));
}
}
return true;
}
// Rebuilds _column_vec from the given hit list: the vector is emptied first,
// then for every hit the field selected by _context->getColumn() is appended
// in its .str() form.
void FileMap::ExtractColumnFromHits(RecordKeyList &hits) {
	_column_vec.clear();

	RecordKeyList::const_iterator_type hitIter = hits.begin();
	while (hitIter != hits.end()) {
		_column_vec.push_back(hitIter->value()->getField(_context->getColumn()).str());
		// The list itself hands out the next position; the iterator is not incremented directly.
		hitIter = hits.next();
	}
}
// Applies the context's column operation to the given hits and leaves the
// textual result in _output (which is cleared first).
//
// With no hits, "count" and "count_distinct" yield "0"; every other operation
// yields the context's null value. Otherwise the selected column is extracted
// from the hits and handed to VectorOps, and the result of the requested
// operation is formatted through _tmp_output. An operation name that matches
// none of the known ones prints an error to stderr and exits with status 1.
void FileMap::SummarizeHits(RecordKeyList &hits) {
	const QuickString &op = _context->getColumnOperation();
	_output.clear();

	// Nothing overlapped: no column data to summarize.
	if (hits.size() == 0) {
		const bool isCountOp = (op == "count" || op == "count_distinct");
		if (isCountOp) {
			_output.append("0");
		} else {
			_output.append(_context->getNullValue().str());
		}
		return;
	}

	// Reset the scratch stream: drop previous text and clear its state flags.
	_tmp_output.str("");
	_tmp_output.clear();

	ExtractColumnFromHits(hits);
	VectorOps columnOps(_column_vec);

	if (op == "sum") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetSum();
	} else if (op == "mean") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetMean();
	} else if (op == "median") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetMedian();
	} else if (op == "min") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetMin();
	} else if (op == "max") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetMax();
	} else if (op == "absmin") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetAbsMin();
	} else if (op == "absmax") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetAbsMax();
	} else if (op == "mode") {
		_tmp_output << columnOps.GetMode();
	} else if (op == "antimode") {
		_tmp_output << columnOps.GetAntiMode();
	} else if (op == "count") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetCount();
	} else if (op == "count_distinct") {
		_tmp_output << setprecision (PRECISION) << columnOps.GetCountDistinct();
	} else if (op == "collapse") {
		_tmp_output << columnOps.GetCollapse();
	} else if (op == "distinct") {
		_tmp_output << columnOps.GetDistinct();
	} else {
		// Unknown operation name: report it and abort the program.
		cerr << "ERROR: " << op << " is an unrecognized operation\n";
		exit(1);
	}

	_output.append(_tmp_output.str());
}
......@@ -18,10 +18,11 @@ using namespace std;
#include <iomanip>
#include "VectorOps.h"
#include "RecordKeyList.h"
#include "KeyListOps.h"
#include "ContextMap.h"
using namespace std;
class ContextMap;
class BlockMgr;
class RecordOutputMgr;
......@@ -35,90 +36,8 @@ public:
private:
ContextMap *_context;
Record *_queryRec;
Record *_databaseRec;
BlockMgr *_blockMgr;
RecordOutputMgr *_recordOutputMgr;
vector<string> _column_vec; // vector to hold current column's worth of data
ostringstream _tmp_output;
QuickString _output; // placeholder for the results of mapping B to each a in A.
//------------------------------------------------
// private methods
//------------------------------------------------
void Map();
void SummarizeHits(RecordKeyList &hits);
void ExtractColumnFromHits(RecordKeyList &hits);
};
#endif /* MAPFILE_H */
/*
#include "bedFile.h"
#include "chromsweep.h"
#include "VectorOps.h"
#include "api/BamReader.h"
#include "api/BamWriter.h"
#include "api/BamAux.h"
#include "BamAncillary.h"
using namespace BamTools;
#include <vector>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <fstream>
#include <iomanip>
#include <stdlib.h>
using namespace std;
class BedMap {
public:
// constructor
BedMap(string bedAFile, string bedBFile, int column, string operation,
float overlapFraction, bool sameStrand,
bool diffStrand, bool reciprocal,
bool choseNullValue, string nullValue,
bool printHeader);
// destructor
~BedMap(void);
private:
//------------------------------------------------
// private attributes
//------------------------------------------------
string _bedAFile;
string _bedBFile;
int _column;
string _operation;
bool _sameStrand;
bool _diffStrand;
bool _reciprocal;
float _overlapFraction;
string _nullValue;
bool _printHeader;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
vector<string> _column_vec; // vector to hold current column's worth of data
//------------------------------------------------
// private methods
//------------------------------------------------
void Map();
string MapHits(const BED &a, const vector<BED> &hits);
void ExtractColumnFromHits(const vector<BED> &hits);
};
*/
//#endif /* MAPFILE_H */
......@@ -38,144 +38,6 @@ int map_main(int argc, char* argv[]) {
return retVal ? 0 : 1;
}
/*
int map_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
int column = 5;
string operation = "sum";
string nullValue = ".";
// input arguments
float overlapFraction = 1E-9;
bool haveBedA = false;
bool haveBedB = false;
bool haveColumn = false;
bool haveOperation = false;
bool haveFraction = false;
bool reciprocalFraction = false;
bool sameStrand = false;
bool diffStrand = false;
bool printHeader = false;
bool choseNullValue = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) map_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-c", 2, parameterLength)) {
if ((i+1) < argc) {
haveColumn = true;
column = atoi(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-o", 2, parameterLength)) {
if ((i+1) < argc) {
haveOperation = true;
operation = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
haveFraction = true;
overlapFraction = atof(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
reciprocalFraction = true;
}
else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
sameStrand = true;
}
else if (PARAMETER_CHECK("-S", 2, parameterLength)) {
diffStrand = true;
}
else if (PARAMETER_CHECK("-null", 5, parameterLength)) {
nullValue = argv[i + 1];
choseNullValue = true;
i++;
}
else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
printHeader = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (reciprocalFraction && !haveFraction) {
cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
showHelp = true;
}
if (sameStrand && diffStrand) {
cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedMap *bm = new BedMap(bedAFile, bedBFile, column, operation,
overlapFraction, sameStrand,
diffStrand, reciprocalFraction,
choseNullValue, nullValue,
printHeader);
delete bm;
return 0;
}
else {
map_help();
return 0;
}
}
*/
void map_help(void) {
cerr << "\nTool: bedtools map (aka mapBed)" << endl;
......
......@@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders \
-I$(UTILITIES_DIR)/FileRecordTools/Records \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/general \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/GenomeFile/ \
......
......@@ -18,6 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders \
-I$(UTILITIES_DIR)/FileRecordTools/Records \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/general
# ----------------------------------
......
......@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/version/
......
......@@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
......
......@@ -52,20 +52,16 @@ ContextBase::ContextBase()
_hasConstantSeed(false),
_seed(0),
_forwardOnly(false),
_reverseOnly(false)
_reverseOnly(false),
_hasColumnOpsMethods(false)
{
_programNames["intersect"] = INTERSECT;
_programNames["sample"] = SAMPLE;
_programNames["map"] = MAP;
_validScoreOps.insert("sum");
_validScoreOps.insert("max");
_validScoreOps.insert("min");
_validScoreOps.insert("mean");
_validScoreOps.insert("mode");
_validScoreOps.insert("median");
_validScoreOps.insert("antimode");
_validScoreOps.insert("collapse");
if (hasColumnOpsMethods()) {
_keyListOps = new KeyListOps();
}
}
ContextBase::~ContextBase()
......@@ -79,6 +75,11 @@ ContextBase::~ContextBase()
delete _files[i];
_files[i] = NULL;
}
if (hasColumnOpsMethods()) {
delete _keyListOps;
_keyListOps = NULL;
}
}
bool ContextBase::determineOutputType() {
......@@ -176,6 +177,19 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(_argv[_i], "-seed") == 0) {
if (!handle_seed()) return false;
}
else if (strcmp(_argv[_i], "-o") == 0) {
if (!handle_o()) return false;
}
else if (strcmp(_argv[_i], "-c") == 0) {
if (!handle_c()) return false;
}
else if (strcmp(_argv[_i], "-null") == 0) {
if (!handle_null()) return false;
}
else if (strcmp(_argv[_i], "-delim") == 0) {
if (!handle_delim()) return false;
}
}
return true;
}
......@@ -191,6 +205,12 @@ bool ContextBase::isValidState()
if (!determineOutputType()) {
return false;
}
if (hasColumnOpsMethods()) {
FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0);
if (!_keyListOps->isValidColumnOps(dbFile)) {
return false;
}
}
return true;
}
......@@ -363,3 +383,85 @@ bool ContextBase::handle_ubam()
markUsed(_i - _skipFirstArgs);
return true;
}
// Methods specific to column operations.
// for col ops, -c is the string of columns upon which to operate
// Consumes "-c <columns>" at the current argument position and forwards the
// value to the KeyListOps object via setColumns().
// Returns false if column operations are not available on this context.
// Otherwise returns true; note that when no value follows the flag, nothing
// is stored, nothing is marked as used, and true is still returned.
bool ContextBase::handle_c()
{
	if (!hasColumnOpsMethods()) {
		return false;
	}
	// Flag is the last argument: leave it untouched.
	if ((_i + 1) >= _argc) {
		return true;
	}
	_keyListOps->setColumns(_argv[_i + 1]);
	markUsed(_i - _skipFirstArgs);   // the flag itself
	++_i;
	markUsed(_i - _skipFirstArgs);   // the flag's value
	return true;
}
// for col ops, -o is the string of operations to apply to the columns (-c)
// Consumes "-o <operations>" at the current argument position and forwards
// the value to the KeyListOps object via setOperations().
// Returns false if column operations are not available on this context.
// Otherwise returns true; note that when no value follows the flag, nothing
// is stored, nothing is marked as used, and true is still returned.
bool ContextBase::handle_o()
{
	if (!hasColumnOpsMethods()) {
		return false;
	}
	// Flag is the last argument: leave it untouched.
	if ((_i + 1) >= _argc) {
		return true;
	}
	_keyListOps->setOperations(_argv[_i + 1]);
	markUsed(_i - _skipFirstArgs);   // the flag itself
	++_i;
	markUsed(_i - _skipFirstArgs);   // the flag's value
	return true;
}
// for col ops, -null is a NULL value assigned
// when no overlaps are detected.
// Consumes "-null <value>" at the current argument position and forwards the
// value to the KeyListOps object via setNullValue().
// Returns false if column operations are not available on this context.
// Otherwise returns true; note that when no value follows the flag, nothing
// is stored, nothing is marked as used, and true is still returned.
bool ContextBase::handle_null()
{
	if (!hasColumnOpsMethods()) {
		return false;
	}
	// Flag is the last argument: leave it untouched.
	if ((_i + 1) >= _argc) {
		return true;
	}
	_keyListOps->setNullValue(_argv[_i + 1]);
	markUsed(_i - _skipFirstArgs);   // the flag itself
	++_i;
	markUsed(_i - _skipFirstArgs);   // the flag's value
	return true;
}
//for col ops, delimStr will appear between each item in
//a collapsed but delimited list.
// Consumes "-delim <string>" at the current argument position and forwards
// the value to the KeyListOps object via setDelimStr().
// Returns false if column operations are not available on this context.
// Otherwise returns true; note that when no value follows the flag, nothing
// is stored, nothing is marked as used, and true is still returned.
bool ContextBase::handle_delim()
{
	if (!hasColumnOpsMethods()) {
		return false;
	}
	// Flag is the last argument: leave it untouched.
	if ((_i + 1) >= _argc) {
		return true;
	}
	_keyListOps->setDelimStr(_argv[_i + 1]);
	markUsed(_i - _skipFirstArgs);   // the flag itself
	++_i;
	markUsed(_i - _skipFirstArgs);   // the flag's value
	return true;
}
// Enables (val == true) or disables (val == false) column operations for
// this context.
//
// Enabling always installs a freshly constructed KeyListOps. Any KeyListOps
// created by an earlier enabling call is deleted first, so repeated calls no
// longer leak the previous object. Disabling also releases it, since the
// destructor only deletes _keyListOps while the flag is still true.
//
// NOTE(review): this relies on _keyListOps being a valid allocation whenever
// _hasColumnOpsMethods is true, i.e. on the flag only ever being changed
// through this method -- confirm no subclass writes the flag directly.
void ContextBase::setColumnOpsMethods(bool val)
{
	if (_hasColumnOpsMethods) {
		// Release the object owned from a previous enabling call.
		delete _keyListOps;
		_keyListOps = NULL;
	}
	_hasColumnOpsMethods = val;
	if (val) {
		_keyListOps = new KeyListOps();
	}
}
// Returns the column-operation result for keyList as produced by
// KeyListOps::getOpVals(). When column operations are not available on this
// context, the placeholder member _nullStr is returned instead.
const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const {
	if (hasColumnOpsMethods()) {
		return _keyListOps->getOpVals(keyList);
	}
	return _nullStr;
}
......@@ -24,6 +24,7 @@
#include "NewGenomeFile.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
#include "KeyListOps.h"
class ContextBase {
......@@ -144,6 +145,13 @@ public:
//methods.
virtual bool hasIntersectMethods() const { return false; }
// determine whether column operations like those used in map
// are available.
void setColumnOpsMethods(bool val);
virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; }
const QuickString &getColumnOpsVal(RecordKeyList &keyList) const;
//methods applicable only to column operations.
protected:
PROGRAM_TYPE _program;
......@@ -191,15 +199,11 @@ protected:
int _bamHeaderAndRefIdx;
int _maxNumDatabaseFields;
bool _useFullBamTags;
QuickString _columnOperation;
int _column;
QuickString _nullValue;
bool _reportCount;
int _maxDistance;
bool _reportNames;
bool _reportScores;
QuickString _scoreOp;
set<QuickString> _validScoreOps;
int _numOutputRecords;
......@@ -208,6 +212,10 @@ protected:
bool _forwardOnly;
bool _reverseOnly;
bool _hasColumnOpsMethods;
KeyListOps *_keyListOps;
QuickString _nullStr; //placeholder return value when col ops aren't valid.
void markUsed(int i) { _argsProcessed[i] = true; }
bool isUsed(int i) const { return _argsProcessed[i]; }
bool cmdArgsValid();
......@@ -231,6 +239,11 @@ protected:
virtual bool handle_split();
virtual bool handle_sorted();
virtual bool handle_ubam();
virtual bool handle_c();
virtual bool handle_o();
virtual bool handle_null();
virtual bool handle_delim();
};
#endif /* CONTEXTBASE_H_ */
......@@ -21,6 +21,8 @@ public:
//NOTE: Query and database files will only be marked as such by either the
//parseCmdArgs method, or by explicitly setting them.
FileRecordMgr *getQueryFile() { return getFile(_queryFileIdx); }
FileRecordMgr *getDatabaseFile() { return getFile(_databaseFileIdx); }
int getQueryFileIdx() const { return _queryFileIdx; }
void setQueryFileIdx(int idx) { _queryFileIdx = idx; }
int getDatabaseFileIdx() const { return _databaseFileIdx; }
......
......@@ -12,13 +12,7 @@ ContextMap::ContextMap()
// map requires sorted input
setSortedInput(true);
setLeftJoin(true);
// default to BED score column
setColumn(5);
// default to "sum"
setColumnOperation("sum");
// default to "." as a NULL value
setNullValue('.');
setColumnOpsMethods(true);
}
ContextMap::~ContextMap()
......@@ -44,75 +38,22 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
if (isUsed(_i - _skipFirstArgs)) {
continue;
}
else if (strcmp(_argv[_i], "-o") == 0) {
if (!handle_o()) return false;