From c87afc9fd47d977e60aa99f37a5ec7a22ecfdb2d Mon Sep 17 00:00:00 2001 From: arq5x <arq5x@virginia.edu> Date: Thu, 9 Jan 2014 14:10:03 -0500 Subject: [PATCH] first pass at using Context and PFM for the map tool. --- Makefile | 2 +- src/mapBed/Makefile | 38 ------ src/mapFile/Makefile | 55 ++++++++ .../mapBed.cpp => mapFile/mapFile.cpp} | 123 +++++++++++++++++- src/{mapBed/mapBed.h => mapFile/mapFile.h} | 53 +++++++- src/{mapBed => mapFile}/mapMain.cpp | 31 ++++- src/utils/Contexts/ContextBase.cpp | 1 + src/utils/Contexts/ContextBase.h | 3 +- src/utils/Contexts/Makefile | 9 +- src/utils/FileRecordTools/RecordOutputMgr.cpp | 19 ++- src/utils/FileRecordTools/RecordOutputMgr.h | 2 + 11 files changed, 278 insertions(+), 58 deletions(-) delete mode 100644 src/mapBed/Makefile create mode 100644 src/mapFile/Makefile rename src/{mapBed/mapBed.cpp => mapFile/mapFile.cpp} (53%) rename src/{mapBed/mapBed.h => mapFile/mapFile.h} (68%) rename src/{mapBed => mapFile}/mapMain.cpp (93%) diff --git a/Makefile b/Makefile index 3a4cb074..f3cfb4fc 100644 --- a/Makefile +++ b/Makefile @@ -46,7 +46,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/jaccard \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ - $(SRC_DIR)/mapBed \ + $(SRC_DIR)/mapFile \ $(SRC_DIR)/mergeBed \ $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/multiIntersectBed \ diff --git a/src/mapBed/Makefile b/src/mapBed/Makefile deleted file mode 100644 index 96828e61..00000000 --- a/src/mapBed/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ - -I$(UTILITIES_DIR)/gzstream/ \ - -I$(UTILITIES_DIR)/genomeFile/ \ - -I$(UTILITIES_DIR)/lineFileUtilities/ \ - -I$(UTILITIES_DIR)/fileType/ \ - -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary \ - -I$(UTILITIES_DIR)/chromsweep \ - -I$(UTILITIES_DIR)/VectorOps \ - -I$(UTILITIES_DIR)/version/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= mapMain.cpp mapBed.cpp mapBed.h -OBJECTS= mapMain.o mapBed.o -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) - -all: $(BUILT_OBJECTS) - -.PHONY: all - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) - -clean: - @echo "Cleaning up." - @rm -f $(OBJ_DIR)/mapMain.o $(OBJ_DIR)/mapBed.o - -.PHONY: clean diff --git a/src/mapFile/Makefile b/src/mapFile/Makefile new file mode 100644 index 00000000..3eea8d06 --- /dev/null +++ b/src/mapFile/Makefile @@ -0,0 +1,55 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +# INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ +# -I$(UTILITIES_DIR)/gzstream/ \ +# -I$(UTILITIES_DIR)/genomeFile/ \ +# -I$(UTILITIES_DIR)/lineFileUtilities/ \ +# -I$(UTILITIES_DIR)/fileType/ \ +# -I$(UTILITIES_DIR)/BamTools/include \ +# -I$(UTILITIES_DIR)/BamTools-Ancillary \ +# -I$(UTILITIES_DIR)/chromsweep \ +# -I$(UTILITIES_DIR)/VectorOps \ +# -I$(UTILITIES_DIR)/version/ + +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/NewChromsweep \ + -I$(UTILITIES_DIR)/VectorOps \ + -I$(UTILITIES_DIR)/BinTree \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= mapMain.cpp mapFile.cpp mapFile.h +OBJECTS= mapMain.o mapFile.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/mapMain.o $(OBJ_DIR)/mapFile.o + +.PHONY: clean diff --git a/src/mapBed/mapBed.cpp b/src/mapFile/mapFile.cpp similarity index 53% rename from src/mapBed/mapBed.cpp rename to src/mapFile/mapFile.cpp index c7faf4b2..360b6d69 100644 --- a/src/mapBed/mapBed.cpp +++ b/src/mapFile/mapFile.cpp @@ -9,9 +9,127 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#include "lineFileUtilities.h" -#include "mapBed.h" +#include "mapFile.h" +#include "ContextMap.h" +#include "FileRecordMgr.h" +#include "NewChromsweep.h" +#include "BinTree.h" +#include "RecordOutputMgr.h" +const int PRECISION = 21; + +FileMap::FileMap(ContextMap *context) +: _context(context), + _blockMgr(NULL), + _recordOutputMgr(NULL) +{ + // FIX ME - block manager only works for intersect + //_blockMgr = new BlockMgr(); + //_blockMgr->setContext(context); + _recordOutputMgr = new RecordOutputMgr(); +} + +FileMap::~FileMap(void) { + if (_blockMgr != NULL) { + delete _blockMgr; + _blockMgr = NULL; + } + delete _recordOutputMgr; +} + +bool FileMap::mapFiles() +{ + NewChromSweep sweep(_context); + if (!sweep.init()) { + return false; + } + if (!_recordOutputMgr->init(_context)) { + return false; + } + RecordKeyList hitSet; + while (sweep.next(hitSet)) { + //if (_context->getObeySplits()) { + // RecordKeyList keySet(hitSet.getKey()); + // RecordKeyList resultSet(hitSet.getKey()); + // _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); + //} else { + //} + //_recordOutputMgr->printKeyAndTerminate(hitSet); + _recordOutputMgr->printRecord(hitSet.getKey(), MapHits(hitSet)); + } + return true; +} + +void FileMap::ExtractColumnFromHits(RecordKeyList &hits) { + for (RecordKeyList::const_iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) { + try { + ostringstream startString; + startString << iter->value()->getStartPos(); + //_column_vec.push_back(iter->value().fields.at(_column)); + _column_vec.push_back(startString.str()); + } + catch(std::out_of_range& e) { + exit(1); + } + /* + catch(std::out_of_range& e) { + cerr << endl << "*****" << endl + << "*****ERROR: requested column (" + << _column + 1 + << ") exceeds the number of columns in file at line " + << _bedA->_lineNum << ". Exiting." + << endl << "*****" << endl; + exit(1); + } + */ + } +} + +string FileMap::MapHits(RecordKeyList &hits) { + + ostringstream output; + if (hits.size() == 0) + // FIX ME + return "-1"; + //return _nullValue; + + ExtractColumnFromHits(hits); + + string operation = _context->getColumnOperation(); + + VectorOps vo(_column_vec); + if (operation == "sum") + output << setprecision (PRECISION) << vo.GetSum(); + else if (operation == "mean") + output << setprecision (PRECISION) << vo.GetMean(); + else if (operation == "median") + output << setprecision (PRECISION) << vo.GetMedian(); + else if (operation == "min") + output << setprecision (PRECISION) << vo.GetMin(); + else if (operation == "max") + output << setprecision (PRECISION) << vo.GetMax(); + else if (operation == "mode") + output << vo.GetMode(); + else if (operation == "antimode") + output << vo.GetAntiMode(); + else if (operation == "count") + output << setprecision (PRECISION) << vo.GetCount(); + else if (operation == "count_distinct") + output << setprecision (PRECISION) << vo.GetCountDistinct(); + else if (operation == "collapse") + output << vo.GetCollapse(); + else if (operation == "distinct") + output << vo.GetDistinct(); + else { + cerr << "ERROR: " << operation << " is an unrecoginzed operation\n"; + exit(1); + } + _column_vec.clear(); + return output.str(); +} + + +/* const int PRECISION = 21; double GetUserColumn(const string s); @@ -120,3 +238,4 @@ void BedMap::ExtractColumnFromHits(const vector<BED> &hits) { } } } +*/ diff --git a/src/mapBed/mapBed.h b/src/mapFile/mapFile.h similarity index 68% rename from src/mapBed/mapBed.h rename to src/mapFile/mapFile.h index f8cc5de1..57e97556 100644 --- a/src/mapBed/mapBed.h +++ b/src/mapFile/mapFile.h @@ -1,5 +1,5 @@ /***************************************************************************** - mapBed.h + mapFile.h (c) 2009 - Aaron Quinlan Hall Laboratory @@ -9,9 +9,52 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#ifndef MAPBED_H -#define MAPBED_H +#ifndef MAPFILE_H +#define MAPFILE_H +using namespace std; + +#include <sstream> +#include <iomanip> +#include "VectorOps.h" +#include "RecordKeyList.h" + +using namespace std; + +class ContextMap; +class BlockMgr; +class RecordOutputMgr; + +class FileMap { + +public: + FileMap(ContextMap *context); + ~FileMap(void); + + bool mapFiles(); + +private: + ContextMap *_context; + Record *_queryRec; + Record *_databaseRec; + BlockMgr *_blockMgr; + RecordOutputMgr *_recordOutputMgr; + + vector<string> _column_vec; // vector to hold current column's worth of data + + //------------------------------------------------ + // private methods + //------------------------------------------------ + void Map(); + string MapHits(RecordKeyList &hits); + void ExtractColumnFromHits(RecordKeyList &hits); + +}; + +#endif /* MAPFILE_H */ + + +/* #include "bedFile.h" #include "chromsweep.h" #include "VectorOps.h" @@ -75,5 +118,5 @@ private: string MapHits(const BED &a, const vector<BED> &hits); void ExtractColumnFromHits(const vector<BED> &hits); }; - -#endif /* MAPBED_H */ +*/ +//#endif /* MAPFILE_H */ diff --git a/src/mapBed/mapMain.cpp b/src/mapFile/mapMain.cpp similarity index 93% rename from src/mapBed/mapMain.cpp rename to src/mapFile/mapMain.cpp index 8f1559d5..bc437edb 100644 --- a/src/mapBed/mapMain.cpp +++ b/src/mapFile/mapMain.cpp @@ -9,21 +9,37 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#include "mapBed.h" -#include "version.h" - using namespace std; +#include "mapFile.h" +#include "ContextMap.h" + // define our program name #define PROGRAM_NAME "bedtools map" +void map_help(void); -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) +int map_main(int argc, char* argv[]) { + + ContextMap *context = new ContextMap(); + if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { + if (!context->getErrorMsg().empty()) { + cerr << context->getErrorMsg() << endl; + } + map_help(); + delete context; + return 0; + } + FileMap *fileMap = new FileMap(context); + + bool retVal = fileMap->mapFiles(); + delete fileMap; + delete context; + return retVal ? 0 : 1; +} -// function declarations -void map_help(void); +/* int map_main(int argc, char* argv[]) { // our configuration variables @@ -158,6 +174,7 @@ int map_main(int argc, char* argv[]) { return 0; } } +*/ void map_help(void) { diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 42754d3d..13b0864b 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -55,6 +55,7 @@ ContextBase::ContextBase() { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; + _programNames["map"] = MAP; _validScoreOps.insert("sum"); _validScoreOps.insert("max"); diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index 118c14f2..d2ce6194 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -210,7 +210,8 @@ protected: int _bamHeaderAndRefIdx; int _maxNumDatabaseFields; bool _useFullBamTags; - + string _columnOperation; + int _column; bool _reportCount; int _maxDistance; bool _reportNames; diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index e1029608..8746fcdd 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -15,8 +15,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextSample.cpp ContextSample.h -OBJECTS= ContextBase.o ContextIntersect.o ContextSample.o +SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp ContextMap.h ContextSample.cpp ContextSample.h +OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o _EXT_OBJECTS=ParseTools.o QuickString.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -31,6 +31,9 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/ContextBase.o $(OBJ_DIR)/ContextIntersect.o $(OBJ_DIR)/ContextSample.o + @rm -f $(OBJ_DIR)/ContextBase.o \ + $(OBJ_DIR)/ContextIntersect.o \ + $(OBJ_DIR)/ContextMap.o \ + $(OBJ_DIR)/ContextSample.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp index ac8b37ed..880face5 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.cpp +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -117,6 +117,18 @@ void RecordOutputMgr::printRecord(const Record *record) printRecord(keyList); } +void RecordOutputMgr::printRecord(const Record *record, const string value) +{ + RecordKeyList keyList(record); + printRecord(keyList); + _outBuf.append(value.c_str()); + newline(); + + if (needsFlush()) { + flush(); + } +} + void RecordOutputMgr::printRecord(RecordKeyList &keyList) { if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { RecordKeyList blockList(keyList.getKey()); @@ -129,7 +141,6 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList) { return; } printRecord(keyList, NULL); - } void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockList) @@ -192,6 +203,12 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } _currBlockList = NULL; return; + } else if (_context->getProgram() == ContextBase::MAP) { + if (!printKeyAndTerminate(keyList)) { + tab(); + } + _currBlockList = NULL; + return; } } diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h index 8725fc0f..7758df29 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.h +++ b/src/utils/FileRecordTools/RecordOutputMgr.h @@ -27,6 +27,8 @@ public: // void printHeader(const string &); void printRecord(const Record *record); void printRecord(RecordKeyList &keyList); + // Added by ARQ + void printRecord(const Record *record, const string value); private: typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; -- GitLab