From c87afc9fd47d977e60aa99f37a5ec7a22ecfdb2d Mon Sep 17 00:00:00 2001
From: arq5x <arq5x@virginia.edu>
Date: Thu, 9 Jan 2014 14:10:03 -0500
Subject: [PATCH] first pass at using Context and PFM for the map tool.

---
 Makefile                                      |   2 +-
 src/mapBed/Makefile                           |  38 ------
 src/mapFile/Makefile                          |  55 ++++++++
 .../mapBed.cpp => mapFile/mapFile.cpp}        | 123 +++++++++++++++++-
 src/{mapBed/mapBed.h => mapFile/mapFile.h}    |  53 +++++++-
 src/{mapBed => mapFile}/mapMain.cpp           |  31 ++++-
 src/utils/Contexts/ContextBase.cpp            |   1 +
 src/utils/Contexts/ContextBase.h              |   3 +-
 src/utils/Contexts/Makefile                   |   9 +-
 src/utils/FileRecordTools/RecordOutputMgr.cpp |  19 ++-
 src/utils/FileRecordTools/RecordOutputMgr.h   |   2 +
 11 files changed, 278 insertions(+), 58 deletions(-)
 delete mode 100644 src/mapBed/Makefile
 create mode 100644 src/mapFile/Makefile
 rename src/{mapBed/mapBed.cpp => mapFile/mapFile.cpp} (53%)
 rename src/{mapBed/mapBed.h => mapFile/mapFile.h} (68%)
 rename src/{mapBed => mapFile}/mapMain.cpp (93%)

diff --git a/Makefile b/Makefile
index 3a4cb074..f3cfb4fc 100644
--- a/Makefile
+++ b/Makefile
@@ -46,7 +46,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
 		  $(SRC_DIR)/jaccard \
 		  $(SRC_DIR)/linksBed \
 		  $(SRC_DIR)/maskFastaFromBed \
-		  $(SRC_DIR)/mapBed \
+		  $(SRC_DIR)/mapFile \
 		  $(SRC_DIR)/mergeBed \
 		  $(SRC_DIR)/multiBamCov \
 		  $(SRC_DIR)/multiIntersectBed \
diff --git a/src/mapBed/Makefile b/src/mapBed/Makefile
deleted file mode 100644
index 96828e61..00000000
--- a/src/mapBed/Makefile
+++ /dev/null
@@ -1,38 +0,0 @@
-UTILITIES_DIR = ../utils/
-OBJ_DIR = ../../obj/
-BIN_DIR = ../../bin/
-
-# -------------------
-# define our includes
-# -------------------
-INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-           -I$(UTILITIES_DIR)/gzstream/ \
-           -I$(UTILITIES_DIR)/genomeFile/ \
-           -I$(UTILITIES_DIR)/lineFileUtilities/ \
-           -I$(UTILITIES_DIR)/fileType/ \
-           -I$(UTILITIES_DIR)/BamTools/include \
-           -I$(UTILITIES_DIR)/BamTools-Ancillary \
-           -I$(UTILITIES_DIR)/chromsweep \
-           -I$(UTILITIES_DIR)/VectorOps \
-           -I$(UTILITIES_DIR)/version/
-
-# ----------------------------------
-# define our source and object files
-# ----------------------------------
-SOURCES= mapMain.cpp mapBed.cpp mapBed.h
-OBJECTS= mapMain.o mapBed.o
-BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
-
-all: $(BUILT_OBJECTS)
-
-.PHONY: all
-
-$(BUILT_OBJECTS): $(SOURCES)
-	@echo "  * compiling" $(*F).cpp
-	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
-
-clean:
-	@echo "Cleaning up."
-	@rm -f $(OBJ_DIR)/mapMain.o $(OBJ_DIR)/mapBed.o
-
-.PHONY: clean
diff --git a/src/mapFile/Makefile b/src/mapFile/Makefile
new file mode 100644
index 00000000..3eea8d06
--- /dev/null
+++ b/src/mapFile/Makefile
@@ -0,0 +1,55 @@
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+# INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
+#            -I$(UTILITIES_DIR)/gzstream/ \
+#            -I$(UTILITIES_DIR)/genomeFile/ \
+#            -I$(UTILITIES_DIR)/lineFileUtilities/ \
+#            -I$(UTILITIES_DIR)/fileType/ \
+#            -I$(UTILITIES_DIR)/BamTools/include \
+#            -I$(UTILITIES_DIR)/BamTools-Ancillary \
+#            -I$(UTILITIES_DIR)/chromsweep \
+#            -I$(UTILITIES_DIR)/VectorOps \
+#            -I$(UTILITIES_DIR)/version/
+
+INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
+			-I$(UTILITIES_DIR)/general/ \
+			-I$(UTILITIES_DIR)/fileType/ \
+			-I$(UTILITIES_DIR)/gzstream/ \
+           -I$(UTILITIES_DIR)/GenomeFile/ \
+           -I$(UTILITIES_DIR)/BamTools/include \
+           -I$(UTILITIES_DIR)/BamTools/src \
+           -I$(UTILITIES_DIR)/BlockedIntervals \
+           -I$(UTILITIES_DIR)/BamTools-Ancillary \
+           -I$(UTILITIES_DIR)/FileRecordTools/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+           -I$(UTILITIES_DIR)/NewChromsweep \
+           -I$(UTILITIES_DIR)/VectorOps \
+           -I$(UTILITIES_DIR)/BinTree \
+           -I$(UTILITIES_DIR)/version/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= mapMain.cpp mapFile.cpp mapFile.h
+OBJECTS= mapMain.o mapFile.o
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+all: $(BUILT_OBJECTS)
+
+.PHONY: all
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/mapMain.o $(OBJ_DIR)/mapFile.o
+
+.PHONY: clean
diff --git a/src/mapBed/mapBed.cpp b/src/mapFile/mapFile.cpp
similarity index 53%
rename from src/mapBed/mapBed.cpp
rename to src/mapFile/mapFile.cpp
index c7faf4b2..360b6d69 100644
--- a/src/mapBed/mapBed.cpp
+++ b/src/mapFile/mapFile.cpp
@@ -9,9 +9,127 @@
 
   Licenced under the GNU General Public License 2.0 license.
 ******************************************************************************/
-#include "lineFileUtilities.h"
-#include "mapBed.h"
+#include "mapFile.h"
+#include "ContextMap.h"
+#include "FileRecordMgr.h"
+#include "NewChromsweep.h"
+#include "BinTree.h"
+#include "RecordOutputMgr.h"
 
+const int PRECISION = 21;
+
+FileMap::FileMap(ContextMap *context)
+: _context(context),
+  _blockMgr(NULL),
+  _recordOutputMgr(NULL)
+{
+  // FIX ME - block manager only works for intersect
+  //_blockMgr = new BlockMgr();
+  //_blockMgr->setContext(context);
+  _recordOutputMgr = new RecordOutputMgr();
+}
+
+FileMap::~FileMap(void) {
+  if (_blockMgr != NULL) {
+    delete _blockMgr;
+    _blockMgr = NULL;
+  }
+  delete _recordOutputMgr;
+}
+
+bool FileMap::mapFiles()
+{
+    NewChromSweep sweep(_context);
+    if (!sweep.init()) {
+      return false;
+    }
+    if (!_recordOutputMgr->init(_context)) {
+      return false;
+    }
+    RecordKeyList hitSet;
+    while (sweep.next(hitSet)) {
+      //if (_context->getObeySplits()) {
+      //  RecordKeyList keySet(hitSet.getKey());
+      //  RecordKeyList resultSet(hitSet.getKey());
+      //  _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
+      //} else {
+      //}
+      //_recordOutputMgr->printKeyAndTerminate(hitSet);
+      _recordOutputMgr->printRecord(hitSet.getKey(), MapHits(hitSet));
+    }
+    return true;
+}
+
+void FileMap::ExtractColumnFromHits(RecordKeyList &hits) {
+  for (RecordKeyList::const_iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
+      try {
+        ostringstream startString;
+        startString << iter->value()->getStartPos();
+        //_column_vec.push_back(iter->value().fields.at(_column));
+        _column_vec.push_back(startString.str());
+      }
+      catch(std::out_of_range& e) {
+        exit(1);
+      }
+      /*
+      catch(std::out_of_range& e) {
+        cerr << endl << "*****" << endl 
+             << "*****ERROR: requested column ("
+             << _column + 1
+             << ") exceeds the number of columns in file at line "
+             << _bedA->_lineNum << ". Exiting." 
+             << endl << "*****" << endl;
+        exit(1);
+      }
+      */
+  }
+}
+
+string FileMap::MapHits(RecordKeyList &hits) {
+    
+    ostringstream output;
+    if (hits.size() == 0)
+        // FIX ME
+        return "-1";
+        //return _nullValue;
+
+    ExtractColumnFromHits(hits);
+
+    string operation = _context->getColumnOperation();
+
+    VectorOps vo(_column_vec);
+    if (operation == "sum") 
+        output << setprecision (PRECISION) << vo.GetSum();
+    else if (operation == "mean")
+        output << setprecision (PRECISION) << vo.GetMean();
+    else if (operation == "median")
+        output << setprecision (PRECISION) << vo.GetMedian();
+    else if (operation == "min")
+        output << setprecision (PRECISION) << vo.GetMin();
+    else if (operation == "max")
+        output << setprecision (PRECISION) << vo.GetMax();
+    else if (operation == "mode")
+        output << vo.GetMode();
+    else if (operation == "antimode")
+        output << vo.GetAntiMode();
+    else if (operation == "count") 
+        output << setprecision (PRECISION) << vo.GetCount();
+    else if (operation == "count_distinct")
+        output << setprecision (PRECISION) << vo.GetCountDistinct();
+    else if (operation == "collapse")
+        output << vo.GetCollapse();
+    else if (operation == "distinct")
+        output << vo.GetDistinct();
+    else {
+        cerr << "ERROR: " << operation << " is an unrecoginzed operation\n";
+        exit(1);
+    }
+    _column_vec.clear();
+    return output.str();
+}
+
+
+/*
 const int PRECISION = 21;
 double GetUserColumn(const string s);
 
@@ -120,3 +238,4 @@ void BedMap::ExtractColumnFromHits(const vector<BED> &hits) {
         }
     }
 }
+*/
diff --git a/src/mapBed/mapBed.h b/src/mapFile/mapFile.h
similarity index 68%
rename from src/mapBed/mapBed.h
rename to src/mapFile/mapFile.h
index f8cc5de1..57e97556 100644
--- a/src/mapBed/mapBed.h
+++ b/src/mapFile/mapFile.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-  mapBed.h
+  mapFile.h
 
   (c) 2009 - Aaron Quinlan
   Hall Laboratory
@@ -9,9 +9,52 @@
 
   Licenced under the GNU General Public License 2.0 license.
 ******************************************************************************/
-#ifndef MAPBED_H
-#define MAPBED_H
+#ifndef MAPFILE_H
+#define MAPFILE_H
 
+using namespace std;
+
+#include <sstream>
+#include <iomanip>
+#include "VectorOps.h"
+#include "RecordKeyList.h"
+
+using namespace std;
+
+class ContextMap;
+class BlockMgr;
+class RecordOutputMgr;
+
+class FileMap {
+
+public:
+    FileMap(ContextMap *context);
+    ~FileMap(void);
+
+    bool mapFiles();
+
+private:
+    ContextMap *_context;
+    Record *_queryRec;
+    Record *_databaseRec;
+    BlockMgr *_blockMgr;
+    RecordOutputMgr *_recordOutputMgr;
+
+    vector<string> _column_vec; // vector to hold current column's worth of data
+
+    //------------------------------------------------
+    // private methods
+    //------------------------------------------------
+    void Map();
+    string MapHits(RecordKeyList &hits);
+    void ExtractColumnFromHits(RecordKeyList &hits);
+
+};
+
+#endif /* MAPFILE_H */
+
+
+/*
 #include "bedFile.h"
 #include "chromsweep.h"
 #include "VectorOps.h"
@@ -75,5 +118,5 @@ private:
     string MapHits(const BED &a, const vector<BED> &hits);
     void ExtractColumnFromHits(const vector<BED> &hits);
 };
-
-#endif /* MAPBED_H */
+*/
+//#endif /* MAPFILE_H */
diff --git a/src/mapBed/mapMain.cpp b/src/mapFile/mapMain.cpp
similarity index 93%
rename from src/mapBed/mapMain.cpp
rename to src/mapFile/mapMain.cpp
index 8f1559d5..bc437edb 100644
--- a/src/mapBed/mapMain.cpp
+++ b/src/mapFile/mapMain.cpp
@@ -9,21 +9,37 @@
 
   Licenced under the GNU General Public License 2.0 license.
 ******************************************************************************/
-#include "mapBed.h"
-#include "version.h"
-
 using namespace std;
 
+#include "mapFile.h"
+#include "ContextMap.h"
+
 // define our program name
 #define PROGRAM_NAME "bedtools map"
 
+void map_help(void);
 
-// define our parameter checking macro
-#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+int map_main(int argc, char* argv[]) {
+
+    ContextMap *context = new ContextMap();
+    if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
+        if (!context->getErrorMsg().empty()) {
+            cerr << context->getErrorMsg() << endl;
+        }
+        map_help();
+        delete context;
+        return 0;
+    }
+    FileMap *fileMap = new FileMap(context);
+
+    bool retVal = fileMap->mapFiles();
+    delete fileMap;
+    delete context;
+    return retVal ? 0 : 1;
+}
 
-// function declarations
-void map_help(void);
 
+/*
 int map_main(int argc, char* argv[]) {
 
     // our configuration variables
@@ -158,6 +174,7 @@ int map_main(int argc, char* argv[]) {
         return 0;
     }
 }
+*/
 
 void map_help(void) {
 
diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp
index 42754d3d..13b0864b 100644
--- a/src/utils/Contexts/ContextBase.cpp
+++ b/src/utils/Contexts/ContextBase.cpp
@@ -55,6 +55,7 @@ ContextBase::ContextBase()
 {
 	_programNames["intersect"] = INTERSECT;
 	_programNames["sample"] = SAMPLE;
+	_programNames["map"] = MAP;
 
 	_validScoreOps.insert("sum");
 	_validScoreOps.insert("max");
diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h
index 118c14f2..d2ce6194 100644
--- a/src/utils/Contexts/ContextBase.h
+++ b/src/utils/Contexts/ContextBase.h
@@ -210,7 +210,8 @@ protected:
     int _bamHeaderAndRefIdx;
     int _maxNumDatabaseFields;
     bool _useFullBamTags;
-
+    string _columnOperation;
+    int _column;
 	bool _reportCount;
 	int _maxDistance;
 	bool _reportNames;
diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile
index e1029608..8746fcdd 100644
--- a/src/utils/Contexts/Makefile
+++ b/src/utils/Contexts/Makefile
@@ -15,8 +15,8 @@ INCLUDES =  -I$(UTILITIES_DIR)/general/ \
 # ----------------------------------
 # define our source and object files
 # ----------------------------------
-SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextSample.cpp ContextSample.h
-OBJECTS= ContextBase.o ContextIntersect.o ContextSample.o
+SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp ContextMap.h ContextSample.cpp ContextSample.h
+OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o
 _EXT_OBJECTS=ParseTools.o QuickString.o
 EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
 BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
@@ -31,6 +31,9 @@ $(BUILT_OBJECTS): $(SOURCES)
 
 clean:
 	@echo "Cleaning up."
-	@rm -f $(OBJ_DIR)/ContextBase.o $(OBJ_DIR)/ContextIntersect.o $(OBJ_DIR)/ContextSample.o
+	@rm -f $(OBJ_DIR)/ContextBase.o \
+	       $(OBJ_DIR)/ContextIntersect.o \
+	       $(OBJ_DIR)/ContextMap.o \
+	       $(OBJ_DIR)/ContextSample.o
 
 .PHONY: clean
\ No newline at end of file
diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp
index ac8b37ed..880face5 100644
--- a/src/utils/FileRecordTools/RecordOutputMgr.cpp
+++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp
@@ -117,6 +117,18 @@ void RecordOutputMgr::printRecord(const Record *record)
 	printRecord(keyList);
 }
 
+void RecordOutputMgr::printRecord(const Record *record, const string value)
+{	
+	RecordKeyList keyList(record);
+	printRecord(keyList);
+	_outBuf.append(value.c_str());
+	newline();
+
+	if (needsFlush()) {
+		flush();
+	}
+}
+
 void RecordOutputMgr::printRecord(RecordKeyList &keyList) {
 	if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
 		RecordKeyList blockList(keyList.getKey());
@@ -129,7 +141,6 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList) {
 		return;
 	}
     printRecord(keyList, NULL);
-
 }
 
 void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockList)
@@ -192,6 +203,12 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi
 		}
 		_currBlockList = NULL;
 		return;
+	} else if (_context->getProgram() == ContextBase::MAP) {
+		if (!printKeyAndTerminate(keyList)) {
+			tab();
+		}
+		_currBlockList = NULL;
+		return;
 	}
 }
 
diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h
index 8725fc0f..7758df29 100644
--- a/src/utils/FileRecordTools/RecordOutputMgr.h
+++ b/src/utils/FileRecordTools/RecordOutputMgr.h
@@ -27,6 +27,8 @@ public:
 //	void printHeader(const string &);
 	void printRecord(const Record *record);
 	void printRecord(RecordKeyList &keyList);
+	// Added by ARQ
+	void printRecord(const Record *record, const string value);
 
 private:
 	typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType;
-- 
GitLab