From 05a364e4e39df1431ac9da35e0739bc88e02c00f Mon Sep 17 00:00:00 2001
From: nkindlon <nek3d@virginia.edu>
Date: Tue, 10 Sep 2013 15:31:57 -0400
Subject: [PATCH] added more utils

---
 src/utils/GenomeFile/GenomeFile.cpp       | 121 ++++++++++++
 src/utils/GenomeFile/GenomeFile.h         |  72 ++++++++
 src/utils/GenomeFile/Makefile             |  34 ++++
 src/utils/GenomeFile/NewGenomeFile.cpp    | 112 +++++++++++
 src/utils/GenomeFile/NewGenomeFile.h      |  64 +++++++
 src/utils/NewChromsweep/Makefile          |  36 ++++
 src/utils/NewChromsweep/NewChromsweep.cpp | 216 ++++++++++++++++++++++
 src/utils/NewChromsweep/NewChromsweep.h   |  95 ++++++++++
 8 files changed, 750 insertions(+)
 create mode 100644 src/utils/GenomeFile/GenomeFile.cpp
 create mode 100644 src/utils/GenomeFile/GenomeFile.h
 create mode 100644 src/utils/GenomeFile/Makefile
 create mode 100644 src/utils/GenomeFile/NewGenomeFile.cpp
 create mode 100644 src/utils/GenomeFile/NewGenomeFile.h
 create mode 100644 src/utils/NewChromsweep/Makefile
 create mode 100644 src/utils/NewChromsweep/NewChromsweep.cpp
 create mode 100644 src/utils/NewChromsweep/NewChromsweep.h

diff --git a/src/utils/GenomeFile/GenomeFile.cpp b/src/utils/GenomeFile/GenomeFile.cpp
new file mode 100644
index 00000000..0f647aae
--- /dev/null
+++ b/src/utils/GenomeFile/GenomeFile.cpp
@@ -0,0 +1,121 @@
+/*****************************************************************************
+  GenomeFile.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licensed under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "GenomeFile.h"
+
+
+GenomeFile::GenomeFile(const string &genomeFile) {
+    _genomeFile = genomeFile;
+    loadGenomeFileIntoMap();
+}
+
+GenomeFile::GenomeFile(const RefVector &genome) {
+    for (size_t i = 0; i < genome.size(); ++i) {
+        string chrom = genome[i].RefName;
+        int length = genome[i].RefLength;
+        
+        _chromSizes[chrom] = length;
+        _chromList.push_back(chrom);
+    }
+}
+
+// Destructor
+GenomeFile::~GenomeFile(void) {
+}
+
+
+void GenomeFile::loadGenomeFileIntoMap() {
+
+    string genomeLine;
+    int lineNum = 0;
+    vector<string> genomeFields;            // vector for a GENOME entry
+
+    // open the GENOME file for reading
+    ifstream genome(_genomeFile.c_str(), ios::in);
+    if ( !genome ) {
+        cerr << "Error: The requested genome file (" << _genomeFile << ") could not be opened. Exiting!" << endl;
+        exit (1);
+    }
+
+    while (getline(genome, genomeLine)) {
+
+        Tokenize(genomeLine,genomeFields);  // load the fields into the vector
+        lineNum++;
+
+        // ignore a blank line
+        if (genomeFields.size() > 0) {
+            if (genomeFields[0].find("#") != 0) {
+                // we need at least 2 columns
+                if (genomeFields.size() >= 2) {
+                    char *p2End;
+                    long c2;
+                    // make sure the second column is numeric.
+                    c2 = strtol(genomeFields[1].c_str(), &p2End, 10);
+
+                    // strtol  will set p2End to the start of the string if non-integral, base 10
+                    if (p2End != genomeFields[1].c_str()) {
+                        string chrom       = genomeFields[0];
+                        int size           = atoi(genomeFields[1].c_str());
+                        _chromSizes[chrom] = size;
+                        _chromList.push_back(chrom);
+                        _startOffsets.push_back(_genomeLength);
+                        _genomeLength += size;
+                    }
+                }
+                else {
+                    cerr << "Less than the req'd two fields were encountered in the genome file (" << _genomeFile << ")";
+                    cerr << " at line " << lineNum << ".  Exiting." << endl;
+                    exit (1);
+                }
+            }
+        }
+        genomeFields.clear();
+    }
+}
+
+pair<string, uint32_t> GenomeFile::projectOnGenome(uint32_t genome_pos) {
+    // search for the chrom that the position belongs on.
+    // add 1 to genome position b/c of zero-based, half open.
+    vector<uint32_t>::const_iterator low =
+        lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
+    
+    // use the iterator to identify the appropriate index 
+    // into the chrom name and start vectors
+    int i = int(low-_startOffsets.begin());
+    string chrom = _chromList[i - 1];
+    uint32_t start = genome_pos - _startOffsets[i - 1];
+    return make_pair(chrom, start);
+}
+    
+uint32_t GenomeFile::getChromSize(const string &chrom) {
+    chromToSizes::const_iterator chromIt = _chromSizes.find(chrom);
+    if (chromIt != _chromSizes.end())
+        return _chromSizes[chrom];
+    else
+        return -1;  // chrom not found.
+}
+
+uint32_t GenomeFile::getGenomeSize(void) {
+    return _genomeLength;
+}
+
+vector<string> GenomeFile::getChromList() {
+    return _chromList;
+}
+
+int GenomeFile::getNumberOfChroms() {
+    return _chromList.size();
+}
+
+string GenomeFile::getGenomeFileName() {
+    return _genomeFile;
+}
diff --git a/src/utils/GenomeFile/GenomeFile.h b/src/utils/GenomeFile/GenomeFile.h
new file mode 100644
index 00000000..10f6167a
--- /dev/null
+++ b/src/utils/GenomeFile/GenomeFile.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+  GenomeFile.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licensed under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef GENOMEFILE_H
+#define GENOMEFILE_H
+
+#include <map>
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <fstream>
+#include <cstring>
+#include <cstdio>
+#include <algorithm> // for bsearch lower_bound()
+#include "api/BamReader.h"
+#include "api/BamAux.h"
+using namespace BamTools;
+
+using namespace std;
+
+
+// typedef for mapping b/w chrom name and it's size in b.p.
+typedef map<string, int, std::less<string> > chromToSizes;
+
+
+class GenomeFile {
+
+public:
+
+    // Constructor using a file
+    GenomeFile(const string &genomeFile);
+    
+    // Constructor using a vector of BamTools RefVector
+    GenomeFile(const RefVector &genome);
+
+    // Destructor
+    ~GenomeFile(void);
+
+    // load a GENOME file into a map keyed by chrom. value is size of chrom.
+    void loadGenomeFileIntoMap();
+    
+    pair<string, uint32_t> projectOnGenome(uint32_t genome_pos);
+    
+    uint32_t getChromSize(const string &chrom);     // return the size of a chromosome
+    uint32_t getGenomeSize(void);              // return the total size of the geonome
+    vector<string> getChromList();             // return a list of chrom names
+    int getNumberOfChroms();                   // return the number of chroms
+    string getGenomeFileName();                // return the name of the genome file
+
+
+
+
+private:
+    string  _genomeFile;
+    chromToSizes _chromSizes;
+    vector<string> _chromList;
+
+    // projecting chroms into a single coordinate system
+    uint32_t _genomeLength;
+    vector<uint32_t> _startOffsets;
+    
+};
+
+#endif /* GENOMEFILE_H */
diff --git a/src/utils/GenomeFile/Makefile b/src/utils/GenomeFile/Makefile
new file mode 100644
index 00000000..afaeccd4
--- /dev/null
+++ b/src/utils/GenomeFile/Makefile
@@ -0,0 +1,34 @@
+OBJ_DIR = ../../../obj/
+BIN_DIR = ../../../bin/
+UTILITIES_DIR = ../
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/general/ \
+		 	-I$(UTILITIES_DIR)/lineFileUtilities/ \
+           -I$(UTILITIES_DIR)/BamTools/include/	
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= NewGenomeFile.cpp NewGenomeFile.h GenomeFile.cpp GenomeFile.h
+OBJECTS= NewGenomeFile.o GenomeFile.o
+#_EXT_OBJECTS=lineFileUtilities.o fileType.o
+#EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+$(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS)
+	@echo "  * compiling GenomeFile.cpp"
+	@$(CXX) -c -o $(OBJ_DIR)/GenomeFile.o GenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+	@echo "  * compiling NewGenomeFile.cpp"
+	@$(CXX) -c -o $(OBJ_DIR)/NewGenomeFile.o NewGenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+#$(EXT_OBJECTS):
+#	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
+#	@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/NewGenomeFile.o $(OBJ_DIR)/GenomeFile.o
+
+.PHONY: clean
diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp
new file mode 100644
index 00000000..c6ceb8d0
--- /dev/null
+++ b/src/utils/GenomeFile/NewGenomeFile.cpp
@@ -0,0 +1,112 @@
+/*****************************************************************************
+  NewGenomeFile.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licensed under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "NewGenomeFile.h"
+
+
+NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename)
+: _maxId(-1)
+{
+    _genomeFileName = genomeFilename;
+    loadGenomeFileIntoMap();
+}
+
+NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector)
+: _maxId(-1)
+{
+    for (size_t i = 0; i < refVector.size(); ++i) {
+        QuickString chrom = refVector[i].RefName;
+        CHRPOS length = refVector[i].RefLength;
+        _maxId++;
+        _chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId);
+    }
+}
+
+// Destructor
+NewGenomeFile::~NewGenomeFile(void) {
+}
+
+
+void NewGenomeFile::loadGenomeFileIntoMap() {
+
+	FILE *fp = fopen(_genomeFileName.c_str(), "r");
+	if (fp == NULL) {
+		fprintf(stderr, "Error: Can't open genome file %s. Exiting...\n", _genomeFileName.c_str());
+		exit(1);
+	}
+	char sLine[2048];
+	char chrName[2048];
+	CHRPOS chrSize = 0;
+	while (!feof(fp)) {
+		memset(sLine, 0, 2048);
+		memset(chrName, 0, 2048);
+		chrSize = 0;
+		fgets(sLine, 2048, fp);
+		sscanf(sLine, "%s %d", chrName, &chrSize);
+		if (strlen(sLine) == 0) {
+			continue;
+		}
+		_maxId++;
+		_chromSizeIds[chrName] = pair<CHRPOS, CHRPOS>(chrSize, _maxId);
+		_startOffsets.push_back(_genomeLength);
+		_genomeLength += chrSize;
+		_chromList.push_back(chrName);
+
+	}
+	_startOffsets.push_back(_genomeLength); //insert the final length as the last element
+	//to help with the lower_bound call in the projectOnGenome method.
+	fclose(fp);
+}
+
+bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start) {
+    // search for the chrom that the position belongs on.
+    // add 1 to genome position b/c of zero-based, half open.
+    vector<CHRPOS>::const_iterator low =
+        lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1);
+    
+    // use the iterator to identify the appropriate index 
+    // into the chrom name and start vectors
+    CHRPOS i = CHRPOS(low-_startOffsets.begin());
+    if (i < 0 || i >= _chromList.size()) {
+    	return false; //position not on genome
+    }
+    chrom = _chromList[i - 1];
+    start = genome_pos - _startOffsets[i - 1];
+    return true;
+}
+    
+CHRPOS NewGenomeFile::getChromSize(const QuickString &chrom) {
+	if (chrom == _currChromName) {
+		return _currChromSize;
+	}
+	lookupType::const_iterator iter= _chromSizeIds.find(chrom);
+    if (iter != _chromSizeIds.end()) {
+    	_currChromName = iter->first;
+    	_currChromSize = iter->second.first;
+    	_currChromId = iter->second.second;
+    	return _currChromSize;
+    }
+    return INT_MAX;
+}
+
+CHRPOS NewGenomeFile::getChromId(const QuickString &chrom) {
+	if (chrom == _currChromName) {
+		return _currChromId;
+	}
+	lookupType::const_iterator iter= _chromSizeIds.find(chrom);
+    if (iter != _chromSizeIds.end()) {
+    	_currChromName = iter->first;
+    	_currChromSize = iter->second.first;
+    	_currChromId = iter->second.second;
+    	return _currChromId;
+    }
+    return INT_MAX;
+}
diff --git a/src/utils/GenomeFile/NewGenomeFile.h b/src/utils/GenomeFile/NewGenomeFile.h
new file mode 100644
index 00000000..afd23b50
--- /dev/null
+++ b/src/utils/GenomeFile/NewGenomeFile.h
@@ -0,0 +1,64 @@
+
+/*****************************************************************************
+  NewGenomeFile.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licensed under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef NEW_GENOMEFILE_H
+#define NEW_GENOMEFILE_H
+
+
+#include "BedtoolsTypes.h"
+
+#include "api/BamReader.h"
+#include "api/BamAux.h"
+
+
+class NewGenomeFile {
+
+public:
+
+     NewGenomeFile(const QuickString &genomeFileName);
+     NewGenomeFile(const BamTools::RefVector &genome);
+    ~NewGenomeFile(void);
+
+    // load a GENOME file into a map keyed by chrom. value is a pair<int, int> of id and size.
+    void loadGenomeFileIntoMap();
+    
+    bool projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start);
+    
+    CHRPOS getGenomeSize(void) const { return _genomeLength; }                // return the total size of the genome
+    CHRPOS getChromSize(const QuickString &chrom);  // return the size of a chromosome
+    CHRPOS getChromId(const QuickString &chrom); // return chromosome's sort order
+    const vector<QuickString> &getChromList() const { return _chromList; }  // return a list of chrom names
+    CHRPOS getNumberOfChroms() const { return _chromList.size(); }
+    const QuickString &getGenomeFileName() const { return _genomeFileName; }
+
+
+
+
+private:
+    QuickString  _genomeFileName;
+    typedef map<QuickString, pair<CHRPOS, CHRPOS> > lookupType;
+    lookupType _chromSizeIds;
+    vector<QuickString> _chromList;
+    CHRPOS _maxId;
+
+    // projecting chroms onto a single coordinate system
+    CHRPOS _genomeLength;
+    vector<CHRPOS> _startOffsets;
+    
+    //cache members for quick lookup
+    QuickString _currChromName;
+    CHRPOS _currChromSize;
+    CHRPOS _currChromId;
+
+};
+
+#endif /* GENOMEFILE_H */
diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile
new file mode 100644
index 00000000..65263894
--- /dev/null
+++ b/src/utils/NewChromsweep/Makefile
@@ -0,0 +1,36 @@
+OBJ_DIR = ../../../obj/
+BIN_DIR = ../../../bin/
+UTILITIES_DIR = ../../utils/
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/general/ \
+			-I$(UTILITIES_DIR)/fileType/ \
+			-I$(UTILITIES_DIR)/Contexts/ \
+			-I$(UTILITIES_DIR)/GenomeFile/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+           -I$(UTILITIES_DIR)/BamTools/include 
+         
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= NewChromsweep.cpp NewChromsweep.h 
+OBJECTS= NewChromsweep.o
+_EXT_OBJECTS=
+EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+$(EXT_OBJECTS):
+	@$(MAKE) --no-print-directory -C $(INCLUDES)
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/NewChromsweep.o $(BIN_DIR)/NewChromsweep.o
+
+.PHONY: clean
\ No newline at end of file
diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp
new file mode 100644
index 00000000..6c441c4e
--- /dev/null
+++ b/src/utils/NewChromsweep/NewChromsweep.cpp
@@ -0,0 +1,216 @@
+/*****************************************************************************
+  NewChromsweep.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+
+#include "NewChromsweep.h"
+#include "Context.h"
+#include "FileRecordMgr.h"
+
+NewChromSweep::NewChromSweep(Context *context,
+                       bool useMergedIntervals)
+:	_context(context),
+ 	_queryFRM(NULL),
+ 	_databaseFRM(NULL),
+ 	_useMergedIntervals(false),
+ 	_queryRecordsTotalLength(0),
+ 	_databaseRecordsTotalLength(0),
+ 	_wasInitialized(false),
+ 	_currQueryRec(NULL),
+ 	_currDatabaseRec(NULL)
+
+{
+}
+
+
+bool NewChromSweep::init() {
+    
+	//Create new FileRecordMgrs for the two input files.
+	//Open them, and get the first record from each.
+	//if any of that goes wrong, return false;
+	//otherwise, return true.
+    _queryFRM = new FileRecordMgr(_context->getQueryFileIdx(), _context);
+    _databaseFRM = new FileRecordMgr(_context->getDatabaseFileIdx(), _context);
+    
+    if (!_queryFRM->open()) {
+    	return false;
+    }
+    if (!_databaseFRM->open()) {
+    	return false;
+    }
+
+    _context->determineOutputType();
+
+    nextRecord(false);
+    if (_currDatabaseRec == NULL) {
+    	return false;
+    }
+    _wasInitialized = true;
+    return true;
+ }
+
+void NewChromSweep::closeOut() {
+	while (!_queryFRM->eof()) {
+		nextRecord(true);
+	}
+	while (!_databaseFRM->eof()) {
+		nextRecord(false);
+	}
+}
+
+NewChromSweep::~NewChromSweep(void) {
+	if (!_wasInitialized) {
+		return;
+	}
+	if (_currQueryRec != NULL) {
+		_queryFRM->deleteRecord(_currQueryRec);
+	}
+	if (_currDatabaseRec != NULL) {
+		_databaseFRM->deleteRecord(_currDatabaseRec);
+	}
+	_queryFRM->close();
+	_databaseFRM->close();
+
+	delete _queryFRM;
+	delete _databaseFRM;
+
+}
+
+
+void NewChromSweep::scanCache() {
+	recListIterType cacheIter = _cache.begin();
+    while (cacheIter != _cache.end())
+    {
+    	const Record *cacheRec = cacheIter->value();
+        if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) {
+            if (intersects(_currQueryRec, cacheRec)) {
+                _hits.push_back(cacheRec);
+            }
+            cacheIter = _cache.next();
+        }
+        else {
+            cacheIter = _cache.deleteCurrent();
+    		_databaseFRM->deleteRecord(cacheRec);
+        }
+    }
+}
+
+void NewChromSweep::clearCache()
+{
+	//delete all objects pointed to by cache
+	for (recListIterType iter = _cache.begin(); iter != _cache.end(); iter = _cache.next()) {
+		_databaseFRM->deleteRecord(iter->value());
+	}
+	_cache.clear();
+}
+
+bool NewChromSweep::chromChange()
+{
+    // the files are on the same chrom
+	if (_currDatabaseRec == NULL || _currQueryRec->sameChrom(_currDatabaseRec)) {
+		return false;
+	}
+	// the query is ahead of the database. fast-forward the database to catch-up.
+	if (_currQueryRec->chromAfter(_currDatabaseRec)) {
+
+		while (_currDatabaseRec != NULL &&
+				_currQueryRec->chromAfter(_currDatabaseRec)) {
+			_databaseFRM->deleteRecord(_currDatabaseRec);
+			nextRecord(false);
+		}
+		clearCache();
+        return false;
+    }
+    // the database is ahead of the query.
+    else {
+        // 1. scan the cache for remaining hits on the query's current chrom.
+        if (_currQueryRec->getChrName() == _currChromName)
+        {
+            scanCache();
+        }
+        // 2. fast-forward until we catch up and report 0 hits until we do.
+        else if (_currQueryRec->chromBefore(_currDatabaseRec))
+        {
+            clearCache();
+        }
+
+        return true;
+    }
+}
+
+
+bool NewChromSweep::next(RecordKeyList &next) {
+	if (_currQueryRec != NULL) {
+		_queryFRM->deleteRecord(_currQueryRec);
+	}
+	nextRecord(true);
+	if (_currQueryRec == NULL) { //eof hit!
+		return false;
+	}
+
+	if (_currDatabaseRec == NULL && _cache.empty()) {
+		return false;
+	}
+	_hits.clear();
+	_currChromName = _currQueryRec->getChrName();
+	// have we changed chromosomes?
+	if (!chromChange()) {
+		// scan the database cache for hits
+		scanCache();
+		//skip if we hit the end of the DB
+		// advance the db until we are ahead of the query. update hits and cache as necessary
+		while (_currDatabaseRec != NULL &&
+				_currQueryRec->sameChrom(_currDatabaseRec) &&
+				!(_currDatabaseRec->after(_currQueryRec))) {
+			if (intersects(_currQueryRec, _currDatabaseRec)) {
+				_hits.push_back(_currDatabaseRec);
+			}
+			if (_currQueryRec->after(_currDatabaseRec)) {
+				_databaseFRM->deleteRecord(_currDatabaseRec);
+				_currDatabaseRec = NULL;
+			} else {
+				_cache.push_back(_currDatabaseRec);
+				_currDatabaseRec = NULL;
+			}
+			nextRecord(false);
+		}
+	}
+	next.setKey(_currQueryRec);
+	next.setListNoCopy(_hits);
+	return true;
+}
+
+void NewChromSweep::nextRecord(bool query) {
+	if (query) {
+		if (!_context->getUseMergedIntervals()) {
+			_currQueryRec = _queryFRM->allocateAndGetNextRecord();
+		} else {
+			_currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND);
+		}
+		if (_currQueryRec != NULL) {
+			_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
+		}
+	} else { //database
+		if (!_context->getUseMergedIntervals()) {
+			_currDatabaseRec = _databaseFRM->allocateAndGetNextRecord();
+		} else {
+			_currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND);
+		}
+		if (_currDatabaseRec != NULL) {
+			_databaseRecordsTotalLength += (unsigned long)(_currDatabaseRec->getEndPos() - _currDatabaseRec->getStartPos());
+		}
+	}
+}
+
+bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const
+{
+	return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
+			_context->getOverlapFraction(), _context->getReciprocal());
+}
diff --git a/src/utils/NewChromsweep/NewChromsweep.h b/src/utils/NewChromsweep/NewChromsweep.h
new file mode 100644
index 00000000..abd20e92
--- /dev/null
+++ b/src/utils/NewChromsweep/NewChromsweep.h
@@ -0,0 +1,95 @@
+/*****************************************************************************
+  NewChromsweepBed.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef NEW_CHROMSWEEP_H
+#define NEW_CHROMSWEEP_H
+
+using namespace std;
+
+#include <string>
+#include "BTlist.h"
+#include "RecordKeyList.h"
+#include <queue>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+#include "QuickString.h"
+
+class Record;
+class FileRecordMgr;
+class Context;
+
+class NewChromSweep {
+public:
+
+    NewChromSweep(Context *context, bool useMergedIntervals = false);
+    
+    
+    ~NewChromSweep(void);
+    bool init();
+    
+    typedef BTlist<const Record *> recListType;
+    typedef const BTlistNode<const Record *> * recListIterType;
+    // loads next (a pair) with the current query and it's overlaps
+    bool next(RecordKeyList &next);
+    
+    //MUST call this method after sweep if you want the
+    //getTotalRecordLength methods to return the whole length of the
+    //record files, rather than just what was used by sweep.
+    void closeOut();
+
+    unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; }
+    unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; }
+
+    // Usage:
+    //     NewChromSweep sweep = NewChromSweep(queryFileName, databaseFileName);
+    //     RecordKeyList hits;
+    //     while (sweep.next(hits))
+    //     {
+    //        processHits(hits);
+    //     }
+    // 	   getQueryTotalRecordLength()
+    //     getDatabaseTotalRecordLength()
+
+private:
+    Context *_context;
+    FileRecordMgr *_queryFRM;
+    FileRecordMgr *_databaseFRM;
+
+    bool _useMergedIntervals;
+
+    unsigned long _queryRecordsTotalLength;
+    unsigned long _databaseRecordsTotalLength;
+
+    bool _wasInitialized;
+
+    // a cache of still active features from the database file
+    recListType _cache;
+    // the set of hits in the database for the current query
+    recListType _hits;
+
+    // the current query and db features.
+    Record * _currQueryRec;
+    Record *_currDatabaseRec;
+    // a cache of the current chrom from the query. used to handle chrom changes.
+    QuickString _currChromName;
+
+    void nextRecord(bool query); //true fetches next query record, false fetches next db record.
+    void nextDatabase();
+    
+    void scanCache();
+    void clearCache();
+    bool chromChange();
+    bool intersects(const Record *rec1, const Record *rec2) const;
+
+};
+
+#endif /* NewChromSweep_H */
-- 
GitLab