From 05a364e4e39df1431ac9da35e0739bc88e02c00f Mon Sep 17 00:00:00 2001 From: nkindlon <nek3d@virginia.edu> Date: Tue, 10 Sep 2013 15:31:57 -0400 Subject: [PATCH] added more utils --- src/utils/GenomeFile/GenomeFile.cpp | 121 ++++++++++++ src/utils/GenomeFile/GenomeFile.h | 72 ++++++++ src/utils/GenomeFile/Makefile | 34 ++++ src/utils/GenomeFile/NewGenomeFile.cpp | 112 +++++++++++ src/utils/GenomeFile/NewGenomeFile.h | 64 +++++++ src/utils/NewChromsweep/Makefile | 36 ++++ src/utils/NewChromsweep/NewChromsweep.cpp | 216 ++++++++++++++++++++++ src/utils/NewChromsweep/NewChromsweep.h | 95 ++++++++++ 8 files changed, 750 insertions(+) create mode 100644 src/utils/GenomeFile/GenomeFile.cpp create mode 100644 src/utils/GenomeFile/GenomeFile.h create mode 100644 src/utils/GenomeFile/Makefile create mode 100644 src/utils/GenomeFile/NewGenomeFile.cpp create mode 100644 src/utils/GenomeFile/NewGenomeFile.h create mode 100644 src/utils/NewChromsweep/Makefile create mode 100644 src/utils/NewChromsweep/NewChromsweep.cpp create mode 100644 src/utils/NewChromsweep/NewChromsweep.h diff --git a/src/utils/GenomeFile/GenomeFile.cpp b/src/utils/GenomeFile/GenomeFile.cpp new file mode 100644 index 00000000..0f647aae --- /dev/null +++ b/src/utils/GenomeFile/GenomeFile.cpp @@ -0,0 +1,121 @@ +/***************************************************************************** + GenomeFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "GenomeFile.h" + + +GenomeFile::GenomeFile(const string &genomeFile) { + _genomeFile = genomeFile; + loadGenomeFileIntoMap(); +} + +GenomeFile::GenomeFile(const RefVector &genome) { + for (size_t i = 0; i < genome.size(); ++i) { + string chrom = genome[i].RefName; + int length = genome[i].RefLength; + + _chromSizes[chrom] = length; + _chromList.push_back(chrom); + } +} + +// Destructor +GenomeFile::~GenomeFile(void) { +} + + +void GenomeFile::loadGenomeFileIntoMap() { + + string genomeLine; + int lineNum = 0; + vector<string> genomeFields; // vector for a GENOME entry + + // open the GENOME file for reading + ifstream genome(_genomeFile.c_str(), ios::in); + if ( !genome ) { + cerr << "Error: The requested genome file (" << _genomeFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + while (getline(genome, genomeLine)) { + + Tokenize(genomeLine,genomeFields); // load the fields into the vector + lineNum++; + + // ignore a blank line + if (genomeFields.size() > 0) { + if (genomeFields[0].find("#") != 0) { + // we need at least 2 columns + if (genomeFields.size() >= 2) { + char *p2End; + long c2; + // make sure the second column is numeric. + c2 = strtol(genomeFields[1].c_str(), &p2End, 10); + + // strtol will set p2End to the start of the string if non-integral, base 10 + if (p2End != genomeFields[1].c_str()) { + string chrom = genomeFields[0]; + int size = atoi(genomeFields[1].c_str()); + _chromSizes[chrom] = size; + _chromList.push_back(chrom); + _startOffsets.push_back(_genomeLength); + _genomeLength += size; + } + } + else { + cerr << "Less than the req'd two fields were encountered in the genome file (" << _genomeFile << ")"; + cerr << " at line " << lineNum << ". Exiting." << endl; + exit (1); + } + } + } + genomeFields.clear(); + } +} + +pair<string, uint32_t> GenomeFile::projectOnGenome(uint32_t genome_pos) { + // search for the chrom that the position belongs on. + // add 1 to genome position b/c of zero-based, half open. + vector<uint32_t>::const_iterator low = + lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1); + + // use the iterator to identify the appropriate index + // into the chrom name and start vectors + int i = int(low-_startOffsets.begin()); + string chrom = _chromList[i - 1]; + uint32_t start = genome_pos - _startOffsets[i - 1]; + return make_pair(chrom, start); +} + +uint32_t GenomeFile::getChromSize(const string &chrom) { + chromToSizes::const_iterator chromIt = _chromSizes.find(chrom); + if (chromIt != _chromSizes.end()) + return _chromSizes[chrom]; + else + return -1; // chrom not found. +} + +uint32_t GenomeFile::getGenomeSize(void) { + return _genomeLength; +} + +vector<string> GenomeFile::getChromList() { + return _chromList; +} + +int GenomeFile::getNumberOfChroms() { + return _chromList.size(); +} + +string GenomeFile::getGenomeFileName() { + return _genomeFile; +} diff --git a/src/utils/GenomeFile/GenomeFile.h b/src/utils/GenomeFile/GenomeFile.h new file mode 100644 index 00000000..10f6167a --- /dev/null +++ b/src/utils/GenomeFile/GenomeFile.h @@ -0,0 +1,72 @@ +/***************************************************************************** + GenomeFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef GENOMEFILE_H +#define GENOMEFILE_H + +#include <map> +#include <string> +#include <iostream> +#include <sstream> +#include <fstream> +#include <cstring> +#include <cstdio> +#include <algorithm> // for bsearch lower_bound() +#include "api/BamReader.h" +#include "api/BamAux.h" +using namespace BamTools; + +using namespace std; + + +// typedef for mapping b/w chrom name and it's size in b.p. +typedef map<string, int, std::less<string> > chromToSizes; + + +class GenomeFile { + +public: + + // Constructor using a file + GenomeFile(const string &genomeFile); + + // Constructor using a vector of BamTools RefVector + GenomeFile(const RefVector &genome); + + // Destructor + ~GenomeFile(void); + + // load a GENOME file into a map keyed by chrom. value is size of chrom. + void loadGenomeFileIntoMap(); + + pair<string, uint32_t> projectOnGenome(uint32_t genome_pos); + + uint32_t getChromSize(const string &chrom); // return the size of a chromosome + uint32_t getGenomeSize(void); // return the total size of the geonome + vector<string> getChromList(); // return a list of chrom names + int getNumberOfChroms(); // return the number of chroms + string getGenomeFileName(); // return the name of the genome file + + + + +private: + string _genomeFile; + chromToSizes _chromSizes; + vector<string> _chromList; + + // projecting chroms into a single coordinate system + uint32_t _genomeLength; + vector<uint32_t> _startOffsets; + +}; + +#endif /* GENOMEFILE_H */ diff --git a/src/utils/GenomeFile/Makefile b/src/utils/GenomeFile/Makefile new file mode 100644 index 00000000..afaeccd4 --- /dev/null +++ b/src/utils/GenomeFile/Makefile @@ -0,0 +1,34 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/BamTools/include/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= NewGenomeFile.cpp NewGenomeFile.h GenomeFile.cpp GenomeFile.h +OBJECTS= NewGenomeFile.o GenomeFile.o +#_EXT_OBJECTS=lineFileUtilities.o fileType.o +#EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS) + @echo " * compiling GenomeFile.cpp" + @$(CXX) -c -o $(OBJ_DIR)/GenomeFile.o GenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + @echo " * compiling NewGenomeFile.cpp" + @$(CXX) -c -o $(OBJ_DIR)/NewGenomeFile.o NewGenomeFile.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +#$(EXT_OBJECTS): +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/NewGenomeFile.o $(OBJ_DIR)/GenomeFile.o + +.PHONY: clean diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp new file mode 100644 index 00000000..c6ceb8d0 --- /dev/null +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -0,0 +1,112 @@ +/***************************************************************************** + NewGenomeFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "NewGenomeFile.h" + + +NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) +: _maxId(-1) +{ + _genomeFileName = genomeFilename; + loadGenomeFileIntoMap(); +} + +NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector) +: _maxId(-1) +{ + for (size_t i = 0; i < refVector.size(); ++i) { + QuickString chrom = refVector[i].RefName; + CHRPOS length = refVector[i].RefLength; + _maxId++; + _chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId); + } +} + +// Destructor +NewGenomeFile::~NewGenomeFile(void) { +} + + +void NewGenomeFile::loadGenomeFileIntoMap() { + + FILE *fp = fopen(_genomeFileName.c_str(), "r"); + if (fp == NULL) { + fprintf(stderr, "Error: Can't open genome file %s. Exiting...\n", _genomeFileName.c_str()); + exit(1); + } + char sLine[2048]; + char chrName[2048]; + CHRPOS chrSize = 0; + while (!feof(fp)) { + memset(sLine, 0, 2048); + memset(chrName, 0, 2048); + chrSize = 0; + fgets(sLine, 2048, fp); + sscanf(sLine, "%s %d", chrName, &chrSize); + if (strlen(sLine) == 0) { + continue; + } + _maxId++; + _chromSizeIds[chrName] = pair<CHRPOS, CHRPOS>(chrSize, _maxId); + _startOffsets.push_back(_genomeLength); + _genomeLength += chrSize; + _chromList.push_back(chrName); + + } + _startOffsets.push_back(_genomeLength); //insert the final length as the last element + //to help with the lower_bound call in the projectOnGenome method. + fclose(fp); +} + +bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start) { + // search for the chrom that the position belongs on. + // add 1 to genome position b/c of zero-based, half open. + vector<CHRPOS>::const_iterator low = + lower_bound(_startOffsets.begin(), _startOffsets.end(), genome_pos + 1); + + // use the iterator to identify the appropriate index + // into the chrom name and start vectors + CHRPOS i = CHRPOS(low-_startOffsets.begin()); + if (i < 0 || i >= _chromList.size()) { + return false; //position not on genome + } + chrom = _chromList[i - 1]; + start = genome_pos - _startOffsets[i - 1]; + return true; +} + +CHRPOS NewGenomeFile::getChromSize(const QuickString &chrom) { + if (chrom == _currChromName) { + return _currChromSize; + } + lookupType::const_iterator iter= _chromSizeIds.find(chrom); + if (iter != _chromSizeIds.end()) { + _currChromName = iter->first; + _currChromSize = iter->second.first; + _currChromId = iter->second.second; + return _currChromSize; + } + return INT_MAX; +} + +CHRPOS NewGenomeFile::getChromId(const QuickString &chrom) { + if (chrom == _currChromName) { + return _currChromId; + } + lookupType::const_iterator iter= _chromSizeIds.find(chrom); + if (iter != _chromSizeIds.end()) { + _currChromName = iter->first; + _currChromSize = iter->second.first; + _currChromId = iter->second.second; + return _currChromId; + } + return INT_MAX; +} diff --git a/src/utils/GenomeFile/NewGenomeFile.h b/src/utils/GenomeFile/NewGenomeFile.h new file mode 100644 index 00000000..afd23b50 --- /dev/null +++ b/src/utils/GenomeFile/NewGenomeFile.h @@ -0,0 +1,64 @@ + +/***************************************************************************** + NewGenomeFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef NEW_GENOMEFILE_H +#define NEW_GENOMEFILE_H + + +#include "BedtoolsTypes.h" + +#include "api/BamReader.h" +#include "api/BamAux.h" + + +class NewGenomeFile { + +public: + + NewGenomeFile(const QuickString &genomeFileName); + NewGenomeFile(const BamTools::RefVector &genome); + ~NewGenomeFile(void); + + // load a GENOME file into a map keyed by chrom. value is a pair<int, int> of id and size. + void loadGenomeFileIntoMap(); + + bool projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start); + + CHRPOS getGenomeSize(void) const { return _genomeLength; } // return the total size of the genome + CHRPOS getChromSize(const QuickString &chrom); // return the size of a chromosome + CHRPOS getChromId(const QuickString &chrom); // return chromosome's sort order + const vector<QuickString> &getChromList() const { return _chromList; } // return a list of chrom names + CHRPOS getNumberOfChroms() const { return _chromList.size(); } + const QuickString &getGenomeFileName() const { return _genomeFileName; } + + + + +private: + QuickString _genomeFileName; + typedef map<QuickString, pair<CHRPOS, CHRPOS> > lookupType; + lookupType _chromSizeIds; + vector<QuickString> _chromList; + CHRPOS _maxId; + + // projecting chroms onto a single coordinate system + CHRPOS _genomeLength; + vector<CHRPOS> _startOffsets; + + //cache members for quick lookup + QuickString _currChromName; + CHRPOS _currChromSize; + CHRPOS _currChromId; + +}; + +#endif /* GENOMEFILE_H */ diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile new file mode 100644 index 00000000..65263894 --- /dev/null +++ b/src/utils/NewChromsweep/Makefile @@ -0,0 +1,36 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= NewChromsweep.cpp NewChromsweep.h +OBJECTS= NewChromsweep.o +_EXT_OBJECTS= +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/NewChromsweep.o $(BIN_DIR)/NewChromsweep.o + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp new file mode 100644 index 00000000..6c441c4e --- /dev/null +++ b/src/utils/NewChromsweep/NewChromsweep.cpp @@ -0,0 +1,216 @@ +/***************************************************************************** + NewChromsweep.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ + +#include "NewChromsweep.h" +#include "Context.h" +#include "FileRecordMgr.h" + +NewChromSweep::NewChromSweep(Context *context, + bool useMergedIntervals) +: _context(context), + _queryFRM(NULL), + _databaseFRM(NULL), + _useMergedIntervals(false), + _queryRecordsTotalLength(0), + _databaseRecordsTotalLength(0), + _wasInitialized(false), + _currQueryRec(NULL), + _currDatabaseRec(NULL) + +{ +} + + +bool NewChromSweep::init() { + + //Create new FileRecordMgrs for the two input files. + //Open them, and get the first record from each. + //if any of that goes wrong, return false; + //otherwise, return true. + _queryFRM = new FileRecordMgr(_context->getQueryFileIdx(), _context); + _databaseFRM = new FileRecordMgr(_context->getDatabaseFileIdx(), _context); + + if (!_queryFRM->open()) { + return false; + } + if (!_databaseFRM->open()) { + return false; + } + + _context->determineOutputType(); + + nextRecord(false); + if (_currDatabaseRec == NULL) { + return false; + } + _wasInitialized = true; + return true; + } + +void NewChromSweep::closeOut() { + while (!_queryFRM->eof()) { + nextRecord(true); + } + while (!_databaseFRM->eof()) { + nextRecord(false); + } +} + +NewChromSweep::~NewChromSweep(void) { + if (!_wasInitialized) { + return; + } + if (_currQueryRec != NULL) { + _queryFRM->deleteRecord(_currQueryRec); + } + if (_currDatabaseRec != NULL) { + _databaseFRM->deleteRecord(_currDatabaseRec); + } + _queryFRM->close(); + _databaseFRM->close(); + + delete _queryFRM; + delete _databaseFRM; + +} + + +void NewChromSweep::scanCache() { + recListIterType cacheIter = _cache.begin(); + while (cacheIter != _cache.end()) + { + const Record *cacheRec = cacheIter->value(); + if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) { + if (intersects(_currQueryRec, cacheRec)) { + _hits.push_back(cacheRec); + } + cacheIter = _cache.next(); + } + else { + cacheIter = _cache.deleteCurrent(); + _databaseFRM->deleteRecord(cacheRec); + } + } +} + +void NewChromSweep::clearCache() +{ + //delete all objects pointed to by cache + for (recListIterType iter = _cache.begin(); iter != _cache.end(); iter = _cache.next()) { + _databaseFRM->deleteRecord(iter->value()); + } + _cache.clear(); +} + +bool NewChromSweep::chromChange() +{ + // the files are on the same chrom + if (_currDatabaseRec == NULL || _currQueryRec->sameChrom(_currDatabaseRec)) { + return false; + } + // the query is ahead of the database. fast-forward the database to catch-up. + if (_currQueryRec->chromAfter(_currDatabaseRec)) { + + while (_currDatabaseRec != NULL && + _currQueryRec->chromAfter(_currDatabaseRec)) { + _databaseFRM->deleteRecord(_currDatabaseRec); + nextRecord(false); + } + clearCache(); + return false; + } + // the database is ahead of the query. + else { + // 1. scan the cache for remaining hits on the query's current chrom. + if (_currQueryRec->getChrName() == _currChromName) + { + scanCache(); + } + // 2. fast-forward until we catch up and report 0 hits until we do. + else if (_currQueryRec->chromBefore(_currDatabaseRec)) + { + clearCache(); + } + + return true; + } +} + + +bool NewChromSweep::next(RecordKeyList &next) { + if (_currQueryRec != NULL) { + _queryFRM->deleteRecord(_currQueryRec); + } + nextRecord(true); + if (_currQueryRec == NULL) { //eof hit! + return false; + } + + if (_currDatabaseRec == NULL && _cache.empty()) { + return false; + } + _hits.clear(); + _currChromName = _currQueryRec->getChrName(); + // have we changed chromosomes? + if (!chromChange()) { + // scan the database cache for hits + scanCache(); + //skip if we hit the end of the DB + // advance the db until we are ahead of the query. update hits and cache as necessary + while (_currDatabaseRec != NULL && + _currQueryRec->sameChrom(_currDatabaseRec) && + !(_currDatabaseRec->after(_currQueryRec))) { + if (intersects(_currQueryRec, _currDatabaseRec)) { + _hits.push_back(_currDatabaseRec); + } + if (_currQueryRec->after(_currDatabaseRec)) { + _databaseFRM->deleteRecord(_currDatabaseRec); + _currDatabaseRec = NULL; + } else { + _cache.push_back(_currDatabaseRec); + _currDatabaseRec = NULL; + } + nextRecord(false); + } + } + next.setKey(_currQueryRec); + next.setListNoCopy(_hits); + return true; +} + +void NewChromSweep::nextRecord(bool query) { + if (query) { + if (!_context->getUseMergedIntervals()) { + _currQueryRec = _queryFRM->allocateAndGetNextRecord(); + } else { + _currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); + } + if (_currQueryRec != NULL) { + _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos()); + } + } else { //database + if (!_context->getUseMergedIntervals()) { + _currDatabaseRec = _databaseFRM->allocateAndGetNextRecord(); + } else { + _currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); + } + if (_currDatabaseRec != NULL) { + _databaseRecordsTotalLength += (unsigned long)(_currDatabaseRec->getEndPos() - _currDatabaseRec->getStartPos()); + } + } +} + +bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const +{ + return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(), + _context->getOverlapFraction(), _context->getReciprocal()); +} diff --git a/src/utils/NewChromsweep/NewChromsweep.h b/src/utils/NewChromsweep/NewChromsweep.h new file mode 100644 index 00000000..abd20e92 --- /dev/null +++ b/src/utils/NewChromsweep/NewChromsweep.h @@ -0,0 +1,95 @@ +/***************************************************************************** + NewChromsweepBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef NEW_CHROMSWEEP_H +#define NEW_CHROMSWEEP_H + +using namespace std; + +#include <string> +#include "BTlist.h" +#include "RecordKeyList.h" +#include <queue> +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include "QuickString.h" + +class Record; +class FileRecordMgr; +class Context; + +class NewChromSweep { +public: + + NewChromSweep(Context *context, bool useMergedIntervals = false); + + + ~NewChromSweep(void); + bool init(); + + typedef BTlist<const Record *> recListType; + typedef const BTlistNode<const Record *> * recListIterType; + // loads next (a pair) with the current query and it's overlaps + bool next(RecordKeyList &next); + + //MUST call this method after sweep if you want the + //getTotalRecordLength methods to return the whole length of the + //record files, rather than just what was used by sweep. + void closeOut(); + + unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; } + unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; } + + // Usage: + // NewChromSweep sweep = NewChromSweep(queryFileName, databaseFileName); + // RecordKeyList hits; + // while (sweep.next(hits)) + // { + // processHits(hits); + // } + // getQueryTotalRecordLength() + // getDatabaseTotalRecordLength() + +private: + Context *_context; + FileRecordMgr *_queryFRM; + FileRecordMgr *_databaseFRM; + + bool _useMergedIntervals; + + unsigned long _queryRecordsTotalLength; + unsigned long _databaseRecordsTotalLength; + + bool _wasInitialized; + + // a cache of still active features from the database file + recListType _cache; + // the set of hits in the database for the current query + recListType _hits; + + // the current query and db features. + Record * _currQueryRec; + Record *_currDatabaseRec; + // a cache of the current chrom from the query. used to handle chrom changes. + QuickString _currChromName; + + void nextRecord(bool query); //true fetches next query record, false fetches next db record. + void nextDatabase(); + + void scanCache(); + void clearCache(); + bool chromChange(); + bool intersects(const Record *rec1, const Record *rec2) const; + +}; + +#endif /* NewChromSweep_H */ -- GitLab