diff --git a/src/intersectFile/Makefile b/src/intersectFile/Makefile index 0b37027a789f65e7f584e42e8d359653e3e129fa..e8ac9ea59f32f81ed62311c5d49259c33feac41c 100644 --- a/src/intersectFile/Makefile +++ b/src/intersectFile/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/gzstream/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/BamTools-Ancillary \ -I$(UTILITIES_DIR)/FileRecordTools/ \ diff --git a/src/nekSandbox1/Makefile b/src/nekSandbox1/Makefile index a392a8edd9ec6735877e1b14d6ba16928b9e1c3b..e63d657fc8c6f8364d652a131efb73afe406edcf 100644 --- a/src/nekSandbox1/Makefile +++ b/src/nekSandbox1/Makefile @@ -13,7 +13,8 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/general \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/GenomeFile/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src diff --git a/src/nekSandbox1/nekSandboxMain.cpp b/src/nekSandbox1/nekSandboxMain.cpp index cdc1e0a99bf360ef0142658497d424118d538741..be5aa61b847f2d105a50a2797800f410c72f11ca 100644 --- a/src/nekSandbox1/nekSandboxMain.cpp +++ b/src/nekSandbox1/nekSandboxMain.cpp @@ -16,6 +16,8 @@ using namespace std; #include "InflateStreamBuf.h" #include "InputStreamMgr.h" #include "BufferedStreamMgr.h" +#include "api/internal/io/BgzfStream_p.h" + //void doSweep(const Context *context); //void testDualQueue(Context *context); // @@ -32,48 +34,95 @@ int nek_sandbox1_main(int argc,char** argv) if (argc < 2) { cerr << "Error: Need one input file. Use \"-\" for stdin." << endl; } - - ifstream myFile(argv[1]); - if (!myFile.good()) { - cerr << "Error: Can't open genome file" << argv[1] << "Exiting..." << endl; - exit(1); - } - string sLine; - vector<QuickString> fields; - QuickString chrName; - - vector<QuickString> chroms; - chroms.push_back("1"); - chroms.push_back("2"); - chroms.push_back("10"); - chroms.push_back("11"); - - vector<int> chromCounts(4, 0); - int chromIdx = 0; - while (!myFile.eof()) { - sLine.clear(); - fields.clear(); - getline(myFile, sLine); - if (sLine[0] == '@') { - cout << sLine << endl; - continue; - } - Tokenize(sLine.c_str(), fields); - const QuickString &currChrom = fields[2]; - if (currChrom == chroms[chromIdx]) { - cout << sLine << endl; - chromCounts[chromIdx]++; - if (chromCounts[chromIdx] >= 3000) { - chromIdx++; - } - if (chromIdx > 3) { - break; - } - } - } - - return 0; - +// ifstream inFileStream(argv[1]); +// static const int BUF_SIZE = 8192; +// BamTools::Internal::BgzfStream bgStream; +// bgStream.OpenStream(&inFileStream, BamTools::IBamIODevice::ReadOnly); +// char sLine[BUF_SIZE]; +// while (bgStream.IsOpen()) { +// memset(sLine, 0, BUF_SIZE); +// bgStream.Read(sLine, BUF_SIZE-1); +// if((int)strlen(sLine) < BUF_SIZE-1) { +// bgStream.Close(); +// } +// printf("%s", sLine); +// } +// return 0; +// QuickString filename(argv[1]); +// istream *inputStream = NULL; +// if (filename == "-") { +// inputStream = &cin; +// } else { +// inputStream = new ifstream(filename.c_str()); +// } +// +// BamTools::BamReader _bamReader; +// try { +// _bamReader.OpenStream(inputStream); +// } +// catch (...) { +// fprintf(stderr, "ERROR: Unable to open BAM file from standard input.\n"); +// exit(1); +// } +//// try { +//// _bamReader.Open(argv[1]); +//// } +//// catch (...) { +//// fprintf(stderr, "ERROR: Unable to open BAM file %s\n",argv[1]); +//// exit(1); +//// } +//// } +// QuickString _bamHeader = _bamReader.GetHeaderText(); +// BamTools::RefVector _references = _bamReader.GetReferenceData(); +// +// if (_bamHeader.empty() || _references.empty()) { +// cout << "This is not a bam file." << endl; +// } else { +// cout << "This is a BAM file." << endl; +// } +// return 0; +// +// ifstream myFile(argv[1]); +// if (!myFile.good()) { +// cerr << "Error: Can't open genome file" << argv[1] << "Exiting..." << endl; +// exit(1); +// } +// string sLine; +// vector<QuickString> fields; +// QuickString chrName; +// +// vector<QuickString> chroms; +// chroms.push_back("1"); +// chroms.push_back("2"); +// chroms.push_back("10"); +// chroms.push_back("11"); +// +// vector<int> chromCounts(4, 0); +// int chromIdx = 0; +// while (!myFile.eof()) { +// sLine.clear(); +// fields.clear(); +// getline(myFile, sLine); +// if (sLine[0] == '@') { +// cout << sLine << endl; +// continue; +// } +// Tokenize(sLine.c_str(), fields); +// const QuickString &currChrom = fields[2]; +// if (currChrom == chroms[chromIdx]) { +// cout << sLine << endl; +// chromCounts[chromIdx]++; +// if (chromCounts[chromIdx] >= 3000) { +// chromIdx++; +// } +// if (chromIdx > 3) { +// break; +// } +// } +// } +// +// return 0; +// Context context; context.addInputFile(argv[1]); context.setSortedInput(true); @@ -100,9 +149,6 @@ int nek_sandbox1_main(int argc,char** argv) break; } - if (record->getStartPos() == 90647945) { - printf("Breakpoint here.\n"); - } outbuf.clear(); record->print(outbuf); printf("%s\n", outbuf.c_str()); diff --git a/src/regressTest/Makefile b/src/regressTest/Makefile index 526db6bdd21bae5e07950113e1a6bd3efb6bdbe8..e9ceebf3a43bc67ff0f1672139b174d66fb1f037 100644 --- a/src/regressTest/Makefile +++ b/src/regressTest/Makefile @@ -13,6 +13,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ -I$(UTILITIES_DIR)/version/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders \ diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile index c5189664fabe170dec31e7e4565e8325f97cfb83..4bfe2edcbc43dba6bfa25e210e5f7e5ada221a84 100644 --- a/src/utils/BinTree/Makefile +++ b/src/utils/BinTree/Makefile @@ -11,7 +11,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src # ---------------------------------- # define our source and object files diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp index c00b58316d96ead3e80a876d4b8f3ba0b0257647..a9d129d75f9fb3904ef4f7e06b0211842d2607a9 100644 --- a/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp +++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp @@ -2,60 +2,70 @@ #include "ParseTools.h" #include <cstdio> BamFileReader::BamFileReader() -: _eof(false), - _useTags(true) +: _bamReader(NULL), + _eof(false), + _useTags(true), + _shouldDeleteBamReader(false) { } BamFileReader::~BamFileReader() { - + if (_bamReader != NULL && _shouldDeleteBamReader) { + delete _bamReader; + _shouldDeleteBamReader = false; + _bamReader = NULL; + } } bool BamFileReader::open() { - if (_inputStream != NULL) { - try { - _bamReader.OpenStream(_inputStream); - } - catch (...) { - fprintf(stderr, "ERROR: Unable to open BAM file from standard input.\n"); - exit(1); - } - } else { - try { - _bamReader.Open(_filename); - } - catch (...) { - fprintf(stderr, "ERROR: Unable to open BAM file %s\n", _filename.c_str()); - exit(1); - } - } - _bamHeader = _bamReader.GetHeaderText(); - _references = _bamReader.GetReferenceData(); +// if (_bamReader == NULL) { +// _bamReader = new BamTools::BamReader(); +// _shouldDeleteBamReader = true; +// } +// if (_inputStream != NULL) { +// try { +// _bamReader->OpenStream(_inputStream); +// } +// catch (...) { +// fprintf(stderr, "ERROR: Unable to open BAM file from standard input.\n"); +// exit(1); +// } +// } else { +// try { +// _bamReader->Open(_filename); +// } +// catch (...) { +// fprintf(stderr, "ERROR: Unable to open BAM file %s\n", _filename.c_str()); +// exit(1); +// } +// } + _bamHeader = _bamReader->GetHeaderText(); + _references = _bamReader->GetReferenceData(); return true; } bool BamFileReader::isOpen() const { - return _bamReader.IsOpen(); + return _bamReader->IsOpen(); } void BamFileReader::close() { - _bamReader.Close(); +// _bamReader->Close(); } bool BamFileReader::readEntry() { if (_useTags) { - if (_bamReader.GetNextAlignment(_bamAlignment)) { + if (_bamReader->GetNextAlignment(_bamAlignment)) { return true; } } else { - if (_bamReader.GetNextAlignmentCore(_bamAlignment)) { + if (_bamReader->GetNextAlignmentCore(_bamAlignment)) { return true; } } diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.h b/src/utils/FileRecordTools/FileReaders/BamFileReader.h index 29032886bdf60b1cf4fa9afd235c85324bce1fbf..deb03a2986f1bee63d32999435dc9807b239294e 100644 --- a/src/utils/FileRecordTools/FileReaders/BamFileReader.h +++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.h @@ -31,9 +31,10 @@ public: //this is set to false, so not using them, which reduces //the run time of reading a BAM file by more than half. virtual void setUseTags(bool flag) { _useTags = flag; } + void setBamReader(BamTools::BamReader *bamReader) { _bamReader = bamReader; } virtual bool readEntry(); - virtual bool hasHeader() const { return _bamReader.IsOpen(); } //any open Bam file automatically has a header + virtual bool hasHeader() const { return _bamReader->IsOpen(); } //any open Bam file automatically has a header virtual const QuickString &getHeader() const { return _bamHeader; } const BamTools::RefVector &getReferences() const { return _references; } @@ -49,12 +50,13 @@ public: char getStrand() const; protected: - BamTools::BamReader _bamReader; + BamTools::BamReader *_bamReader; BamTools::BamAlignment _bamAlignment; bool _eof; QuickString _bamHeader; BamTools::RefVector _references; bool _useTags; + bool _shouldDeleteBamReader; void extractNameFromCore(); diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp index e68b4d3d8fe2d8b62bc152599609425866b1f595..6a4da6f73b82aa30c924e9469ba1fb424a1e9750 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp @@ -16,7 +16,9 @@ BufferedStreamMgr::BufferedStreamMgr(const QuickString &filename) _filename(filename), _mainBufCurrStartPos(0), _mainBufCurrLen(0), - _eof(false) + _eof(false), + _useBufSize(0), + _streamFinished(false) { } @@ -49,9 +51,14 @@ bool BufferedStreamMgr::init() if (!getTypeData()) { return false; } + if (_inputStreamMgr->isGzipped()) { + _useBufSize = GZIP_LINE_BUF_SIZE; + } else { + _useBufSize = MAIN_BUF_READ_SIZE; + } - _mainBuf = new bufType[MAIN_BUF_READ_SIZE +1]; - memset(_mainBuf, 0, MAIN_BUF_READ_SIZE +1); + _mainBuf = new bufType[_useBufSize +1]; + memset(_mainBuf, 0, _useBufSize +1); return true; } @@ -106,10 +113,17 @@ bool BufferedStreamMgr::readFileChunk() if (eof()) { return false; } - memset(_mainBuf, 0, MAIN_BUF_READ_SIZE +1); - - _inputStreamMgr->getFinalStream()->read((char *)_mainBuf, MAIN_BUF_READ_SIZE); - _mainBufCurrLen = _inputStreamMgr->getFinalStream()->gcount(); + memset(_mainBuf, 0, _useBufSize +1); _mainBufCurrStartPos = 0; - return _mainBufCurrLen > 0; + + if (!_streamFinished) { + _mainBufCurrLen = _inputStreamMgr->read((char *)_mainBuf, _useBufSize); + if (_mainBufCurrLen < _useBufSize) { + _streamFinished = true; + } + return _mainBufCurrLen > 0; + } else { + _mainBufCurrLen = 0; + return false; + } } diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h index 93eb30ce1d34de66eb55c42e141686d0f1e84400..6bdc0640cc113efcb88e664851c3ef7d9d249177 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h @@ -23,10 +23,11 @@ public: bool init(); const FileRecordTypeChecker & getTypeChecker() const { return _typeChecker; } - istream *getStream() { return _inputStreamMgr->getFinalStream(); } +// istream *getStream() { return _inputStreamMgr->getFinalStream(); } bool eof() const { return _eof; } bool getLine(QuickString &line); + BamTools::BamReader *getBamReader() { return _inputStreamMgr->getBamReader(); } private: InputStreamMgr *_inputStreamMgr; @@ -39,10 +40,12 @@ private: int _mainBufCurrStartPos; int _mainBufCurrLen; bool _eof; + int _useBufSize; + bool _streamFinished; //The minus ones in these constants are for leaving room for a null terminator after reading into buffers. static const int MAIN_BUF_READ_SIZE = 67108863; //64 Mb minus 1 static const int TYPE_CHECK_READ_SIZE = 4095; // 4K - static const int GZIP_LINE_BUF_SIZE = 16384; //16K + static const int GZIP_LINE_BUF_SIZE = 8191; // 8K bool readFileChunk(); bool getTypeData(); // void resetStream(); diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.cpp b/src/utils/FileRecordTools/FileReaders/FileReader.cpp index a9505b1b414011cd52bcf3161a4e50580393c8af..3c5314a533b617c5228ed1fef9fbbf9a93cdda97 100644 --- a/src/utils/FileRecordTools/FileReaders/FileReader.cpp +++ b/src/utils/FileRecordTools/FileReaders/FileReader.cpp @@ -4,12 +4,13 @@ #include "BufferedStreamMgr.h" FileReader::FileReader() -: _inputStream(NULL), +: +// _inputStream(NULL), _bufStreamMgr(NULL), _isFileOpen(false), - _mustDeleteInputStream(false), - _externalInputStream(false), - _useBufStream(true), +// _mustDeleteInputStream(false), +// _externalInputStream(false), +// _useBufStream(true), _currChromId(-1) { } @@ -21,29 +22,34 @@ bool FileReader::open() { if (_isFileOpen) { return true; - } - - printf("Inside FileReader::open.\n"); - if (!_externalInputStream && _inputStream == NULL) { - _inputStream = new ifstream(_filename.c_str(), ios::in); - _mustDeleteInputStream = true; - } - if (_inputStream == NULL || !_inputStream->good()) { + } else { fprintf(stderr, "Error: bad input stream.\n"); exit(1); } - - _isFileOpen = true; - return true; + return false; //can't reach this point, but it's here to satisfy the compiler. +// +// printf("Inside FileReader::open.\n"); +// if (!_externalInputStream && _inputStream == NULL) { +// _inputStream = new ifstream(_filename.c_str(), ios::in); +// _mustDeleteInputStream = true; +// } +// if (_inputStream == NULL || !_inputStream->good()) { +// fprintf(stderr, "Error: bad input stream.\n"); +// exit(1); +// } +// +// _isFileOpen = true; +// return true; } void FileReader::close() { - if (_mustDeleteInputStream) { - delete _inputStream; - } - return; +// if (_mustDeleteInputStream) { +// delete _inputStream; +// } +// return; } bool FileReader::eof() const { - return _useBufStream ? _bufStreamMgr->eof() : _inputStream->eof(); +// return _useBufStream ? _bufStreamMgr->eof() : _inputStream->eof(); + return _bufStreamMgr->eof(); } diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.h b/src/utils/FileRecordTools/FileReaders/FileReader.h index 2d37869067ac88e74e8b93da682fd3868bcafcfc..2b4d9e5e3633cdf0cd7a21cc91e3167fd2e9b2f8 100644 --- a/src/utils/FileRecordTools/FileReaders/FileReader.h +++ b/src/utils/FileRecordTools/FileReaders/FileReader.h @@ -16,19 +16,19 @@ public: FileReader(); virtual ~FileReader(); void setFileName(const string &filename) { _filename = filename; } - void setInputStream(istream *inputStream) { - _inputStream = inputStream; - _externalInputStream = true; - } +// void setInputStream(istream *inputStream) { +// _inputStream = inputStream; +// _externalInputStream = true; +// } void setInputStream(BufferedStreamMgr *bufStreamMgr) { _bufStreamMgr = bufStreamMgr; - _inputStream = _bufStreamMgr->getStream(); - _externalInputStream = true; - _useBufStream = true; +// _inputStream = _bufStreamMgr->getStream(); +// _externalInputStream = true; +// _useBufStream = true; // This will short circuit the open method. BufferedStreamMgr does it's own file opening. //However, for BAM, we want to re-open it. - _isFileOpen = _bufStreamMgr->getTypeChecker().isBam() ? false : true; + _isFileOpen = true; //_bufStreamMgr->getTypeChecker().isBam() ? false : true; } void setContext(const Context *context) { _context = context; } @@ -43,13 +43,13 @@ public: virtual const QuickString &getHeader() const =0; protected: string _filename; - istream *_inputStream; +// istream *_inputStream; BufferedStreamMgr *_bufStreamMgr; bool _isFileOpen; - bool _mustDeleteInputStream; - bool _externalInputStream; - bool _useBufStream; +// bool _mustDeleteInputStream; +// bool _externalInputStream; +// bool _useBufStream; int _currChromId; Context const *_context; diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp index 9711a292100dfb87cf4ec9ce6961915370dbd659..2044120bb392ebcd4dc00bff051e89d9f1a28cc1 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp @@ -22,7 +22,11 @@ InputStreamMgr::InputStreamMgr(const QuickString &filename, bool buildScanBuffer _isStdin(false), _isGzipped(false), _isBam(false), - _numBytesInBuffer(0) + _isBgzipped(false), + _bamRuledOut(false), + _numBytesInBuffer(0), + _bamReader(NULL), + _bgStream(NULL) { _possibleBamCode.resize(4, 0); } @@ -45,6 +49,14 @@ InputStreamMgr::~InputStreamMgr() { delete _infStreamBuf; _infStreamBuf = NULL; } + if (_bamReader != NULL) { + delete _bamReader; + _bgStream = NULL; + } + if (_bgStream != NULL) { + delete _bgStream; + _bgStream = NULL; + } } bool InputStreamMgr::init() @@ -79,12 +91,23 @@ bool InputStreamMgr::init() return true; } +int InputStreamMgr::read(char *data, size_t dataSize) +{ + if (_isBgzipped) { + return (int)(_bgStream->Read(data, dataSize)); + } + _finalInputStream->read(data, dataSize); + return _finalInputStream->gcount(); +} + void InputStreamMgr::populateScanBuffer() { _scanBuffer.clear(); int numChars=0; int currChar = 0; + bool mustAppend = true; while (1) { + mustAppend = true; currChar = _pushBackStreamBuf->sbumpc(); //Stop when EOF hit. if (currChar == EOF) { @@ -93,10 +116,16 @@ void InputStreamMgr::populateScanBuffer() numChars++; _scanBuffer.push_back(currChar); if (_isGzipped) { - if (bamDetected(numChars, currChar)) { + if (!_bamRuledOut && detectBamOrBgzip(numChars, currChar, mustAppend)) { return; } - _compressedSaveData.push_back(currChar); + if (numChars == 0) { + continue; //this will only happen when we've just discovered that this + //is definitely not BAM, and want to start over. + } + if (mustAppend) { + _compressedSaveData.push_back(currChar); + } } //For non-gzip, also stop if we have the minimum number of bytes and newline is hit. @@ -115,7 +144,7 @@ void InputStreamMgr::populateScanBuffer() } } -bool InputStreamMgr::bamDetected(int numChars, int currChar) +bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar, bool &mustAppend) { //Look for the BAM magic string "BAM\1" in the first fouur characters of the input stream. //In compressed form, the first char is the gzip signifier, which was already found. @@ -124,10 +153,50 @@ bool InputStreamMgr::bamDetected(int numChars, int currChar) _possibleBamCode[numChars -1] = currChar; //special: test for BAM if (numChars == 4 && _possibleBamCode[1] == 139 && _possibleBamCode[2] == 8 && _possibleBamCode[3] == 4) { - //BAM detected. + //BAM magic string detected.This is either a BAM or bgzip file. To find out which, we have to try and + //open the file as BAM, with a BAM reader, and see if the header and references are both non-empty. + //However, if they are empty, we will have had to save all bytes consumed in the attempt, meaning still + //fill the scanBuffer and push it back onto the pushBackStream as normal. + for (; numChars < BAM_SCAN_BUFFER_SIZE; numChars++) { + currChar = _pushBackStreamBuf->sbumpc(); + //Stop when EOF hit. + if (currChar == EOF) { + break; + } + _scanBuffer.push_back(currChar); + + } _pushBackStreamBuf->pushBack(_scanBuffer); + + //ok, now all the data read so far is saved in the scan buffer, and pushbackstream is reset. + //now we make a BamReader and try to open the file. + + + _bamReader = new BamTools::BamReader(); + _bamReader->OpenStream(_finalInputStream); + QuickString bamHeader(_bamReader->GetHeaderText()); + BamTools::RefVector bamRefs(_bamReader->GetReferenceData()); + + if (bamHeader.empty() || bamRefs.empty()) { + //This is NOT a bam file. + _pushBackStreamBuf->clear(); + _compressedSaveData.clear(); + //Put all bytes read so far back onto the scan buffer, then reset + //everything so that we're effectively starting over. + _pushBackStreamBuf->pushBack(_scanBuffer); + _scanBuffer.clear(); + mustAppend = false; + numChars = 0; + _isBam = false; + _isBgzipped = true; + _bamRuledOut = true; + _numBytesInBuffer = 0; + delete _bamReader; + _bamReader = NULL; + return false; + } _isBam = true; - _numBytesInBuffer = 4; + _numBytesInBuffer = _scanBuffer.size(); return true; } } @@ -155,22 +224,38 @@ void InputStreamMgr::reset() if (_isBam) { return; } - if (!_isStdin) { + //For file input, just re-open the file. _oldInputStream = _finalInputStream; _finalInputStream = new ifstream(_filename.c_str()); } else { - if (_isGzipped) { + if (_isBgzipped) { + for (BTlist<int>::const_iterator_type iter = _pushBackStreamBuf->_buffer.begin(); + iter != _pushBackStreamBuf->_buffer.end(); iter = _pushBackStreamBuf->_buffer.next()) { + _compressedSaveData.push_back(iter->value()); + } + _pushBackStreamBuf->clear(); + _pushBackStreamBuf->pushBack(_compressedSaveData); + } else if (_isGzipped) { _pushBackStreamBuf->pushBack(_compressedSaveData); } else { _pushBackStreamBuf->pushBack(BTlist<int>(_saveDataStr)); } // _finalInputStream = new istream(_pushBackStreamBuf); } - if (_isGzipped) { - //the file is gzipped, but is not BAM. + if (_isBgzipped) { + //The file is bgzipped, but not BAM. + _bgStream = new BamTools::Internal::BgzfStream(); + _bgStream->OpenStream(_finalInputStream, BamTools::IBamIODevice::ReadOnly); + } else if (_isGzipped) { + //the file is gzipped, but is not bgzipped or BAM. _infStreamBuf = new InflateStreamBuf(_finalInputStream); + if (_oldInputStream != NULL) { + delete _oldInputStream; + } _oldInputStream = _finalInputStream; _finalInputStream = new istream(_infStreamBuf); } } + + diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h index df21ce34c2388a1906f644f8e63b95f76ade4e32..7a11d4e655770a39bd60665f3eecd5e5dd897cb3 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h @@ -13,6 +13,8 @@ using namespace std; #include "PushBackStreamBuf.h" #include "InflateStreamBuf.h" #include "QuickString.h" +#include "api/BamReader.h" +#include "api/internal/io/BgzfStream_p.h" #include <iostream> @@ -22,9 +24,10 @@ public: InputStreamMgr(const QuickString &filename, bool buildScanBuffer = true); ~InputStreamMgr(); bool init(); + int read(char *data, size_t dataSize); //use getScanBuffer for auto-detection of file types. - istream *getFinalStream() { return _finalInputStream; } +// istream *getFinalStream() { return _finalInputStream; } const BTlist<int> &getScanBuffer() const { return _scanBuffer; } int getBufferLength() const { return _numBytesInBuffer; } void populateScanBuffer(); @@ -34,6 +37,7 @@ public: PushBackStreamBuf *getPushBackStreamBuf() const {return _pushBackStreamBuf; } void getSavedData(QuickString &str) const { str = _saveDataStr; } bool isBam() const { return _isBam; } + BamTools::BamReader *getBamReader() { return _bamReader; } private: QuickString _filename; @@ -48,13 +52,18 @@ private: bool _isStdin; bool _isGzipped; bool _isBam; + bool _isBgzipped; + bool _bamRuledOut; vector<int> _possibleBamCode; static const int SCAN_BUFFER_SIZE = 4096; // 4 K buffer + static const int BAM_SCAN_BUFFER_SIZE = 32768; // 32K static const int MIN_SCAN_BUFFER_SIZE = 2048; int _numBytesInBuffer; //this will hold the length of the buffer after the scan. + BamTools::BamReader *_bamReader; + BamTools::Internal::BgzfStream *_bgStream; static const char *FIFO_STRING_LITERAL; - bool bamDetected(int numChars, int currChar); + bool detectBamOrBgzip(int &numChars, int currChar, bool &mustAppend); void decompressBuffer(); }; diff --git a/src/utils/FileRecordTools/FileReaders/Makefile b/src/utils/FileRecordTools/FileReaders/Makefile index 085486444cd7fa6e70e21a6463d884102e88086b..3e6ad82db933214ac4188c7e15e4230a8053da3b 100644 --- a/src/utils/FileRecordTools/FileReaders/Makefile +++ b/src/utils/FileRecordTools/FileReaders/Makefile @@ -5,6 +5,7 @@ UTILITIES_DIR = ../../../utils/ # define our includes # ------------------- INCLUDES = -I$(UTILITIES_DIR)/BamTools/include/ \ + -I$(UTILITIES_DIR)/BamTools/src/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index ab3b516b17fc6cb47dd37987f710f7bd55479336..5d6326351d6a995aabc645fbbe19f0e95bb549a6 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -228,6 +228,7 @@ void FileRecordMgr::allocateFileReader() case FileRecordTypeChecker::BAM_FILE_TYPE: _fileReader = new BamFileReader(); (static_cast<BamFileReader *>(_fileReader))->setUseTags(_context->getUseFullBamTags()); + (static_cast<BamFileReader *>(_fileReader))->setBamReader(_bufStreamMgr->getBamReader()); break; default: break; diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index cc176890616b58f019db498b7c37f99e1ce7f95c..a3ef287748295eeba6d97b696612607437834203 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -171,6 +171,7 @@ private: unsigned long _totalMergedRecordLength; BlockMgr *_blockMgr; + BamTools::BamReader *_bamReader; void allocateFileReader(); diff --git a/src/utils/FileRecordTools/Makefile b/src/utils/FileRecordTools/Makefile index 536ab1926db4e17cac4653be2f5f069c8afb3517..69d2f4de81580f6003596cbf81187b2ab5bf3b2d 100644 --- a/src/utils/FileRecordTools/Makefile +++ b/src/utils/FileRecordTools/Makefile @@ -10,7 +10,8 @@ INCLUDES = -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/Contexts/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src SUBDIRS = ./FileReaders \ ./Records \ diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile index 757739c7d5a8ca595c8240bb4c6ecb74a7e342d3..f1e883217d366ad0a90be082fa2578862bd1b79d 100644 --- a/src/utils/FileRecordTools/Records/Makefile +++ b/src/utils/FileRecordTools/Records/Makefile @@ -9,7 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src # ---------------------------------- # define our source and object files diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile index 65263894a1a9921dd9a11213517f3dd941327c24..7c1808e989f149e4071824fb07a4d62d20452bff 100644 --- a/src/utils/NewChromsweep/Makefile +++ b/src/utils/NewChromsweep/Makefile @@ -11,7 +11,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src # ---------------------------------- # define our source and object files diff --git a/src/utils/general/BTlist.h b/src/utils/general/BTlist.h index 0a5a57646bf4d346d5934aa8de5833ba39f8c1a3..7bae4f3054d652efb2d4e04affe64ddd2fa1cc3b 100644 --- a/src/utils/general/BTlist.h +++ b/src/utils/general/BTlist.h @@ -27,6 +27,7 @@ #include "FreeList.h" #include "QuickString.h" +#include <stack> #include <cstring> //for memset template <class T> class BTlist; @@ -168,6 +169,31 @@ public: _size++; } +// void push_front(const T &val) { +// BTlistNode<T> *newNode = new BTlistNode<T>(val); +// if (empty()) { +// _begin = newNode; +// _currEnd = newNode; +// _prevCursor = NULL; +// } else { +// newNode->_next = _begin; +// _begin = newNode; +// _prevCursor = _currEnd; +// } +// _size++; +// } +// void push_front(BTlist<T> &newList) { +// stack<T> theStack; +// for (const_iterator_type iter = newList.begin(); iter != newList.end(); iter = newList.next()) { +// theStack.push(iter->_val); +// } +// while (!theStack.empty()) { +// const T &val = theStack.top(); +// theStack.pop(); +// push_front(val); +// } +// } + void clear() { if (_dontDelete) { return; @@ -207,6 +233,8 @@ public: //this method will convert the contents of the list into a QuickString. //It assumes that the templated type of the listNode can be converted to a //char using the (char) cast. The user must ensure that this is the case. + //The 2nd parameter needs only be true if you wish to append the data + //to the QuickString's existing contents. Otherwise, it will be cleared first. void toStr(QuickString &str, bool append = false) const { if (!append) { str.clear(); diff --git a/src/utils/general/PushBackStreamBuf.h b/src/utils/general/PushBackStreamBuf.h index b39a1eb1dbdafec0114a53344576548a6f79e620..0c41cd5a1937d4b441424fe3747d1e21ed5f83cb 100644 --- a/src/utils/general/PushBackStreamBuf.h +++ b/src/utils/general/PushBackStreamBuf.h @@ -15,11 +15,14 @@ using namespace std; class PushBackStreamBuf: public std::streambuf { public: + friend class InputStreamMgr; PushBackStreamBuf(streambuf* primary_stream); ~PushBackStreamBuf(); void pushBack(const BTlist<int> &vec); +// void push_front(const BTlist<int> &vec) { _buffer.push_front(vec); } int sbumpc(); + void clear() { _buffer.clear(); } protected: int uflow() { return sbumpc(); } @@ -28,7 +31,7 @@ protected: private: streambuf* _primary_stream; BTlist<int> _buffer; - static const int SCAN_BUFFER_SIZE = 4096; //4 KB buffer + static const int SCAN_BUFFER_SIZE = 8192; //8 KB buffer static const int MAIN_BUFFER_SIZE = 128 * 1024; //128K buffer };