diff --git a/src/utils/BinTree/BinTree.cpp b/src/utils/BinTree/BinTree.cpp new file mode 100644 index 0000000000000000000000000000000000000000..36e5b36e9e02f8deeb9997bc0614f1ed11769ec2 --- /dev/null +++ b/src/utils/BinTree/BinTree.cpp @@ -0,0 +1,229 @@ +#include "BinTree.h" +#include "FileRecordMgr.h" + + +BinTree::BinTree(int databaseFileIdx, Context *context) +: _databaseFileIdx(databaseFileIdx), + _context(context), + _binOffsetsExtended(NULL), + _dbFileMgr(NULL), + _showBinMetrics(false), + _maxBinNumFound(0) + { + _binOffsetsExtended = new uint32_t[NUM_BIN_LEVELS]; + memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(uint32_t)); + + //start at idx 1, because the memset above already initialized + //the first idx to zero, which is what we want. + for (uint32_t i= 1; i < NUM_BIN_LEVELS; i++) { + _binOffsetsExtended[i] = _binOffsetsExtended[i-1] + (1 << ((NUM_BIN_LEVELS - i -1) * 3)); + } +} + +BinTree::~BinTree() { + //TBD: pass all elements in tree back to FRM for proper cleanup/deletion + for (mainMapType::iterator mainIter = _mainMap.begin(); mainIter != _mainMap.end(); mainIter++) { + allBinsType bins = mainIter->second; + if (bins == NULL) { + fprintf(stderr, "ERROR: In BinTree destructor: found chromosome with NULL bin array.\n"); + continue; + } + if (!_showBinMetrics) { //don't clean up bins when simply reporting metrics. + + for (uint32_t i=0; i < NUM_BINS; i++) { + binType bin = bins[i]; + if (bin == NULL) { + continue; + } + for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) { + const Record *record = listIter->value(); + _dbFileMgr->deleteRecord(record); + } + delete bin; + bin = NULL; + } + } + delete [] bins; + bins = NULL; + } + if (_dbFileMgr != NULL) { + delete _dbFileMgr; + _dbFileMgr = NULL; + } + delete [] _binOffsetsExtended; + + if (_showBinMetrics) { + map<int, int> hitsHistogram; + FILE *fp = fopen("BinsHitFile.txt", "w"); + fprintf(fp, "The largest bin was %u\n", _maxBinNumFound); + fprintf(fp, "There were %d different bins hit by database.\n", (int)_binsHit.size()); + for (map<uint32_t, int>::iterator binIter = _binsHit.begin(); binIter != _binsHit.end(); binIter++) { + uint32_t binNum = binIter->first; + int numHits = binIter->second; + fprintf(fp, "%u\t%d\n", binNum, numHits); + hitsHistogram[numHits]++; + } + fclose(fp); + fp = fopen("BinHitsHistogram.txt", "w"); + fprintf(fp, "NumHits\tNumBins\n"); + for (map<int, int>::iterator histIter = hitsHistogram.begin(); histIter != hitsHistogram.end(); histIter++) { + fprintf(fp, "%d\t%d\n", histIter->first, histIter->second); + } + fclose(fp); + } +} + +bool BinTree::loadDB() +{ + _dbFileMgr = new FileRecordMgr(_databaseFileIdx, _context); + if (!_dbFileMgr->open()) { + fprintf(stderr, "ERROR: Can't open database file %s to build tree.\n", _context->getInputFileName(_databaseFileIdx).c_str()); + delete _dbFileMgr; + _dbFileMgr = NULL; + return false; + } + Record *record = NULL; + while (!_dbFileMgr->eof()) { + record = _dbFileMgr->allocateAndGetNextRecord(); + if (record == NULL) { + continue; + } + + if (!addRecordToTree(record)) { + fprintf(stderr, "ERROR: Unable to add record to tree.\n"); + _dbFileMgr->close(); + return false; + } + } + _dbFileMgr->close(); + + //TBD: give ERROR and return false if tree is empty. + if (_mainMap.empty()) { + fprintf(stderr, "ERROR: Tree is empty, no records added.\n"); + return false; + } + return true; + +} + +void BinTree::getHits(Record *record, RecordKeyList &hitSet) +{ + if (_showBinMetrics) { + return; //don't care about query entries just yet. + } + const QuickString &chr = record->getChrName(); + mainMapType::iterator mainIter = _mainMap.find(chr); + if (mainIter == _mainMap.end()) { + //given chrom not even in map. + return; + } + + uint32_t startBin = 0; + uint32_t endBin = 0; + + uint32_t startPos = record->getStartPos(); + uint32_t endPos = record->getEndPos(); + + startBin = (startPos >> _binFirstShift); + endBin = ((endPos-1) >> _binFirstShift); + + + const allBinsType bins = mainIter->second; + + /* SYNOPSIS: + 1. We loop through each UCSC BIN level for feature A's chrom. + 2. For each BIN, we loop through each B feature and add it to + hits if it meets all of the user's requests, which include: + (a) overlap fractio, (b) strandedness, (c) reciprocal overlap + */ + for (uint32_t i = 0; i < NUM_BIN_LEVELS; i++) { + uint32_t offset = _binOffsetsExtended[i]; + for (uint32_t j = (startBin+offset); j <= (endBin+offset); j++) { + + // move to the next bin if this one is empty + const binType &bin = bins[j]; + if (bin == NULL) { + //no list of records in this bin. + continue; + } + if (bin->empty()) { + //bin has list, but it's empty. + //Actually, this should never happen, so throw an error. + fprintf(stderr, "ERROR: found empty list for bin %u of chromosome %s\n", + j, chr.c_str()); + continue; + } + for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) { + const Record *dbRec = listIter->value(); + if (record->intersects(dbRec, _context->getSameStrand(), _context->getDiffStrand(), + _context->getOverlapFraction(), _context->getReciprocal())) { + hitSet.push_back(dbRec); + } + } + } + startBin >>= _binNextShift; + endBin >>= _binNextShift; + } +} + +bool BinTree::addRecordToTree(const Record *record) +{ + //TBD. get chr, bin. allocate all bins and single bins as needed. + const QuickString &chr = record->getChrName(); + uint32_t startPos = (uint32_t)(record->getStartPos()); + uint32_t endPos = (uint32_t)(record->getEndPos()); + + + //is this chr currently in the main map? + allBinsType bins = NULL; + mainMapType::iterator mainIter = _mainMap.find(chr); + if (mainIter == _mainMap.end()) { + //this is a new chr NOT currently in the map. + bins = new binType[NUM_BINS]; + memset(bins, 0, NUM_BINS * sizeof(binType)); + _mainMap[chr] = bins; + } else { + bins = mainIter->second; + } + uint32_t binNum = getBin(startPos, endPos); + + if (_showBinMetrics) { + if (binNum > _maxBinNumFound) { + _maxBinNumFound = binNum; + } + _binsHit[binNum]++; + return true; + } + + if (binNum < 0 || binNum >= NUM_BINS) { + fprintf(stderr, "ERROR: Received illegal bin number %u from getBin call.\n", binNum); + return false; + } + binType &bin = bins[binNum]; + if (bin == NULL) { + bin = new innerListType; + } + bin->push_back(record); + + return true; +} + +uint32_t BinTree::getBin(const Record *record) const { + return getBin((uint32_t)(record->getStartPos()), (uint32_t)(record->getEndPos())); +} + +uint32_t BinTree::getBin(uint32_t start, uint32_t end) const { + --end; + start >>= _binFirstShift; + end >>= _binFirstShift; + + for (uint32_t i = 0; i < NUM_BIN_LEVELS; ++i) { + if (start == end) { + return _binOffsetsExtended[i] + start; + } + start >>= _binNextShift; + end >>= _binNextShift; + } + //failure + return -1; +} diff --git a/src/utils/BinTree/BinTree.h b/src/utils/BinTree/BinTree.h new file mode 100644 index 0000000000000000000000000000000000000000..2d76a4612d85fb99dcd00ac631a154e220dbeb91 --- /dev/null +++ b/src/utils/BinTree/BinTree.h @@ -0,0 +1,78 @@ +/* + * BinTree.h + * + * Created on: Jan 5, 2013 + * Author: nek3d + */ + +#ifndef BINTREE_H_ +#define BINTREE_H_ + +using namespace std; + +#include <stdint.h> +#include <string> +#include <set> +#include <map> + +#include "QuickString.h" +#include "RecordKeyList.h" +#include "Context.h" + +class FileRecordMgr; +class Record; + +class BinTree { +public: + BinTree(int databaseFileIdx, Context *context); + + ~BinTree(); + bool loadDB(); + void getHits(Record *record, RecordKeyList &hitSet); + + +private: + + int _databaseFileIdx; + Context *_context; + + // + // BIN HANDLING + // + static const uint32_t NUM_BINS = 37450; + static const uint32_t NUM_BIN_LEVELS = 7; + + // bins range in size from 16kb to 512Mb + // Bin 0 spans 512Mbp, # Level 1 + // Bins 1-8 span 64Mbp, # Level 2 + // Bins 9-72 span 8Mbp, # Level 3 + // Bins 73-584 span 1Mbp # Level 4 + // Bins 585-4680 span 128Kbp # Level 5 + // Bins 4681-37449 span 16Kbp # Level 6 + uint32_t *_binOffsetsExtended; + static const uint32_t _binFirstShift = 14; /* How much to shift to get to finest bin. */ + static const uint32_t _binNextShift = 3; /* How much to shift to get to next larger bin. */ + + typedef BTlist<const Record *> innerListType; + typedef const BTlistNode<const Record *> * innerListIterType; + typedef innerListType * binType; + typedef binType * allBinsType; + typedef QuickString mainKeyType; + typedef map<mainKeyType, allBinsType> mainMapType; + mainMapType _mainMap; + + FileRecordMgr *_dbFileMgr; + + bool _showBinMetrics; + uint32_t _maxBinNumFound; + map<uint32_t, int> _binsHit; + + bool addRecordToTree(const Record *); + uint32_t getBin(uint32_t start, uint32_t end) const; + uint32_t getBin(const Record *record) const; + + +}; + + +#endif /* BINTREE_H_ */ diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..c5189664fabe170dec31e7e4565e8325f97cfb83 --- /dev/null +++ b/src/utils/BinTree/Makefile @@ -0,0 +1,36 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= BinTree.cpp BinTree.h +OBJECTS= BinTree.o +_EXT_OBJECTS= +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/BinTree.o + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/Contexts/Context.cpp b/src/utils/Contexts/Context.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fc24bb0622483fddcb25f06e3711edecf895a47a --- /dev/null +++ b/src/utils/Contexts/Context.cpp @@ -0,0 +1,307 @@ +/* + * Context.cpp + * + * Created on: Feb 12, 2013 + * Author: nek3d + */ + +#include "Context.h" + +Context::Context() +: + _program(UNSPECIFIED_PROGRAM), + _useMergedIntervals(false), + _genomeFile(NULL), + _outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), + _outputTypeDetermined(false), + _skipFirstArgs(0), + _showHelp(false), + _obeySplits(false), + _uncompressedBam(false), + _anyHit(false), + _noHit(false), + _writeA(false), + _writeB(false), + _leftJoin(false), + _writeCount(false), + _writeOverlap(false), + _writeAllOverlap(false), + _haveFraction(false), + _overlapFraction(1E-9), + _reciprocal(false), + _sameStrand(false), + _diffStrand(false), + _sortedInput(false), + _printHeader(false), + _printable(true), + _explicitBedOutput(false), + _queryFileIdx(-1), + _databaseFileIdx(-1), + _bamHeaderAndRefIdx(-1), + _maxNumDatabaseFields(0), + _reportCount(false), + _maxDistance(0), + _reportNames(false), + _reportScores(false) +{ + _validScoreOps.insert("sum"); + _validScoreOps.insert("max"); + _validScoreOps.insert("min"); + _validScoreOps.insert("mean"); + _validScoreOps.insert("mode"); + _validScoreOps.insert("median"); + _validScoreOps.insert("antimode"); + _validScoreOps.insert("collapse"); +} + +Context::~Context() +{ + if (_genomeFile != NULL) { + delete _genomeFile; + _genomeFile = NULL; + } +} + +bool Context::determineOutputType() { + if (_outputTypeDetermined) { + return true; + } + //test whether output should be BED or BAM. + //If the user explicitly requested BED, then it's BED. + //Otherwise, if there are any BAM files in the input, + //then the output should be BAM. + if (getExplicitBedOutput() || getQueryFileType() != FileRecordTypeChecker::BAM_FILE_TYPE) { + setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE); + } else { + setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE); + } + _outputTypeDetermined = true; + return true; + +} + +void Context::openGenomeFile(const QuickString &genomeFilename) +{ + _genomeFile = new NewGenomeFile(genomeFilename.c_str()); +} + +void Context::openGenomeFile(const BamTools::RefVector &refVector) +{ + _genomeFile = new NewGenomeFile(refVector); +} + +void Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { + _argc = argc; + _argv = argv; + _skipFirstArgs = skipFirstArgs; + + _argsProcessed.resize(argc - skipFirstArgs, false); + + for (int i=skipFirstArgs; i < argc; i++) { + if (isUsed(i - skipFirstArgs)) { + continue; + } + + if (strcmp(argv[i], "-i") == 0) { + addInputFile(argv[i+1]); + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "-g") == 0) { + openGenomeFile(argv[i+1]); + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "-h") == 0) { + setShowHelp(true); + markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "--help") == 0) { + setShowHelp(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-split") == 0) { + setObeySplits(true); + markUsed(i - skipFirstArgs); + } + if (strcmp(argv[i], "-a") == 0) { + addInputFile(argv[i+1]); + _queryFileIdx = getNumInputFiles() -1; + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-abam") == 0) { + addInputFile(argv[i+1]); + _queryFileIdx = getNumInputFiles() -1; + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + setInputFileType(_queryFileIdx, FileRecordTypeChecker::BAM_FILE_TYPE); + } + else if (strcmp(argv[i], "-b") == 0) { + addInputFile(argv[i+1]); + _databaseFileIdx = getNumInputFiles() -1; + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "-u") == 0) { + setAnyHit(true); + markUsed(i - skipFirstArgs); + } else if(strcmp(argv[i], "-f") == 0) { + if ((i+1) < argc) { + setHaveFraction(true); + setOverlapFraction(atof(argv[i + 1])); + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } + } + else if(strcmp(argv[i], "-bed") == 0) { + setExplicitBedOutput(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-wa") == 0) { + setWriteA(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-wb") == 0) { + setWriteB(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-wo") == 0) { + setWriteOverlap(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-wao") == 0) { + setWriteAllOverlap(true); + setWriteOverlap(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-c") == 0) { + setWriteCount(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-r") == 0) { + setReciprocal(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-v") == 0) { + setNoHit(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-s") == 0) { + setSameStrand(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-S") == 0) { + setDiffStrand(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-loj") == 0) { + setLeftJoin(true); + markUsed(i - skipFirstArgs); + } + else if (strcmp(argv[i], "-ubam") == 0) { + setUncompressedBam(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-sorted") == 0) { + setSortedInput(true); + markUsed(i - skipFirstArgs); + } + else if(strcmp(argv[i], "-header") == 0) { + setPrintHeader(true); + markUsed(i - skipFirstArgs); + } + } +} + +bool Context::isValidState() +{ + if (!Context::cmdArgsValid()) { + return false; + } + if (getAnyHit() && getNoHit()) { + _errorMsg = "Error: request either -u for anyHit OR -v for noHit, not both."; + return false; + } + if (getWriteCount()) { + if (getAnyHit()) { + _errorMsg = "Error: request either -c for writeCount OR -u for anyHit, not both."; + return false; + } else if (getWriteB()) { + _errorMsg = "Error: request either -c for writeCount OR -wb for writeB, not both."; + return false; + } else if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) { + _errorMsg = "Error: writeCount option is not valid with BAM query input, unless bed output is specified with -bed option."; + return false; + } + } + if (getWriteOverlap()) { + + if (getWriteA()) { + _errorMsg = "Error: request either -wa for writeA OR -wo for writeOverlap, not both."; + return false; + } else if (getWriteB()) { + _errorMsg = "Error: request either -wb for writeB OR -wo for writeOverlap, not both."; + return false; + } else if (getWriteCount()) { + _errorMsg = "Error: request either -c for writeCount OR -wo for writeOverlap, not both."; + return false; + } else if (getAnyHit()) { + _errorMsg = "Error: request either -u for anyHit OR -wo for writeOverlap, not both."; + return false; + } else if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) { + _errorMsg = "Error: writeAllOverlap option is not valid with BAM query input, unless bed output is specified with -bed option."; + return false; + } + } + if (getWriteB() || getLeftJoin()) { + if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) { + cerr << endl << "*****" << endl << "*****WARNING: -wb and -loj are ignored with bam input, unless bed output is specified with -bed option." << endl << "*****" << endl; + } + } + if (getSameStrand() && getDiffStrand()) { + _errorMsg = "Error: request -s for sameStrand, or -S for diffStrand, not both."; + return false; + } + + if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && getPrintHeader()) { + cerr << endl << "*****" << endl << "*****WARNING: -header option is not valid for BAM input." << endl << "*****" << endl; + setPrintHeader(false); + } + if (getAnyHit() || getNoHit() || getWriteCount()) { + setPrintable(false); + } + return _inputFiles.size() > 0; +} + +bool Context::cmdArgsValid() +{ + bool retval = true; + for (int i = _skipFirstArgs; i < _argc; i++) { + if (!isUsed(i - _skipFirstArgs)) { + _errorMsg += "\nERROR. Unrecognized argument: "; + _errorMsg += _argv[i]; + retval = false; + } + } + return retval; +} + +int Context::getBamHeaderAndRefIdx() { + if (_bamHeaderAndRefIdx != -1) { + //already found which BAM file to usefor the header + return _bamHeaderAndRefIdx; + } + if (_inputFiles[_queryFileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) { + _bamHeaderAndRefIdx = _queryFileIdx; + } else { + _bamHeaderAndRefIdx = _databaseFileIdx; + } + return _bamHeaderAndRefIdx; +} + + + + diff --git a/src/utils/Contexts/Context.h b/src/utils/Contexts/Context.h new file mode 100644 index 0000000000000000000000000000000000000000..f65528d00c0ec1cfdbcf1c1c3cce2aa529a60a8c --- /dev/null +++ b/src/utils/Contexts/Context.h @@ -0,0 +1,250 @@ +/* + * ContextBase.h + * + * Created on: Feb 11, 2013 + * Author: nek3d + */ + +#ifndef CONTEXTBASE_H_ +#define CONTEXTBASE_H_ + + +// This is an abstract base class for all context objects. +// +// Context classes handle the settings for an operation, +// such as merge, intersect, jaccard, etc. +// +// Settings include the input and output parameters, such as input +// files, file types (if explicitly provided), genome files, +// run options, output format, etc. + +#include "BedtoolsTypes.h" +#include "FileRecordTypeChecker.h" +#include "NewGenomeFile.h" +#include "api/BamReader.h" +#include "api/BamAux.h" + + +class Context { +public: + Context(); + ~Context(); + + typedef FileRecordTypeChecker::FILE_TYPE ContextFileType; + typedef FileRecordTypeChecker::RECORD_TYPE ContextRecordType; + + typedef enum {UNSPECIFIED_PROGRAM, INTERSECT, WINDOW, CLOSEST, COVERAGE, MAP, GENOMECOV, MERGE, CLUSTER, + COMPLEMENT, SUBTRACT, SLOP, FLANK, SORT, RANDOM, SHUFFLE, ANNOTATE, MULTIINTER, UNIONBEDG, PAIRTOBED, + PAIRTOPAIR,BAMTOBED, BEDTOBAM, BEDTOFASTQ, BEDPETOBAM, BED12TOBED6, GETFASTA, MASKFASTA, NUC, + MULTICOV, TAG, JACCARD, OVERLAP, IGV, LINKS,MAKEWINDOWS, GROUPBY, EXPAND } PROGRAM_TYPE; + + PROGRAM_TYPE getProgram() const { return _program; } + void setProgram(PROGRAM_TYPE program) { _program = program; } + + void addInputFile(const QuickString &inputFile, + ContextFileType explicitFileType = FileRecordTypeChecker::UNKNOWN_FILE_TYPE, + ContextRecordType explicitRecordType = FileRecordTypeChecker::UNKNOWN_RECORD_TYPE) { + _inputFiles.push_back(FileEntryType(inputFile, explicitFileType, explicitRecordType)); + } + + int getNumInputFiles() const { return _inputFiles.size(); } + const QuickString &getInputFileName(int fileNum) const { return _inputFiles[fileNum]._fileName; } + ContextFileType getInputFileType(int fileNum) const { return _inputFiles[fileNum]._fileType; } + void setInputFileType(int fileNum, ContextFileType fileType) { _inputFiles[fileNum]._fileType = fileType; } + ContextRecordType getInputRecordType(int fileNum) const { return _inputFiles[fileNum]._recordType; } + void setInputRecordType(int fileNum, ContextRecordType recordType) { _inputFiles[fileNum]._recordType = recordType; } + + + bool determineOutputType(); + + const QuickString &getHeader(int fileIdx) { return _headers[fileIdx]; } + void setHeader(int fileIdx, const QuickString &header) { _headers[fileIdx] = header; } + const BamTools::RefVector &getReferences(int fileIdx) { return _references[fileIdx]; } + void setReferences(int fileIdx, const BamTools::RefVector &refs) { _references[fileIdx] = refs; } + int getBamHeaderAndRefIdx(); //return idx of 1st query that is BAM. If none, first DB that is BAM. + + bool getUseMergedIntervals() const { return _useMergedIntervals; } + void setUseMergedIntervals(bool val) { _useMergedIntervals = val; } + + void openGenomeFile(const QuickString &genomeFilename); + void openGenomeFile(const BamTools::RefVector &refVector); + bool hasGenomeFile() const { return _genomeFile != NULL; } + NewGenomeFile *getGenomeFile() const { return _genomeFile; } + + void setOutputFileType(ContextFileType fileType) { _outputFileType = fileType; } + ContextFileType getOutputFileType() const { return _outputFileType; } + + void parseCmdArgs(int argc, char **argv, int skipFirstArgs); + + //isValidState checks that parameters to context are in an acceptable state. + // If not, the error msg string will be set with a reason why it failed. + bool isValidState(); + bool getShowHelp() const { return _showHelp; } + void setShowHelp(bool val) { _showHelp = val; } + + const QuickString &getErrorMsg() const { return _errorMsg; } + void setErrorMessage(const QuickString &errorMsg) { _errorMsg = errorMsg; } + + //split handling. + bool getObeySplits() const {return _obeySplits; } + void setObeySplits(bool val) { _obeySplits = val; } + + //Decide whether output format is BED or BAM. + //Default is BAM if any input files are BAM. + bool getExplicitBedOutput() const { return _explicitBedOutput; } + void setExplicitBedOutput(bool val) { _explicitBedOutput = val; } + + bool getUncompressedBam() const { return _uncompressedBam; } + void setUncompressedBam(bool val) { _uncompressedBam = val; } + // + // INTERSECT METOHDS + // + //NOTE: Query and database files will only be marked as such by either the + //parseCmdArgs method, or by explicitly setting them. + int getQueryFileIdx() const { return _queryFileIdx; } + void setQueryFileIdx(int idx) { _queryFileIdx = idx; } + int getDatabaseFileIdx() const { return _databaseFileIdx; } + void setDatabaseFileIdx(int idx) { _databaseFileIdx = idx; } + const QuickString &getQueryFileName() const { return _inputFiles[_queryFileIdx]._fileName; } + const QuickString &getDatabaseFileName() const { return _inputFiles[_databaseFileIdx]._fileName; } + ContextFileType getQueryFileType() const { return _inputFiles[_queryFileIdx]._fileType; } + ContextFileType getDatabaseFileType() const { return _inputFiles[_databaseFileIdx]._fileType; } + ContextRecordType getQueryRecordType() const { return _inputFiles[_queryFileIdx]._recordType; } + ContextRecordType getDatabaseRecordType() const { return _inputFiles[_databaseFileIdx]._recordType; } + int getMaxNumDatabaseFields() const { return _maxNumDatabaseFields; } + void setMaxNumDatabaseFields(int val) { _maxNumDatabaseFields = val; } + + bool getAnyHit() const {return _anyHit; } + void setAnyHit(bool val) { _anyHit = val; } + + bool getNoHit() const {return _noHit; } + void setNoHit(bool val) { _noHit = val; } + + bool getLeftJoin() const {return _leftJoin; } + void setLeftJoin(bool val) { _leftJoin = val; } + + bool getWriteA() const {return _writeA; } + void setWriteA(bool val) { _writeA = val; } + + bool getWriteB() const {return _writeB; } + void setWriteB(bool val) { _writeB = val; } + + bool getWriteCount() const {return _writeCount; } + void setWriteCount(bool val) { _writeCount = val; } + + bool getWriteOverlap() const {return _writeOverlap; } + void setWriteOverlap(bool val) { _writeOverlap = val; } + + bool getWriteAllOverlap() const {return _writeAllOverlap; } + void setWriteAllOverlap(bool val) { _writeAllOverlap = val; } + + bool getHaveFraction() const {return _haveFraction; } + void setHaveFraction(bool val) { _haveFraction = val; } + + float getOverlapFraction() const { return _overlapFraction; } + void setOverlapFraction(float fraction) { _overlapFraction = fraction; } + + bool getReciprocal() const {return _reciprocal; } + void setReciprocal(bool val) { _reciprocal = val; } + + bool getSameStrand() const {return _sameStrand; } + void setSameStrand(bool val) { _sameStrand = val; } + + bool getDiffStrand() const {return _diffStrand; } + void setDiffStrand(bool val) { _diffStrand = val; } + + bool getSortedInput() const {return _sortedInput; } + void setSortedInput(bool val) { _sortedInput = val; } + + bool getPrintHeader() const {return _printHeader; } + void setPrintHeader(bool val) { _printHeader = val; } + + bool getPrintable() const { return _printable; } + void setPrintable(bool val) { _printable = val; } + + // + // MERGE METHODS + // + bool getReportCount() const { return _reportCount; } + void setReportCount(bool val) { _reportCount = val; } + + int getMaxDistance() const { return _maxDistance; } + void setMaxDistance(int distance) { _maxDistance = distance; } + + bool getReportNames() const { return _reportNames; } + void setReportNames(bool val) { _reportNames = val; } + + bool getReportScores() const { return _reportScores; } + void setReportScores(bool val) { _reportScores = val; } + + const QuickString &getScoreOp() const { return _scoreOp; } + void setScoreOp(const QuickString &op) { _scoreOp = op; } + + +protected: + PROGRAM_TYPE _program; + + class FileEntryType { + public: + FileEntryType(const QuickString &filename, ContextFileType fileType, ContextRecordType recordType) + : _fileName(filename), _fileType(fileType), _recordType(recordType) {} + QuickString _fileName; + ContextFileType _fileType; + ContextRecordType _recordType; + }; + vector<FileEntryType> _inputFiles; + + bool _useMergedIntervals; + NewGenomeFile *_genomeFile; + + ContextFileType _outputFileType; + bool _outputTypeDetermined; + + map<int, QuickString> _headers; + map<int, BamTools::RefVector> _references; + + QuickString _errorMsg; + vector<bool> _argsProcessed; //used for processing cmdLine args. + int _argc; + char **_argv; + int _skipFirstArgs; + bool _showHelp; + bool _obeySplits; + bool _uncompressedBam; + + bool _anyHit; + bool _noHit; + bool _writeA; + bool _writeB; + bool _leftJoin; + bool _writeCount; + bool _writeOverlap; + bool _writeAllOverlap; + bool _haveFraction; + float _overlapFraction; + bool _reciprocal; + bool _sameStrand; + bool _diffStrand; + bool _sortedInput; + bool _printHeader; + bool _printable; + bool _explicitBedOutput; + int _queryFileIdx; + int _databaseFileIdx; + int _bamHeaderAndRefIdx; + int _maxNumDatabaseFields; + + bool _reportCount; + int _maxDistance; + bool _reportNames; + bool _reportScores; + QuickString _scoreOp; + set<QuickString> _validScoreOps; + + void markUsed(int i) { _argsProcessed[i] = true; } + bool isUsed(int i) const { return _argsProcessed[i]; } + bool cmdArgsValid(); + +}; + +#endif /* CONTEXT_H_ */ diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..df9a819e5be900882f484cdd033c2aeaf4e47631 --- /dev/null +++ b/src/utils/Contexts/Makefile @@ -0,0 +1,34 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= Context.cpp Context.h +OBJECTS= Context.o +_EXT_OBJECTS=ParseTools.o QuickString.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/Context.o + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2fc71a357a81b4823a79dfca71d1dc570ce59f9b --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp @@ -0,0 +1,117 @@ +#include "BamFileReader.h" +#include "ParseTools.h" +#include <cstdio> +BamFileReader::BamFileReader() +: _eof(false), + _useTags(true) +{ + +} + +BamFileReader::~BamFileReader() +{ + +} + +bool BamFileReader::open() +{ + if (_inputStream != NULL) { + try { + _bamReader.OpenStream(_inputStream); + } + catch (...) { + fprintf(stderr, "ERROR: Unable to open BAM file from standard input.\n"); + exit(1); + } + } else { + try { + _bamReader.Open(_filename); + } + catch (...) { + fprintf(stderr, "ERROR: Unable to open BAM file %s\n", _filename.c_str()); + exit(1); + } + } + _bamHeader = _bamReader.GetHeaderText(); + _references = _bamReader.GetReferenceData(); + + return true; +} + +bool BamFileReader::isOpen() const +{ + return _bamReader.IsOpen(); +} + +void BamFileReader::close() +{ + _bamReader.Close(); +} + +bool BamFileReader::readEntry() +{ + if (_useTags) { + if (_bamReader.GetNextAlignment(_bamAlignment)) { + return true; + } + } else { + if (_bamReader.GetNextAlignmentCore(_bamAlignment)) { + return true; + } + } + //hit end of file + _eof = true; + return false; +} + +void BamFileReader::getChrName(QuickString &str) const +{ + int refId = _bamAlignment.RefID; + if (refId < 0) { + return; + } + str = _references[refId].RefName; +} + +int BamFileReader::getChrId() const +{ + return _bamAlignment.RefID; +} + +int BamFileReader::getStartPos() const +{ + return _bamAlignment.Position; +} + +int BamFileReader::getEndPos() const +{ + return _bamAlignment.GetEndPosition(false, false); +} + +void BamFileReader::getName(QuickString &str) const +{ + if (!_useTags) { + str = _bamAlignment.SupportData.AllCharData.c_str(); + } else { + str = _bamAlignment.Name; + } + if (_bamAlignment.IsFirstMate()) { + str += "/1"; + } + else if (_bamAlignment.IsSecondMate()) { + str += "/2"; + } +} + +void BamFileReader::getScore(QuickString &str) const +{ + int2str(_bamAlignment.MapQuality, str); +} + +char BamFileReader::getStrand() const +{ + if (_bamAlignment.IsReverseStrand()) { + return '-'; + } + return '+'; +} diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.h b/src/utils/FileRecordTools/FileReaders/BamFileReader.h new file mode 100644 index 0000000000000000000000000000000000000000..9f8cb426c8bbe913f7989ff8fa1a3dc4ce74bd2d --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.h @@ -0,0 +1,64 @@ +/* + * BamFileReader.h + * + * Created on: Dec 4, 2012 + * Author: nek3d + */ + +#ifndef BAMFILEREADER_H_ +#define BAMFILEREADER_H_ + +using namespace std; + +#include "FileReader.h" +#include "QuickString.h" +#include "api/BamReader.h" +#include "api/BamAux.h" + +class BamFileReader : public FileReader { +public: + BamFileReader(); + virtual ~BamFileReader(); + virtual bool open(); //open the file + virtual bool isOpen() const; + virtual void close(); + virtual bool eof() const { + return _eof; + } + + //setUseTags will tell the BamReader to give us all + //the extra tag information in a BAM record. By default, + //this is set to false, so not using them, which reduces + //the run time of reading a BAM file by more than half. + virtual void setUseTags(bool flag) { _useTags = flag; } + virtual bool readEntry(); + + virtual bool hasHeader() const { return _bamReader.IsOpen(); } //any open Bam file automatically has a header + virtual const QuickString &getHeader() const { return _bamHeader; } + const BamTools::RefVector &getReferences() const { return _references; } + + const BamTools::BamAlignment &getAlignment() const { return _bamAlignment; } + + + void getChrName(QuickString &) const; + int getChrId() const; + int getStartPos() const; + int getEndPos() const; + void getName(QuickString &) const; + void getScore(QuickString &) const; + char getStrand() const; + +protected: + BamTools::BamReader _bamReader; + BamTools::BamAlignment _bamAlignment; + bool _eof; + QuickString _bamHeader; + BamTools::RefVector _references; + bool _useTags; + + + void extractNameFromCore(); +}; + + +#endif /* BAMFILEREADER_H_ */ diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e68b4d3d8fe2d8b62bc152599609425866b1f595 --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp @@ -0,0 +1,115 @@ +/* + * BufferedStreamMgr.cpp + * + * Created on: Jul 9, 2013 + * Author: nek3d + */ + +#include "BufferedStreamMgr.h" +#include "CompressionTools.h" +#include "InputStreamMgr.h" +#include <fstream> + +BufferedStreamMgr::BufferedStreamMgr(const QuickString &filename) +: _inputStreamMgr(NULL), + _mainBuf(NULL), + _filename(filename), + _mainBufCurrStartPos(0), + _mainBufCurrLen(0), + _eof(false) +{ + +} + +BufferedStreamMgr::~BufferedStreamMgr() +{ + + if (_mainBuf != NULL) { + delete [] _mainBuf; + } + if (_inputStreamMgr != NULL) { + delete _inputStreamMgr; + _inputStreamMgr = NULL; + } +} + +bool BufferedStreamMgr::init() +{ + _inputStreamMgr = new InputStreamMgr(_filename); + if (!_inputStreamMgr->init()) { + return false; + } + if (_inputStreamMgr->isBam()) { + //there is a special check for a BAM file's magic string inside + //the inputStreamMgr's init method. If it is found, we can + //stop here. + _typeChecker.setBam(); + return true; + } + if (!getTypeData()) { + return false; + } + + _mainBuf = new bufType[MAIN_BUF_READ_SIZE +1]; + memset(_mainBuf, 0, MAIN_BUF_READ_SIZE +1); + + return true; +} + +bool BufferedStreamMgr::getTypeData() +{ + QuickString currScanBuffer; + _inputStreamMgr->getSavedData(currScanBuffer); + do { + if (!_typeChecker.scanBuffer(currScanBuffer.c_str(), currScanBuffer.size()) && !_typeChecker.needsMoreData()) { + return false; + } else if (_typeChecker.needsMoreData()) { + _inputStreamMgr->populateScanBuffer(); + currScanBuffer.clear(); + _inputStreamMgr->getSavedData(currScanBuffer); + } + } while (_typeChecker.needsMoreData()); + _inputStreamMgr->reset(); + return true; +} + +bool BufferedStreamMgr::getLine(QuickString &line) +{ + line.clear(); + + if (_mainBufCurrStartPos >= _mainBufCurrLen) { + if (!readFileChunk()) { + _eof = true; + return false; + } + } + while (1) { + int searchPos = _mainBufCurrStartPos; + while (searchPos < _mainBufCurrLen && _mainBuf[searchPos] != '\n') { + searchPos++; + } + + line.append((char *)_mainBuf + _mainBufCurrStartPos, searchPos - _mainBufCurrStartPos); + _mainBufCurrStartPos = searchPos +1; + if (searchPos == _mainBufCurrLen) { //hit end of buffer, but no newline yet + if (!readFileChunk()) { //hit eof + return true; + } + } else if (_mainBuf[searchPos] == '\n') { + return true; + } + } +} + +bool BufferedStreamMgr::readFileChunk() +{ + if (eof()) { + return false; + } + memset(_mainBuf, 0, MAIN_BUF_READ_SIZE +1); + + _inputStreamMgr->getFinalStream()->read((char *)_mainBuf, MAIN_BUF_READ_SIZE); + _mainBufCurrLen = _inputStreamMgr->getFinalStream()->gcount(); + _mainBufCurrStartPos = 0; + return _mainBufCurrLen > 0; +} diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..93eb30ce1d34de66eb55c42e141686d0f1e84400 --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.h @@ -0,0 +1,52 @@ +/* + * BufferedStreamMgr.h + * + * Created on: Jul 9, 2013 + * Author: nek3d + */ + +#ifndef BUFFEREDSTREAMMGR_H_ +#define BUFFEREDSTREAMMGR_H_ + +using namespace std; + +#include <iostream> +#include "QuickString.h" +#include "FileRecordTypeChecker.h" +#include "InputStreamMgr.h" + +class BufferedStreamMgr { +public: + BufferedStreamMgr(const QuickString &filename); + ~BufferedStreamMgr(); + + bool init(); + + const FileRecordTypeChecker & getTypeChecker() const { return _typeChecker; } + istream *getStream() { return _inputStreamMgr->getFinalStream(); } + + bool eof() const { return _eof; } + bool getLine(QuickString &line); + +private: + InputStreamMgr *_inputStreamMgr; + typedef unsigned char bufType; + bufType *_mainBuf; + + FileRecordTypeChecker _typeChecker; + QuickString _filename; + + int _mainBufCurrStartPos; + int _mainBufCurrLen; + bool _eof; + //The minus ones in these constants are for leaving room for a null terminator after reading into buffers. + static const int MAIN_BUF_READ_SIZE = 67108863; //64 Mb minus 1 + static const int TYPE_CHECK_READ_SIZE = 4095; // 4K + static const int GZIP_LINE_BUF_SIZE = 16384; //16K + bool readFileChunk(); + bool getTypeData(); +// void resetStream(); +}; + + +#endif /* BUFFEREDSTREAMMGR_H_ */ diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.cpp b/src/utils/FileRecordTools/FileReaders/FileReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a9505b1b414011cd52bcf3161a4e50580393c8af --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/FileReader.cpp @@ -0,0 +1,49 @@ +#include <iostream> +#include <cstring> +#include "FileReader.h" +#include "BufferedStreamMgr.h" + +FileReader::FileReader() +: _inputStream(NULL), + _bufStreamMgr(NULL), + _isFileOpen(false), + _mustDeleteInputStream(false), + _externalInputStream(false), + _useBufStream(true), + _currChromId(-1) +{ +} + +FileReader::~FileReader() { +} + +bool FileReader::open() { + + if (_isFileOpen) { + return true; + } + + printf("Inside FileReader::open.\n"); + if (!_externalInputStream && _inputStream == NULL) { + _inputStream = new ifstream(_filename.c_str(), ios::in); + _mustDeleteInputStream = true; + } + if (_inputStream == NULL || !_inputStream->good()) { + fprintf(stderr, "Error: bad input stream.\n"); + exit(1); + } + + _isFileOpen = true; + return true; +} + +void FileReader::close() { + if (_mustDeleteInputStream) { + delete _inputStream; + } + return; +} + +bool FileReader::eof() const { + return _useBufStream ? _bufStreamMgr->eof() : _inputStream->eof(); +} diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.h b/src/utils/FileRecordTools/FileReaders/FileReader.h new file mode 100644 index 0000000000000000000000000000000000000000..2d37869067ac88e74e8b93da682fd3868bcafcfc --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/FileReader.h @@ -0,0 +1,58 @@ +#ifndef FILEREADER_H_ +#define FILEREADER_H_ + +using namespace std; + +#include <string> +#include <fstream> +#include <map> +#include "QuickString.h" +#include "Context.h" + +#include "BufferedStreamMgr.h" + +class FileReader { +public: + FileReader(); + virtual ~FileReader(); + void setFileName(const string &filename) { _filename = filename; } + void setInputStream(istream *inputStream) { + _inputStream = inputStream; + _externalInputStream = true; + } + void setInputStream(BufferedStreamMgr *bufStreamMgr) { + _bufStreamMgr = bufStreamMgr; + _inputStream = _bufStreamMgr->getStream(); + _externalInputStream = true; + _useBufStream = true; + + // This will short circuit the open method. BufferedStreamMgr does it's own file opening. + //However, for BAM, we want to re-open it. + _isFileOpen = _bufStreamMgr->getTypeChecker().isBam() ? false : true; + + } + void setContext(const Context *context) { _context = context; } + virtual bool open(); //open the file + virtual bool isOpen() const { return _isFileOpen; } + virtual void close(); + virtual bool eof() const; + virtual bool readEntry() =0; // this is an abstract base class. + //Derived classes could read text, BAM, possibly FASTA, and/or Variant files, as well as Blast/SIM4 output. + virtual int getCurrChromdId() const { return _currChromId; } + virtual bool hasHeader() const = 0; + virtual const QuickString &getHeader() const =0; +protected: + string _filename; + istream *_inputStream; + BufferedStreamMgr *_bufStreamMgr; + + bool _isFileOpen; + bool _mustDeleteInputStream; + bool _externalInputStream; + bool _useBufStream; + int _currChromId; + Context const *_context; + +}; + +#endif // FILEREADER_H_ diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3139026505e4ad997ee58dd12c24892a36098b6b --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp @@ -0,0 +1,171 @@ +/* + * InputStreamMgr.cpp + * + * Created on: Mar 21, 2013 + * Author: nek3d + */ + +#include "InputStreamMgr.h" +#include <cstring> //for memset +#include "gzstream.h" +#include "CompressionTools.h" + +InputStreamMgr::InputStreamMgr(const QuickString &filename, bool buildScanBuffer) +: + _filename(filename), + _pushBackStreamBuf(NULL), + _inputFileStream(NULL), + _infStreamBuf(NULL), + _oldInputStream(NULL), + _isStdin(false), + _isGzipped(false), + _isBam(false), + _numBytesInBuffer(0) +{ + _possibleBamCode.resize(4, 0); +} + + +InputStreamMgr::~InputStreamMgr() { + if (_pushBackStreamBuf != NULL) { + delete _pushBackStreamBuf; + _pushBackStreamBuf = NULL; + } + if (_inputFileStream != NULL) { + delete _inputFileStream; + _inputFileStream = NULL; + } + if (_oldInputStream != NULL) { + delete _oldInputStream; + _oldInputStream = NULL; + } + if (_infStreamBuf != NULL) { + delete _infStreamBuf; + _infStreamBuf = NULL; + } +} + +bool InputStreamMgr::init() +{ + if (_filename == "-" || _filename == "stdin") { //stdin + _isStdin = true; + //peek at the first char of stdin to see if this is gzipped. + if ((unsigned char)cin.peek() == 0x1f) { + _isGzipped = true; + } + _pushBackStreamBuf = new PushBackStreamBuf(cin.rdbuf()); + } else { + _inputFileStream = new ifstream(_filename.c_str()); + if (_inputFileStream->fail()) { + cerr << "Error: Unable to open file " << _filename << ". Exiting." << endl; + delete _inputFileStream; + _inputFileStream = NULL; + exit(1); + } + //peek at the first char of stdin to see if this is gzipped. + if ((unsigned char)_inputFileStream->peek() == 0x1f) { + _isGzipped = true; + } + _pushBackStreamBuf = new PushBackStreamBuf(_inputFileStream->rdbuf()); + } + //now we have a PushBackStreamBuf. Make a new stream. + _finalInputStream = new istream(_pushBackStreamBuf); + populateScanBuffer(); + return true; +} + +void InputStreamMgr::populateScanBuffer() +{ + _scanBuffer.clear(); + int numChars=0; + int currChar = 0; + while (1) { + currChar = _pushBackStreamBuf->sbumpc(); + //Stop when EOF hit. + if (currChar == EOF) { + break; + } + numChars++; + _scanBuffer.push_back(currChar); + if (_isGzipped) { + if (bamDetected(numChars, currChar)) { + return; + } + _compressedSaveData.push_back(currChar); + } + + //For non-gzip, also stop if we have the minimum number of bytes and newline is hit. + //For gzip, stop at SCAN_BUFFER_SIZE. + if ((!_isGzipped && (currChar == '\n' && numChars >= MIN_SCAN_BUFFER_SIZE )) || (_isGzipped && numChars >= SCAN_BUFFER_SIZE)) { + break; + } + } + _numBytesInBuffer = _scanBuffer.size(); + + //append it to the savedDataStr. If it's gzipped, decompress it first. + if (_isGzipped) { + decompressBuffer(); + } else { + _scanBuffer.toStr(_saveDataStr, true); + } +} + +bool InputStreamMgr::bamDetected(int numChars, int currChar) +{ + //Look for the BAM magic string "BAM\1" in the first fouur characters of the input stream. + //In compressed form, the first char is the gzip signifier, which was already found. + //The next three are the integers 139, 8, and 4. + if (numChars < 5) { + _possibleBamCode[numChars -1] = currChar; + //special: test for BAM + if (numChars == 4 && _possibleBamCode[1] == 139 && _possibleBamCode[2] == 8 && _possibleBamCode[3] == 4) { + //BAM detected. + _pushBackStreamBuf->pushBack(_scanBuffer); + _isBam = true; + _numBytesInBuffer = 4; + return true; + } + } + return false; +} + +void InputStreamMgr::decompressBuffer() +{ + //allocate an array to hold uncompressed data. + uInt maxDecompressSize = 20 * _numBytesInBuffer; + unsigned char *newScanBuffer = new unsigned char[maxDecompressSize]; + memset(newScanBuffer, 0, maxDecompressSize); + + unsigned int numDecompressChars = inflateGzippedArray(_scanBuffer, newScanBuffer, maxDecompressSize, _numBytesInBuffer); + + // newScanBuffer should now contain uncompressed data. + //delete old buffer, point it at new buffer. + _saveDataStr.append((char *)newScanBuffer, numDecompressChars); + + delete [] newScanBuffer; +} + +void InputStreamMgr::reset() +{ + if (_isBam) { + return; + } + + if (!_isStdin) { + _oldInputStream = _finalInputStream; + _finalInputStream = new ifstream(_filename.c_str()); + } else { + if (_isGzipped) { + _pushBackStreamBuf->pushBack(_compressedSaveData); + } else { + _pushBackStreamBuf->pushBack(BTlist<int>(_saveDataStr)); + } +// _finalInputStream = new istream(_pushBackStreamBuf); + } + if (_isGzipped) { + //the file is gzipped, but is not BAM. + _infStreamBuf = new InflateStreamBuf(_finalInputStream); + _oldInputStream = _finalInputStream; + _finalInputStream = new istream(_infStreamBuf); + } +} diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..4e22c31ed19a01bf9f3edc8316c9faebd56f782e --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h @@ -0,0 +1,60 @@ +/* + * InputStreamMgr.h + * + * Created on: Mar 21, 2013 + * Author: nek3d + */ + +#ifndef INPUTSTREAMMGR_H_ +#define INPUTSTREAMMGR_H_ + +using namespace std; + +#include "PushBackStreamBuf.h" +#include "InflateStreamBuf.h" +#include "QuickString.h" + +#include <iostream> + +class InputStreamMgr { +public: + //Contsructor: 1st arg can be "-" for stdin. set 2nd arg false if fileType already known. + InputStreamMgr(const QuickString &filename, bool buildScanBuffer = true); + ~InputStreamMgr(); + bool init(); + + //use getScanBuffer for auto-detection of file types. + istream *getFinalStream() { return _finalInputStream; } + const BTlist<int> &getScanBuffer() const { return _scanBuffer; } + int getBufferLength() const { return _numBytesInBuffer; } + void populateScanBuffer(); + void reset(); + const QuickString &getSavedData() const { return _saveDataStr; } + bool isGzipped() const { return _isGzipped; } + PushBackStreamBuf *getPushBackStreamBuf() const {return _pushBackStreamBuf; } + void getSavedData(QuickString &str) const { str = _saveDataStr; } + bool isBam() const { return _isBam; } + +private: + QuickString _filename; + PushBackStreamBuf *_pushBackStreamBuf; + ifstream *_inputFileStream; + BTlist<int> _scanBuffer; + QuickString _saveDataStr; + BTlist<int> _compressedSaveData; + InflateStreamBuf *_infStreamBuf; + istream * _finalInputStream; + istream *_oldInputStream; + bool _isStdin; + bool _isGzipped; + bool _isBam; + vector<int> _possibleBamCode; + static const int SCAN_BUFFER_SIZE = 4096; // 4 K buffer + static const int MIN_SCAN_BUFFER_SIZE = 2048; + int _numBytesInBuffer; //this will hold the length of the buffer after the scan. + bool bamDetected(int numChars, int currChar); + void decompressBuffer(); + +}; + +#endif /* INPUTSTREAMMGR_H_ */ diff --git a/src/utils/FileRecordTools/FileReaders/Makefile b/src/utils/FileRecordTools/FileReaders/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..085486444cd7fa6e70e21a6463d884102e88086b --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/Makefile @@ -0,0 +1,39 @@ +OBJ_DIR = ../../../../obj/ +BIN_DIR = ../../../../bin/ +UTILITIES_DIR = ../../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/BamTools/include/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= FileReader.h FileReader.cpp SingleLineDelimTransferBuffer.h SingleLineDelimTransferBuffer.cpp \ + SingleLineDelimTextFileReader.h SingleLineDelimTextFileReader.cpp BamFileReader.h BamFileReader.cpp \ + BufferedStreamMgr.h BufferedStreamMgr.cpp InputStreamMgr.h InputStreamMgr.cpp +OBJECTS= FileReader.o SingleLineDelimTransferBuffer.o SingleLineDelimTextFileReader.o BamFileReader.o BufferedStreamMgr.o InputStreamMgr.o +_EXT_OBJECTS=ParseTools.o QuickString.o CompressionTools.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/FileReader.o $(OBJ_DIR)SingleLineDelimTransferBuffer $(OBJ_DIR)/SingleLineDelimTextFileReader.o \ + $(OBJ_DIR)/BinaryFileReader.o $(OBJ_DIR)/BamFileReader.o $(OBJ_DIR)/BufferedStreamMgr.o $(OBJ_DIR)/InputStreamMgr.o $(OBJ_DIR)/PushBackGzStream.o + +.PHONY: clean diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d3c20bef8db98d3b4c6ee5dc1f14be063e1bef6f --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp @@ -0,0 +1,138 @@ +#include "SingleLineDelimTextFileReader.h" +#include <iostream> +#include "BufferedStreamMgr.h" +#include "ParseTools.h" + +SingleLineDelimTextFileReader::SingleLineDelimTextFileReader(int numFields, char delimChar) +: _numFields(numFields), + _delimChar(delimChar), + _fullHeaderFound(false), + _currDataPos(0), + _lineNum(0) +{ + _delimPositions = new int[numFields +1]; +} + +SingleLineDelimTextFileReader::~SingleLineDelimTextFileReader() +{ + delete [] _delimPositions; +} + +bool SingleLineDelimTextFileReader::readEntry() +{ + if (!_isFileOpen) { + return false; + } + + if (_bufStreamMgr->eof()) { + return false; + } + if (!_bufStreamMgr->getLine(_sLine)) { + return false; + } + _lineNum++; + if (_sLine.empty()) { + return false; + } + + //scan the whole header in one call. + bool wasHeader = false; + while (detectAndHandleHeader()) { //header line + if (!_bufStreamMgr->getLine(_sLine)) { + return false; + _lineNum++; + } + } + //after the first time we find a header, any other header line + //doesn't count, and should not be added to the header, because it is a + // "commented out record." + _fullHeaderFound = true; + + //check to make sure line has something besides whitespace. + bool hasNonSpace = false; + int lineLen = _sLine.size(); + for (int i=0; i < lineLen; i++) { + if (!isspace(_sLine[i])) { + hasNonSpace = true; + break; + } + } + if (!hasNonSpace) { + return false; + } + + //trim off any white space from end of line. + int currPos = lineLen-1; + while (isspace(_sLine[currPos])) { + currPos--; + } + _sLine.resize(currPos +1); + + if (wasHeader) { + return true; + } + + if (!findDelimiters()) { + return false; + } + if (_context->hasGenomeFile()) { + getField(0, _currChromStr); + _currChromId = _context->getGenomeFile()->getChromId(_currChromStr); + } + return true; +} + + +void SingleLineDelimTextFileReader::getField(int fieldNum, QuickString &str) const { + int startPos = _delimPositions[fieldNum] +1; + int endPos = _delimPositions[fieldNum+1]; + str.assign(_sLine.c_str() + startPos, endPos - startPos); +} + + +void SingleLineDelimTextFileReader::getField(int fieldNum, int &val) { + getField(fieldNum, _tempChrPosStr); + val = str2chrPos(_tempChrPosStr.c_str()); +} + +void SingleLineDelimTextFileReader::getField(int fieldNum, char &val) const { + val = _sLine[_delimPositions[fieldNum] +1]; +} + +void SingleLineDelimTextFileReader::appendField(int fieldNum, QuickString &str) const { + int startPos = _delimPositions[fieldNum] +1; + int endPos = _delimPositions[fieldNum+1]; + str.append(_sLine.c_str() + startPos, endPos - startPos); +} + +bool SingleLineDelimTextFileReader::detectAndHandleHeader() +{ + if (!isHeaderLine(_sLine)) { + return false; + } + if (!_fullHeaderFound) { + _header += _sLine; + _header += "\n"; //add new line, since it was chomped by getline + } + return true; +} + +bool SingleLineDelimTextFileReader::findDelimiters() { + memset(_delimPositions, 0, (_numFields +1) * sizeof(int)); + //scan the line for delimiters, determine their positions + _delimPositions[0] = -1; + int currField=1; + int len = (int)_sLine.size(); + for (int i=0; i < len; i++) { + if (_sLine[i] == _delimChar) { + _delimPositions[currField] = i; + currField++; + } + } + _delimPositions[currField] = len; + if (currField != _numFields) { + cerr << "Error: line number " << _lineNum << " of file " << _filename << " has " << currField << " fields, but " << _numFields << " were expected." << endl; + exit(1); + } + return true; +} diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.h b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.h new file mode 100644 index 0000000000000000000000000000000000000000..954ca2a17dcac76110220ac95f754b902762b3d4 --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.h @@ -0,0 +1,46 @@ +/* + * SingleLineTextFileReader.h + * + * Created on: Nov 8, 2012 + * Author: nek3d + */ + +#ifndef SINGLELINETEXTFILEREADER_H_ +#define SINGLELINETEXTFILEREADER_H_ + +using namespace std; + +#include "FileReader.h" +#include "QuickString.h" + +class SingleLineDelimTextFileReader : public FileReader { +public: + SingleLineDelimTextFileReader(int numFields, char delimChar = '\t'); + ~SingleLineDelimTextFileReader(); + + virtual bool readEntry(); + virtual int getNumFields() const { return _numFields; } + virtual void getField(int numField, QuickString &val) const; + virtual void getField(int numField, int &val); //this signaiture isn't const because it operates on an internal QuickString for speed. + virtual void getField(int fieldNum, char &val) const; + virtual void appendField(int fieldNum, QuickString &str) const; + virtual const QuickString &getHeader() const { return _header; } + virtual bool hasHeader() const { return _fullHeaderFound; } +protected: + int _numFields; + char _delimChar; + QuickString _header; + bool _fullHeaderFound; + int _currDataPos; + QuickString _sLine; + int *_delimPositions; + QuickString _currChromStr; + QuickString _tempChrPosStr; + int _lineNum; + bool detectAndHandleHeader(); + bool findDelimiters(); + +}; + + +#endif /* SINGLELINETEXTFILEREADER_H_ */ diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.cpp b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dec5e317d92ddc11d4c21fd7d107bbc6c0322b0a --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.cpp @@ -0,0 +1,72 @@ +#include "SingleLineDelimTransferBuffer.h" +#include <cstring> +#include <iostream> +#include <cstdio> +#include <cstdlib> + +SingleLineDelimTransferBuffer::SingleLineDelimTransferBuffer(int numFields, char delimChar) +: _numFields(numFields), + _delimChar(delimChar) +{ + _fields = new char *[_numFields]; + for (int i=0; i < numFields; i++) { + _fields[i] = new char[MAX_FIELD_SIZE]; + memset(_fields[i], 0, MAX_FIELD_SIZE); + } +} + +SingleLineDelimTransferBuffer::~SingleLineDelimTransferBuffer() +{ + for (int i=0; i < _numFields; i++) { + delete [] _fields[i]; + _fields[i] = NULL; + } + delete [] _fields; + _fields = NULL; +} + +bool SingleLineDelimTransferBuffer::initFromInput(const char *inBuffer) +{ + clear(); + if (strlen(inBuffer) == 0) { + return false; + } + const char *oldCursor = (const char *)inBuffer; + const char *newCursor = (const char *)inBuffer; + int fieldNum = 0; + while (oldCursor[0] != '\0' && oldCursor[0] != '\n') { + while (newCursor[0] != _delimChar && newCursor[0] != '\0') { + newCursor++; + } + if (newCursor != oldCursor) { + memcpy(_fields[fieldNum], oldCursor, newCursor - oldCursor); + fieldNum++; + } + if (newCursor[0] != '\0') { + oldCursor = newCursor +1; + newCursor = oldCursor; + } else { + break; + } + } + if (fieldNum == _numFields) { + return true; + } + return false; +} + +const char *SingleLineDelimTransferBuffer::getField(int fieldNum) const +{ + if (fieldNum < _numFields) { + return _fields[fieldNum]; + } + cerr << "Error: Requested field " << fieldNum << "from TransferBuffer with only " << _numFields << " fields." <<endl; + exit(1); +} + +void SingleLineDelimTransferBuffer::clear(void) +{ + for (int i=0; i < _numFields; i++) { + memset(_fields[i], 0, MAX_FIELD_SIZE); + } +} diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.h b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.h new file mode 100644 index 0000000000000000000000000000000000000000..9398ff773b7a26a8e645a3e7cf3d94331860d63c --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTransferBuffer.h @@ -0,0 +1,31 @@ +/* + * SingleLineDelimTransferBuffer.h + * + * Created on: Nov 8, 2012 + * Author: nek3d + */ + +#ifndef SINGLELINEDELIMTRANSFERBUFFER_H_ +#define SINGLELINEDELIMTRANSFERBUFFER_H_ + +using namespace std; + +class SingleLineDelimTransferBuffer { +public: + SingleLineDelimTransferBuffer(int numFields, char delim='\t'); + ~SingleLineDelimTransferBuffer(); + bool initFromInput(const char *inBuffer); + int getNumFields() const { return _numFields; } + const char *getField(int fieldNum) const; + +protected: + void clear(); + + static const int MAX_FIELD_SIZE = 8192; //Given field can not exceed 8K. + int _numFields; + char _delimChar; + char **_fields; +}; + + +#endif /* SINGLELINEDELIMTRANSFERBUFFER_H_ */ diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0f2110a8e9fb791e7f607e603a178be3de8e0e54 --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -0,0 +1,361 @@ + +#include "FileRecordMgr.h" +#include "Context.h" +#include "FreeList.h" +#include "Record.h" + +FileRecordMgr::FileRecordMgr(int fileIdx, Context *context) +: + _bufStreamMgr(NULL), + _contextFileIdx(fileIdx), + _context(context), + _fileReader(NULL), + _fileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), + _recordType(FileRecordTypeChecker::UNKNOWN_RECORD_TYPE), + _recordMgr(NULL), + _freeListBlockSize(512), + _useFullBamTags(false), + _headerSet(false), + _prevStart(INT_MAX), + _mustBeForward(false), + _mustBeReverse(false), + _totalRecordLength(0), + _totalMergedRecordLength(0) +{ +} + +FileRecordMgr::~FileRecordMgr(){ + + if (_bufStreamMgr != NULL) { + delete _bufStreamMgr; + _bufStreamMgr = NULL; + } + if (_fileReader != NULL) { + close(); //just make sure file was closed. + delete _fileReader; + _fileReader = NULL; + } + if (_recordMgr != NULL) { + delete _recordMgr; + _recordMgr = NULL; + } +} + +bool FileRecordMgr::open(){ + + const QuickString &filename = _context->getInputFileName(_contextFileIdx); + _bufStreamMgr = new BufferedStreamMgr(filename); + if (!_bufStreamMgr->init()) { + fprintf(stderr, "Error: unable to open file or unable to determine types for file %s.\n", filename.c_str()); + delete _bufStreamMgr; + _bufStreamMgr = NULL; + exit(1); + } + + _fileType = _bufStreamMgr->getTypeChecker().getFileType(); + _recordType = _bufStreamMgr->getTypeChecker().getRecordType(); + if (_fileType == FileRecordTypeChecker::UNKNOWN_FILE_TYPE || _recordType == FileRecordTypeChecker::UNKNOWN_RECORD_TYPE) { + fprintf(stderr, "Error: Unable to determine type for file %s.\n", filename.c_str()); + delete _bufStreamMgr; + _bufStreamMgr = NULL; + exit(1); + } + allocateFileReader(); + _recordMgr = new RecordMgr(_recordType, _freeListBlockSize); + + _fileReader->setFileName(filename.c_str()); + _fileReader->setInputStream(_bufStreamMgr); + _fileReader->setContext(_context); + if (!_fileReader->open()) { + fprintf(stderr, "Error: Types determined but can't open file %s.\n", filename.c_str()); + delete _bufStreamMgr; + _bufStreamMgr = NULL; + exit(1); + } + _context->setInputFileType(_contextFileIdx, _fileType); + _context->setInputRecordType(_contextFileIdx, _recordType); + + //if this is a database file, and the numFields is greater than the numFields in other DB files, update. + // TBD: This is one of many things that will need to be changed to support multiple database files in future versions. + int numFields = _bufStreamMgr->getTypeChecker().getNumFields(); + if (_contextFileIdx == _context->getDatabaseFileIdx() && numFields > _context->getMaxNumDatabaseFields()) { + _context->setMaxNumDatabaseFields(numFields); + } + if (_fileType == FileRecordTypeChecker::BAM_FILE_TYPE) { + _context->setHeader(_contextFileIdx, _fileReader->getHeader()); + _context->setReferences(_contextFileIdx, static_cast<BamFileReader *>(_fileReader)->getReferences()); + _headerSet = true; + } + return true; +} + +void FileRecordMgr::close(){ + if (_bufStreamMgr != NULL) { + delete _bufStreamMgr; + _bufStreamMgr = NULL; + } + if (_fileReader != NULL) { + _fileReader->close(); + delete _fileReader; + _fileReader = NULL; + } +} + +bool FileRecordMgr::eof(){ + return _storedRecords.empty() && _fileReader->eof() ? true: false; +} + +Record *FileRecordMgr::allocateAndGetNextRecord() +{ + if (!_fileReader->isOpen()) { + return NULL; + } + if (!_fileReader->readEntry()) { + return NULL; + } + if (!_headerSet && _fileReader->hasHeader()) { + _context->setHeader(_contextFileIdx, _fileReader->getHeader()); + _headerSet = true; + } + Record *record = NULL; + record = _recordMgr->allocateRecord(); + if (!record->initFromFile(_fileReader)) { + _recordMgr->deleteRecord(record); + return NULL; + } + + //test for sorted order, if necessary. + if (_context->getSortedInput()) { + testInputSortOrder(record); + } + _totalRecordLength += (unsigned long)(record->getEndPos() - record->getStartPos()); + return record; +} + +void FileRecordMgr::testInputSortOrder(const Record *record) +{ + //user specified that file must be sorted. Check that it is so. + // TBD: In future versions, we might not want/need all files to be sorted, + // even if the -sorted option is used, depending on number of input files + // and program being run. Should that occur, this block will need adjusting. + // NEK - 9/5/13 + + const QuickString &currChrom = record->getChrName(); + if (currChrom != _prevChrom) { + if ( _foundChroms.find(currChrom) != _foundChroms.end()) { + //this is a different chrom than the last record had, but we've already seen this chrom. + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); + QuickString errBuf; + record->print(errBuf); + fprintf(stderr, "%s\n", errBuf.c_str()); + exit(1); + } else { + //new chrom has not been seen before. + _foundChroms.insert(currChrom); + _prevChrom = currChrom; + _prevStart = INT_MAX; + } + } else if (record->getStartPos() < _prevStart) { //same chrom as last record, but with lower startPos, so still out of order. + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); + QuickString errBuf; + record->print(errBuf); + fprintf(stderr, "%s\n", errBuf.c_str()); + exit(1); + } + _prevStart = record->getStartPos(); + +} + +void FileRecordMgr::deleteRecord(const Record *record) { + _recordMgr->deleteRecord(record); +} + +void FileRecordMgr::allocateFileReader() +{ + switch (_fileType) { + case FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE: + case FileRecordTypeChecker::VCF_FILE_TYPE: + _fileReader = new SingleLineDelimTextFileReader(_bufStreamMgr->getTypeChecker().getNumFields(), _bufStreamMgr->getTypeChecker().getDelimChar()); + break; + + case FileRecordTypeChecker::BAM_FILE_TYPE: + _fileReader = new BamFileReader(); + (static_cast<BamFileReader *>(_fileReader))->setUseTags(_useFullBamTags); + break; + default: + break; + } +} + +Record *FileRecordMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { + RecordKeyList recList; + if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { + return NULL; + } + deleteAllMergedItemsButKey(recList); + return const_cast<Record *>(recList.getKey()); //want key to be non-const +} + +bool FileRecordMgr::allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand, int maxDistance) +{ + if (!recList.allClear()) { + deleteMergedRecord(recList); + } + + _mustBeForward = desiredStrand == SAME_STRAND_FORWARD; + _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE; + + Record *startRecord = tryToTakeFromStorage(); + + // if we couldn't use a previously stored record for starters, + //then begin with a new one that matches strand criteria. + while (startRecord == NULL) { + startRecord = allocateAndGetNextRecord(); + if (startRecord == NULL) { //hit EOF!! + return false; + } + + if (_mustBeForward && !startRecord->getStrand()) { + //record is reverse, wanted forward. + addToStorage(startRecord); + startRecord = NULL; + } else if (_mustBeReverse && startRecord->getStrand()) { + //record is forward, wanted reverse + addToStorage(startRecord); + startRecord = NULL; + } + } + if (startRecord->getStartPos() == 107071272) { + printf("Break point.\n"); + } + + // OK!! We have a start record! + + _mustBeForward = desiredStrand == SAME_STRAND_FORWARD || (desiredStrand == SAME_STRAND_EITHER && startRecord->getStrand()); + _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE || (desiredStrand == SAME_STRAND_EITHER && !startRecord->getStrand()); + + const QuickString &currChrom = startRecord->getChrName(); + _foundChroms.insert(currChrom); + + bool madeComposite = false; + recList.push_back(startRecord); + recList.setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. + + bool currStrand = startRecord->getStrand(); + bool mustMatchStrand = desiredStrand != ANY_STRAND; + + int currEnd = startRecord->getEndPos(); + //now look for more records to merge with this one. + //stop when they're out of range, not on the same chromosome, or we hit EOF. + //ignore if they don't comply with strand. + Record *nextRecord = NULL; + while (nextRecord == NULL) { + bool takenFromStorage = false; + nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); + if (nextRecord == NULL) { + nextRecord = allocateAndGetNextRecord(); + } else { + takenFromStorage = true; + } + if (nextRecord == NULL) { // EOF hit + break; + } + const QuickString &newChrom = nextRecord->getChrName(); + if (newChrom != currChrom) { //hit a different chromosome. + if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { + //haven't seen this chromosome before. + addToStorage(nextRecord); + break; + } else { + //different strand, but we've already seen this chrom. File is not sorted. + fprintf(stderr, "ERROR: Input file %s is not sorted by chromosome, startPos.\n", _context->getInputFileName(_contextFileIdx).c_str()); + deleteRecord(nextRecord); + deleteMergedRecord(recList); + exit(1); + } + } + int nextStart = nextRecord->getStartPos(); + //is the record out of range? + if (nextStart > currEnd + maxDistance) { + //yes, it's out of range. + addToStorage(nextRecord); + break; + } + + //ok, they're on the same chrom and in range. Are we happy with the strand? + if (mustMatchStrand && nextRecord->getStrand() != currStrand) { + //no, we're not. + addToStorage(nextRecord); + nextRecord = NULL; + continue; + } + //everything's good! do a merge. + recList.push_back(nextRecord); + madeComposite = true; + int nextEnd = nextRecord->getEndPos(); + if (nextEnd > currEnd) { + currEnd = nextEnd; + } + nextRecord = NULL; + } + if (madeComposite) { + Record *newKey = _recordMgr->allocateRecord(); + (*newKey) = (*startRecord); + newKey->setEndPos(currEnd); + recList.setKey(newKey); + } + _totalMergedRecordLength += (unsigned long)(recList.getKey()->getEndPos() - recList.getKey()->getStartPos()); + return true; +} + +void FileRecordMgr::addToStorage(Record *record) { + _storedRecords.push(record); +} + +Record *FileRecordMgr::tryToTakeFromStorage() { + Record *record = _storedRecords.empty() ? NULL : const_cast<Record *>(_storedRecords.top()); + if (record != NULL) { + _storedRecords.pop(); + } + return record; +} + +Record *FileRecordMgr::tryToTakeFromStorage(bool strand) { + Record *record = NULL; + if(strand) { + if (_storedRecords.emptyForward()) { + return NULL; + } else { + record = const_cast<Record *>(_storedRecords.topForward()); + _storedRecords.popForward(); + return record; + } + } else { + if (_storedRecords.emptyReverse()) { + return NULL; + } else { + record = const_cast<Record *>(_storedRecords.topReverse()); + _storedRecords.popReverse(); + return record; + } + } +} + +void FileRecordMgr::deleteMergedRecord(RecordKeyList &recList) +{ + deleteAllMergedItemsButKey(recList); + deleteRecord(recList.getKey()); + recList.setKey(NULL); +} + +void FileRecordMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { + //if the key is also in the list, this method won't delete it. + for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + if (iter->value() == recList.getKey()) { + continue; + } + deleteRecord(iter->value()); + } + recList.clearList(); +} + diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..46c58a95e7bf7d37d20209e54b8eb36ff70c1367 --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -0,0 +1,192 @@ +/* + * FileRecordMgr.h + * + * Created on: Nov 8, 2012 + * Author: nek3d + */ + +#ifndef FILERECORDMGR_H_ +#define FILERECORDMGR_H_ + +using namespace std; + +#include <string> +#include "QuickString.h" +#include <set> +#include "DualQueue.h" + +//include headers for all FileReader and derivative classes. +#include "Context.h" +#include "BufferedStreamMgr.h" +#include "FileReader.h" +#include "SingleLineDelimTextFileReader.h" +#include "BamFileReader.h" + +//record manager and all record classes +#include "RecordMgr.h" + +#include "RecordKeyList.h" +#include "BlockMgr.h" + +class Record; +class FileRecordMgr { +public: + FileRecordMgr(int inputFileIdx, Context *context); + ~FileRecordMgr(); + bool open(); + void close(); + bool eof(); + + //NOTE: FOR Non-BAM FILES, HEADER INFORMATION IS CURRENTLY ONLY AVAILABLE AFTER 1st CALL + //TO allocateAndGetNextRecord(). + bool hasHeader() const { return _fileReader->hasHeader(); } + const QuickString &getHeader() const { return _fileReader->getHeader(); } + + bool recordsHaveName() const { + testBufferedMgr(); + return _bufStreamMgr->getTypeChecker().recordTypeHasName(_recordType); + } + bool recordsHaveScore() const { + testBufferedMgr(); + return _bufStreamMgr->getTypeChecker().recordTypeHasScore(_recordType); + } + bool recordsHaveStrand() const { + testBufferedMgr(); + return _bufStreamMgr->getTypeChecker().recordTypeHasStrand(_recordType); + } + + FileRecordTypeChecker::FILE_TYPE getFileType() const { + testBufferedMgr(); + return _fileType; + } + FileRecordTypeChecker::RECORD_TYPE getRecordType() const { + testBufferedMgr(); + return _recordType; + } + const string &getFileTypeName() const { + testBufferedMgr(); + return _bufStreamMgr->getTypeChecker().getFileTypeName(); + } + + const string &getRecordTypeName() const { + testBufferedMgr(); + return _bufStreamMgr->getTypeChecker().getRecordTypeName(); + } + + //This is an all-in-one method to give the user a new record that is initialized with + //the next entry in the data file. + //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! + //Also Note! User must check for NULL returned, meaning we failed to get the next record. + Record *allocateAndGetNextRecord(); + void deleteRecord(const Record *); + + + ////////////////////////////////////////////////////////////////////////////////// + // + // MERGED RECORDS + // + //this will give a single "meta" record containing "flattened" or merged records. + // + // 1st ARG: Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, + // second is list of constituent Records merged. + // ** NOTE ** If the RecordKeyList is not empty, this method will empty it for you and delete all contents! + // + // 2nd ARG: Choose from WANT_STRAND_TYPE, defined below below + // + // 3rd ARG: allows for nearby records, i.e. maxDistance 100 will merge records <= 100 bases apart. Default 0 means only + // merge records that actually intersect. + // + // Return value: true if any records found. False if eof hit before records matching requested parameters found. + + typedef enum { SAME_STRAND_FORWARD, //must all be forward strand + SAME_STRAND_REVERSE, //must all be reverse strand + SAME_STRAND_EITHER, //must be same strand, but can be either forward or reverse + ANY_STRAND } //do no care about strand (Default value) + WANT_STRAND_TYPE; + + // + // WARNING!! Specifying a strand will keep all records on the other strand in memory!! + // This is done so that requests for records on that other strand can still be met. + // For now, use this method at any time to purge the kept records from memory, such as + // when changing chromosomes, for example. + void purgeKeepList(); + bool allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); + void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! + + //this method will allocate a new record of merged records, but the returned record should only be passed to the deleteRecord method + //for cleanup, not to the delete mmerged record. + Record *allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); + + // + // END MERGED RECORDS + // + ////////////////////////////////////////////////////////////////////////////////// + + + //File statistics + unsigned long getTotalRecordLength() const { return _totalRecordLength; } //sum of length of all returned records + unsigned long getTotalMergedRecordLength() const { return _totalMergedRecordLength; } // sum of all merged intervals + + + + //Setting the freeListBlockSize is optional. If the user never calls this, + //the blockSize defaults to 512. + void setFreeListBlockSize(int blockSize) { _freeListBlockSize = blockSize; } + + //special: For BAM files, our default is to not use all the + //tag information in a BAM file, which reduces the run time in some + //cases by more than 50%. But setting this method to true + //will use all the tags, if more information is desired. + //MUST BE CALLED BEFORE THE BAM FILE IS OPEN. + void setFullBamFlags(bool flag) { _useFullBamTags = flag; } + +private: + BufferedStreamMgr *_bufStreamMgr; + int _contextFileIdx; + + Context *_context; + FileReader *_fileReader; + FileRecordTypeChecker::FILE_TYPE _fileType; + FileRecordTypeChecker::RECORD_TYPE _recordType; + RecordMgr *_recordMgr; + int _freeListBlockSize; + bool _useFullBamTags; + bool _headerSet; + + //members for enforcing sorted order. + set<QuickString> _foundChroms; + QuickString _prevChrom; + int _prevStart; + + //members for handling merged records + DualQueue<Record *, DualQueueAscending > _storedRecords; + + bool _mustBeForward; + bool _mustBeReverse; + + //available stats after run + unsigned long _totalRecordLength; + unsigned long _totalMergedRecordLength; + + BlockMgr *_blockMgr; + + + void allocateFileReader(); + void testInputSortOrder(const Record *record); + void deleteAllMergedItemsButKey(RecordKeyList &recList); + void addToStorage(Record *record); + Record *tryToTakeFromStorage(); + Record *tryToTakeFromStorage(bool strand); + void testBufferedMgr() const { + if (_bufStreamMgr == NULL) { + cerr << "Error: attempted to access type information for file " << _context->getInputFileName(_contextFileIdx) << " before it was opened." << endl; + exit (1); + } + } + + + +}; + + +#endif /* FILERECORDMGR_H_ */ diff --git a/src/utils/FileRecordTools/Makefile b/src/utils/FileRecordTools/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..536ab1926db4e17cac4653be2f5f069c8afb3517 --- /dev/null +++ b/src/utils/FileRecordTools/Makefile @@ -0,0 +1,52 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/BamTools/include + +SUBDIRS = ./FileReaders \ + ./Records \ + + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= FileRecordMgr.cpp FileRecordMgr.h RecordOutputMgr.cpp RecordOutputMgr.h +OBJECTS= FileRecordMgr.o RecordOutputMgr.o +_EXT_OBJECTS=SingleLineDelimTextFileReader.o BamFileReader.o Bed3Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ + SingleLineDelimTransferBuffer.o FileRecordTypeChecker.o QuickString.o ParseTools.o RecordKeyList.o BufferedStreamMgr.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS) + @echo " * compiling FileRecordMgr.cpp" + @$(CXX) -c -o $(OBJ_DIR)/FileRecordMgr.o FileRecordMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + @echo " * compiling RecordOutputMgr.cpp" + @$(CXX) -c -o $(OBJ_DIR)/RecordOutputMgr.o RecordOutputMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + + + +.PHONY: $(SUBDIRS) +$(SUBDIRS): $(OBJ_DIR) + @echo "- Building in $@" + @$(MAKE) --no-print-directory --directory=$@ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/FileRecordMgr.o + @rm -f $(OBJ_DIR)/RecordMgr.o + @rm -f $(OBJ_DIR)/FileRecordTypeChecker.o + @rm -f $(OBJ_DIR)/SingleLineDelimTextFileReader.o + @rm -f $(OBJ_DIR)/SingleLineDelimTransferBuffer.o + @rm -f $(OBJ_DIR)/RecordOutputMgr.o + + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2ef905345ca1cdfd5558b3786225df2cb4853a08 --- /dev/null +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -0,0 +1,301 @@ +/* + * RecordOutputMgr.cpp + * + * Created on: May 28, 2013 + * Author: nek3d + */ + +#include "RecordOutputMgr.h" +#include "Context.h" + +#include "Bed3Interval.h" +#include "Bed4Interval.h" +#include "BedGraphInterval.h" +#include "Bed5Interval.h" +#include "Bed6Interval.h" +#include "BedPlusInterval.h" +#include "Bed12Interval.h" +#include "BamRecord.h" +#include "VcfRecord.h" +#include "GffRecord.h" + + + +#include <cstdio> + + +RecordOutputMgr::RecordOutputMgr() +: _context(NULL), + _printable(true), + _bamWriter(NULL), + _currBlockList(NULL) +{ + +} + +RecordOutputMgr::~RecordOutputMgr() +{ + flush(); + if (_bamWriter != NULL) { + _bamWriter->Close(); + delete _bamWriter; + _bamWriter = NULL; + } +} + +bool RecordOutputMgr::init(Context *context) { + _context = context; + if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + //set-up BAM writer. + _bamWriter = new BamTools::BamWriter(); + _bamWriter->SetCompressionMode(_context->getUncompressedBam() ? BamTools::BamWriter::Uncompressed : BamTools::BamWriter::Compressed); + + _bamWriter->Open("stdout", _context->getHeader(_context->getBamHeaderAndRefIdx()).c_str(), _context->getReferences(_context->getBamHeaderAndRefIdx())); + } else { + //for everything but BAM, we'll copy output to an output buffer before printing. + _outBuf.reserve(MAX_OUTBUF_SIZE); + } + if (_context->getAnyHit() || _context->getNoHit() || _context->getWriteCount()) { + _printable = false; + } + if (!_context->isValidState()) { + fprintf(stderr, "%s\n", context->getErrorMsg().c_str()); + exit(1); + } + if (_context->getPrintHeader()) { + _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); + } + return true; +} + +void RecordOutputMgr::printHeader(const string &header) +{ + _outBuf.append(header); +} + +bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { + printBamType bamCode = printBamRecord(keyList); + if (bamCode == BAM_AS_BAM) { + return true; + } else if (bamCode == NOT_BAM) { + keyList.getKey()->print(_outBuf); + return false; + } + //otherwise, it was BAM_AS_BED, and the key was printed. + return false; + +} + +RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &keyList, bool bamOutputOnly) +{ + const Record *record = keyList.getKey(); + if (record->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { + if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + _bamWriter->SaveAlignment(static_cast<const BamRecord *>(record)->getAlignment()); + return BAM_AS_BAM; + } else { + if (!bamOutputOnly) { + static_cast<const BamRecord *>(record)->print(_outBuf, _currBlockList); + } + return BAM_AS_BED; + } + } + return NOT_BAM; +} + +void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockList) +{ + if (needsFlush()) flush(); + + //The first time we print a record is when we print any header, because the header + //hasn't been read from the query file until after the first record has also been read. + if (_context->getPrintHeader()) { + _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); + _context->setPrintHeader(false); + } + + _currBlockList = blockList; + + if (_printable) { + if (keyList.empty()) { + if (_context->getWriteAllOverlap()) { + // -wao the user wants to force the reporting of 0 overlap + if (printKeyAndTerminate(keyList)) { + _currBlockList = NULL; + return; + } + tab(); + null(false, true); + tab(); + _outBuf.append('0'); + newline(); + } + else if (_context->getLeftJoin()) { + if (printKeyAndTerminate(keyList)) { + _currBlockList = NULL; + return; + } + tab(); + null(false, true); + newline(); + } + } else { + if (printBamRecord(keyList, true) == BAM_AS_BAM) { + _currBlockList = NULL; + return; + } + for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + reportOverlapDetail(keyList.getKey(), iter->value()); + } + } + } else { // not printable + reportOverlapSummary(keyList); + } + _currBlockList = NULL; +} + +void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord) +{ + //get the max start and min end as strings. + const QuickString &startStr = keyRecord->getStartPos() > hitRecord->getStartPos() ? keyRecord->getStartPosStr() : hitRecord->getStartPosStr(); + const QuickString &endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? keyRecord->getEndPosStr() : hitRecord->getEndPosStr(); + + int maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos()); + int minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos()); + + + if (!_context->getWriteA() && !_context->getWriteB() && !_context->getWriteOverlap() && !_context->getLeftJoin()) { + printKey(keyRecord, startStr, endStr); + newline(); + } + else if ((_context->getWriteA() && _context->getWriteB()) || _context->getLeftJoin()) { + printKey(keyRecord); + tab(); + hitRecord->print(_outBuf); + newline(); + } + else if (_context->getWriteA()) { + printKey(keyRecord); + newline(); + } + else if (_context->getWriteB()) { + printKey(keyRecord, startStr, endStr); + tab(); + hitRecord->print(_outBuf); + newline(); + } + else if (_context->getWriteOverlap()) { + int printOverlapBases = max(0, minEnd-maxStart); + printKey(keyRecord); + tab(); + hitRecord->print(_outBuf); + tab(); + int2str(printOverlapBases, _outBuf, true); + newline(); + } +} + +void RecordOutputMgr::reportOverlapSummary(RecordKeyList &keyList) +{ + int numOverlapsFound = (int)keyList.size(); + if (_context->getAnyHit() && numOverlapsFound > 0) { + if (printKeyAndTerminate(keyList)) { + return; + } + newline(); + } else if (_context->getWriteCount()) { + if (printKeyAndTerminate(keyList)) { + return; + } + tab(); + int2str(numOverlapsFound, _outBuf, true); + newline(); + } else if (_context->getNoHit() && numOverlapsFound == 0) { + if (printKeyAndTerminate(keyList)) { + return; + } + newline(); + } +} + + +void RecordOutputMgr::null(bool queryType, bool dbType) +{ + FileRecordTypeChecker::RECORD_TYPE recordType = FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; + if (queryType) { + recordType = _context->getQueryRecordType(); + } else if (dbType) { + recordType = _context->getDatabaseRecordType(); + } else { + return; //TBD? Implement printNull for records that are neither query nor database. + //Not sure if this would ever be necessary. + } + + //This is kind of a hack. Need an instance of the correct class of record in order to call it's printNull method. + Record *dummyRecord = NULL; + + switch (recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + dummyRecord = new Bed3Interval(); + break; + case FileRecordTypeChecker::BED4_RECORD_TYPE: + dummyRecord = new Bed4Interval(); + break; + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + dummyRecord = new BedGraphInterval(); + break; + case FileRecordTypeChecker::BED5_RECORD_TYPE: + dummyRecord = new Bed5Interval(); + break; + case FileRecordTypeChecker::BED6_RECORD_TYPE: + dummyRecord = new Bed6Interval(); + break; + case FileRecordTypeChecker::BED12_RECORD_TYPE: + dummyRecord = new Bed12Interval(); + break; + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + dummyRecord = new BedPlusInterval(); + (static_cast<BedPlusInterval *>(dummyRecord))->setNumPrintFields(_context->getMaxNumDatabaseFields()); + break; + case FileRecordTypeChecker::VCF_RECORD_TYPE: + dummyRecord = new VcfRecord(); + (static_cast<VcfRecord *>(dummyRecord))->setNumPrintFields(_context->getMaxNumDatabaseFields()); + break; + case FileRecordTypeChecker::BAM_RECORD_TYPE: + dummyRecord = new BamRecord(); + break; + case FileRecordTypeChecker::GFF_RECORD_TYPE: + dummyRecord = new GffRecord(); + (static_cast<GffRecord *>(dummyRecord))->setNumFields(_context->getMaxNumDatabaseFields()); + break; + default: + break; + } + + dummyRecord->printNull(_outBuf); + delete dummyRecord; + +} + +void RecordOutputMgr::printKey(const Record *key, const QuickString & start, const QuickString & end) +{ + if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { + key->print(_outBuf, start, end); + } else { + static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBlockList); + } +} + +void RecordOutputMgr::printKey(const Record *key) +{ + if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { + key->print(_outBuf); + } else { + static_cast<const BamRecord *>(key)->print(_outBuf, _currBlockList); + } +} + +void RecordOutputMgr::flush() { + fwrite(_outBuf.c_str(), 1, _outBuf.size(), stdout); + _outBuf.clear(); +} diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..b8f766305a3e18088d4412e7365787306c124e3e --- /dev/null +++ b/src/utils/FileRecordTools/RecordOutputMgr.h @@ -0,0 +1,56 @@ +/* + * RecordOutputMgr.h + * + * Created on: May 28, 2013 + * Author: nek3d + */ + +#ifndef RECORDOUTPUTMGR_H_ +#define RECORDOUTPUTMGR_H_ + +using namespace std; + +#include "RecordKeyList.h" +#include "api/BamWriter.h" + +class Context; + + +class RecordOutputMgr { +public: + RecordOutputMgr(); + ~RecordOutputMgr(); + + //The init method must be called after all the input files are open. + bool init(Context *context); + + typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; + + void printHeader(const string &); + bool printKeyAndTerminate(RecordKeyList &keyList); + printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); + void printRecord(RecordKeyList &keyList, RecordKeyList *blockList = NULL); + void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord); + void reportOverlapSummary(RecordKeyList &keyList); + +private: + Context *_context; + bool _printable; + BamTools::BamWriter *_bamWriter; + RecordKeyList *_currBlockList; + + QuickString _outBuf; + + //some helper functions to neaten the code. + void tab() { _outBuf.append('\t'); } + void newline() { _outBuf.append('\n'); } + void null(bool queryType, bool dbType); + + void printKey(const Record *key); + void printKey(const Record *key, const QuickString & start, const QuickString & end); + static const unsigned int MAX_OUTBUF_SIZE = 33554432; //32 Mb + bool needsFlush() const { return _outBuf.size() >= MAX_OUTBUF_SIZE *.9; } + void flush(); +}; + +#endif /* RECORDOUTPUTMGR_H_ */ diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp new file mode 100644 index 0000000000000000000000000000000000000000..94c1fcd9b213e8ba515561378bf14781a68b96f2 --- /dev/null +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -0,0 +1,109 @@ +#include "BamRecord.h" +#include "BamFileReader.h" +#include "RecordKeyList.h" + +BamRecord::BamRecord() +{ + +} + +BamRecord::~BamRecord() +{ + +} + +const BamRecord &BamRecord::operator=(const BamRecord &other) +{ + Bed6Interval::operator=(other); + _bamAlignment = other._bamAlignment; + return *this; +} + + +bool BamRecord::initFromFile(FileReader *fileReader) +{ + BamFileReader *bamFileReader = static_cast<BamFileReader*>(fileReader); + return initFromFile(bamFileReader); + +} + +bool BamRecord::initFromFile(BamFileReader *bamFileReader) +{ + bamFileReader->getChrName(_chrName); + + _chrId = bamFileReader->getCurrChromdId(); + _startPos = bamFileReader->getStartPos(); + int2str(_startPos, _startPosStr); + _endPos = bamFileReader->getEndPos(); + int2str(_endPos, _endPosStr); + bamFileReader->getName(_name); + bamFileReader->getScore(_score); + char strandChar = bamFileReader->getStrand(); + setStrand(strandChar); + + _bamAlignment = bamFileReader->getAlignment(); + return true; +} + +void BamRecord::clear() +{ + Bed6Interval::clear(); + + //For now, we're going to not clear the BamAlignment object, as all of its + //fields will be reset next time it is used anyway. If testing shows this to be a + //problem, we'll revisit. +} + +void BamRecord::print(QuickString &outBuf, RecordKeyList *keyList) const +{ + Bed6Interval::print(outBuf); + printRemainingBamFields(outBuf, keyList); +} + +void BamRecord::print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const +{ + Bed6Interval::print(outBuf, start, end); + printRemainingBamFields(outBuf, keyList); +} + +void BamRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const +{ + Bed6Interval::print(outBuf, start, end); + printRemainingBamFields(outBuf, keyList); +} + +void BamRecord::printNull(QuickString &outBuf) const +{ + Bed6Interval::printNull(outBuf); + outBuf.append("\t.\t.\t.\t.\t.\t.", 12); +} + +void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const +{ + outBuf.append('\t'); + outBuf.append(_bamAlignment.Position); + outBuf.append('\t'); + outBuf.append(_endPos); + outBuf.append("\t0,0,0", 6); + outBuf.append('\t'); + outBuf.append((int)keyList->size()); + + vector<int> blockLengths; + vector<int> blockStarts; + for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { + const Record *block = iter->value(); + blockLengths.push_back(block->getEndPos() - block->getStartPos()); + blockStarts.push_back(block->getStartPos() - _bamAlignment.Position); + } + + outBuf.append('\t'); + for (int i=0; i < (int)blockLengths.size(); i++) { + outBuf.append(blockLengths[i]); + outBuf.append(','); + } + outBuf.append('\t'); + for (int i=0; i < (int)blockStarts.size(); i++) { + outBuf.append( blockStarts[i]); + outBuf.append(','); + } +} diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h new file mode 100644 index 0000000000000000000000000000000000000000..b412b96e61c656107dc81108a2a282aa4c30ae3b --- /dev/null +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -0,0 +1,50 @@ +/* + * BamRecord.h + * + * Created on: Dec 4, 2012 + * Author: nek3d + */ + +#ifndef BAMRECORD_H_ +#define BAMRECORD_H_ + +#include "Bed6Interval.h" +#include "ParseTools.h" +#include "QuickString.h" +#include "api/BamAlignment.h" + +class FileReader; +class BamFileReader; +class RecordKeyList; + +class BamRecord : public Bed6Interval { +public: + friend class FreeList<BamRecord>; + + BamRecord(); + virtual const BamRecord &operator=(const BamRecord &); + bool initFromFile(FileReader *); + virtual bool initFromFile(BamFileReader *); + virtual void clear(); + virtual void print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const; + virtual void print(QuickString &outBuf, RecordKeyList *keyList) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const; + virtual void printNull(QuickString &outBuf) const; + void printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const; + + + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BAM_RECORD_TYPE; } + + const BamTools::BamAlignment &getAlignment() const { return _bamAlignment; } + +protected: + virtual ~BamRecord(); + void printRemainingBamFields(); + + + BamTools::BamAlignment _bamAlignment; + +}; + + +#endif /* BAMRECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.cpp b/src/utils/FileRecordTools/Records/Bed12Interval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..648dcccda2d73dac78482719a0536c570feb9cab --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed12Interval.cpp @@ -0,0 +1,122 @@ +#include "Bed12Interval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstdlib> + +Bed12Interval::Bed12Interval() +:_thickStart(-1), + _thickEnd(-1), + _blockCount(-1) + +{ +} + +Bed12Interval::~Bed12Interval() +{ +} +const Bed12Interval &Bed12Interval::operator=(const Bed12Interval &other) { + Bed6Interval::operator=(other); + + _thickStart = other._thickStart; + _thickEnd = other._thickEnd; + _itemRGB = other._itemRGB; + _blockCount = other._blockCount; + _blockSizes = other._blockSizes; + _blockStarts = other._blockStarts; + + return *this; +} + +bool Bed12Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + bool baseRetFlag = Bed6Interval::initFromFile(fileReader); + + fileReader->getField(6, _thickStartStr); + fileReader->getField(7, _thickEndStr); + fileReader->getField(8, _itemRGB); + fileReader->getField(9, _blockCountStr); + fileReader->getField(10, _blockSizes); + fileReader->getField(11, _blockStarts); + + _thickStart = str2chrPos(_thickStartStr); + _thickEnd = str2chrPos(_thickEndStr); + _blockCount = str2chrPos(_blockCountStr); + return baseRetFlag; +} + +void Bed12Interval::clear() { + Bed6Interval::clear(); + + _thickStart = -1; + _thickEnd = -1; + _itemRGB.clear(); + _blockCount = -1; + _blockSizes.clear(); + _blockStarts.clear(); + _thickStartStr.clear(); + _thickEndStr.clear(); + _blockCountStr.clear(); + +} + +void Bed12Interval::print(QuickString &outBuf) const +{ + Bed6Interval::print(outBuf); + + outBuf.append('\t'); + outBuf.append(_thickStartStr); + outBuf.append('\t'); + outBuf.append(_thickEndStr); + outBuf.append('\t'); + outBuf.append(_itemRGB); + outBuf.append('\t'); + outBuf.append(_blockCountStr); + outBuf.append('\t'); + outBuf.append(_blockSizes); + outBuf.append('\t'); + outBuf.append(_blockStarts); +} + +void Bed12Interval::print(QuickString &outBuf, int start, int end) const +{ + Bed6Interval::print(outBuf, start, end); + + outBuf.append('\t'); + outBuf.append(_thickStartStr); + outBuf.append('\t'); + outBuf.append(_thickEndStr); + outBuf.append('\t'); + outBuf.append(_itemRGB); + outBuf.append('\t'); + outBuf.append(_blockCountStr); + outBuf.append('\t'); + outBuf.append(_blockSizes); + outBuf.append('\t'); + outBuf.append(_blockStarts); +} + +void Bed12Interval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed6Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_thickStartStr); + outBuf.append('\t'); + outBuf.append(_thickEndStr); + outBuf.append('\t'); + outBuf.append(_itemRGB); + outBuf.append('\t'); + outBuf.append(_blockCountStr); + outBuf.append('\t'); + outBuf.append(_blockSizes); + outBuf.append('\t'); + outBuf.append(_blockStarts); +} + + +void Bed12Interval::printNull(QuickString &outBuf) const +{ + Bed6Interval::printNull(outBuf); + + outBuf.append("\t.\t.\t.\t.\t.\t.", 12); +} + + diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.h b/src/utils/FileRecordTools/Records/Bed12Interval.h new file mode 100644 index 0000000000000000000000000000000000000000..a902b4f4472503d3ece54509dafa7ad30b60ea35 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed12Interval.h @@ -0,0 +1,74 @@ +/* + * Bed12Interval.h + * + * Created on: Nov 28, 2012 + * Author: nek3d + */ + +#ifndef BED12INTERVAL_H_ +#define BED12INTERVAL_H_ + +#include "Bed6Interval.h" + +class SingleLineDelimTextFileReader; + + +class Bed12Interval : public Bed6Interval { +public: + friend class FreeList<Bed12Interval>; + + Bed12Interval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void clear(); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED12_RECORD_TYPE; } + + const Bed12Interval &operator=(const Bed12Interval &other); + + virtual int getThickStart() const { return _thickStart; } + virtual void setThickStart(int thickStart) { _thickStart = thickStart; } + + virtual int getThickEnd() const { return _thickEnd; } + virtual void setThickEnd(int thickEnd) { _thickEnd = thickEnd; } + + virtual const QuickString & getItemRGB() const { return _itemRGB; } + virtual void setItemRGB(const QuickString & rgb) { _itemRGB = rgb; } + virtual void setItemRGB(const string & rgb) { _itemRGB = rgb; } + virtual void setItemRGB(const char *rgb) { _itemRGB = rgb; } + + virtual int getBlockCount() const { return _blockCount; } + virtual void setBlockCount(int blockCount) { _blockCount = blockCount; } + + virtual const QuickString & getBlockSizes() const { return _blockSizes; } + virtual void setBlockSizes(const QuickString & blockSizes) { _blockSizes = blockSizes; } + virtual void setBlockSizes(const string & blockSizes) { _blockSizes = blockSizes; } + virtual void setBlockSizes(const char *blockSizes) { _blockSizes = blockSizes; } + + virtual const QuickString & getBlockStarts() const { return _blockStarts; } + virtual void setBlockStarts(const QuickString & blockStarts) { _blockStarts = blockStarts; } + virtual void setBlockStarts(const string & blockStarts) { _blockStarts = blockStarts; } + virtual void setBlockStarts(const char *blockStarts) { _blockStarts = blockStarts; } + + +protected: + virtual ~Bed12Interval(); + + int _thickStart; + int _thickEnd; + QuickString _itemRGB; + int _blockCount; + QuickString _blockSizes; + QuickString _blockStarts; + + //store int values as their originial strings for faster outputting. + QuickString _thickStartStr; + QuickString _thickEndStr; + QuickString _blockCountStr; +}; + + + +#endif /* BED12INTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e285dc809f0094686cf2de28cd077c96954c88e --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -0,0 +1,64 @@ + +#include "Bed3Interval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstdio> +#include <cstdlib> + +Bed3Interval::Bed3Interval() +{ + +} + +Bed3Interval::~Bed3Interval() +{ + +} + +bool Bed3Interval::initFromFile(FileReader *fileReader) +{ + SingleLineDelimTextFileReader *sldFileReader = static_cast<SingleLineDelimTextFileReader*>(fileReader); + return initFromFile(sldFileReader); +} + + +bool Bed3Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + fileReader->getField(0, _chrName); + _chrId = fileReader->getCurrChromdId(); + fileReader->getField(1, _startPosStr); + fileReader->getField(2, _endPosStr); + _startPos = str2chrPos(_startPosStr); + _endPos = str2chrPos(_endPosStr); + return true; +} + +void Bed3Interval::print(QuickString &outBuf) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_startPos); + outBuf.append('\t'); + outBuf.append(_endPos); +} + +void Bed3Interval::print(QuickString &outBuf, int start, int end) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(start); + outBuf.append('\t'); + outBuf.append(end); +} + +void Bed3Interval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(start); + outBuf.append('\t'); + outBuf.append(end); +} + +void Bed3Interval::printNull(QuickString &outBuf) const { + outBuf.append(".\t-1\t-1", 7); +} diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h new file mode 100644 index 0000000000000000000000000000000000000000..a3015e0be90b12bedf4694a8ce6e7a3422032391 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed3Interval.h @@ -0,0 +1,40 @@ +/* + * Bed3Interval.h + * + * Created on: Nov 8, 2012 + * Author: nek3d + */ + +#ifndef BED3INTERVAL_H_ +#define BED3INTERVAL_H_ + +#include "Record.h" +#include "ParseTools.h" +#include "QuickString.h" + +class FileReader; +class SingleLineDelimTextFileReader; + + +class Bed3Interval : public Record { +public: + friend class FreeList<Bed3Interval>; + + Bed3Interval(); + bool initFromFile(FileReader *); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED3_RECORD_TYPE; } + +protected: + virtual ~Bed3Interval(); + + bool _skipFirstThreeFieldsWhenPrinting; + +}; + + +#endif /* BED3INTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eef359138455e71d6ac2f7cd233759fbfc8fab2a --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -0,0 +1,49 @@ +#include "Bed4Interval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstring> + +Bed4Interval::Bed4Interval() +{ + +} + +Bed4Interval::~Bed4Interval() +{ + +} + +bool Bed4Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + bool baseRetFlag = Bed3Interval::initFromFile(fileReader); + + fileReader->getField(3, _name); + return baseRetFlag; +} + +void Bed4Interval::print(QuickString &outBuf) const +{ + Bed3Interval::print(outBuf); + outBuf.append(_name); +} + +void Bed4Interval::print(QuickString &outBuf, int start, int end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); +} + +void Bed4Interval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); +} + +void Bed4Interval::printNull(QuickString &outBuf) const +{ + Bed3Interval::printNull(outBuf); + outBuf.append('\t'); + outBuf.append("\t.", 2); +} + diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h new file mode 100644 index 0000000000000000000000000000000000000000..4d9fda7acb574170f5aaa1b39db4efa654da5263 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed4Interval.h @@ -0,0 +1,34 @@ +/* + * Bed4Interval.h + * + * Created on: Jun 27, 2013 + * Author: nek3d + */ + +#ifndef BED4INTERVAL_H_ +#define BED4INTERVAL_H_ + + + +#include "Bed3Interval.h" + +class SingleLineDelimTextFileReader; + +class Bed4Interval : public Bed3Interval { +public: + friend class FreeList<Bed4Interval>; + + Bed4Interval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED4_RECORD_TYPE; } + +protected: + virtual ~Bed4Interval(); +}; + + +#endif /* BED4INTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8e643608a1633929a1141c0b21d24ec1c3a81309 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp @@ -0,0 +1,57 @@ +#include "Bed5Interval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstring> + +Bed5Interval::Bed5Interval() +{ + +} + +Bed5Interval::~Bed5Interval() +{ + +} + +bool Bed5Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + bool baseRetFlag = Bed3Interval::initFromFile(fileReader); + + fileReader->getField(3, _name); + fileReader->getField(4, _score); + return baseRetFlag; +} + +void Bed5Interval::print(QuickString &outBuf) const +{ + Bed3Interval::print(outBuf); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); +} + +void Bed5Interval::print(QuickString &outBuf, int start, int end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); +} + +void Bed5Interval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); +} + + +void Bed5Interval::printNull(QuickString &outBuf) const +{ + Bed3Interval::printNull(outBuf); + outBuf.append("\t.\t-1", 5); +} + diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h new file mode 100644 index 0000000000000000000000000000000000000000..01d6596684a9764969ee415578906755ad386d20 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed5Interval.h @@ -0,0 +1,32 @@ +/* + * Bed5Interval.h + * + * Created on: Nov 13, 2012 + * Author: nek3d + */ + +#ifndef BED5INTERVAL_H_ +#define BED5INTERVAL_H_ + + +#include "Bed3Interval.h" + +class SingleLineDelimTextFileReader; + +class Bed5Interval : public Bed3Interval { +public: + friend class FreeList<Bed5Interval>; + + Bed5Interval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED5_RECORD_TYPE; } + +protected: + virtual ~Bed5Interval(); +}; + +#endif /* BED5INTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0f9e7ea4183981acf281156e5dd85fe88edd10cd --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp @@ -0,0 +1,68 @@ +#include "Bed6Interval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstring> + +Bed6Interval::Bed6Interval() +{ + +} + +Bed6Interval::~Bed6Interval() +{ + +} + +bool Bed6Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + bool baseRetFlag = Bed3Interval::initFromFile(fileReader); + + fileReader->getField(3, _name); + fileReader->getField(4, _score); + char strandChar = 0; + fileReader->getField(5, strandChar); + setStrand(strandChar); + + return baseRetFlag; +} + +void Bed6Interval::print(QuickString &outBuf) const +{ + Bed3Interval::print(outBuf); + + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); + outBuf.append('\t'); + outBuf.append(getStrandChar()); +} + +void Bed6Interval::print(QuickString &outBuf, int start, int end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); + outBuf.append('\t'); + outBuf.append(getStrandChar()); +} + +void Bed6Interval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_score); + outBuf.append('\t'); + outBuf.append(getStrandChar()); +} + + +void Bed6Interval::printNull(QuickString &outBuf) const +{ + Bed3Interval::printNull(outBuf); + outBuf.append("\t.\t-1\t.", 7); +} + diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h new file mode 100644 index 0000000000000000000000000000000000000000..91a00e09f131a7d922515e40974c6f1f7f3dcea5 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Bed6Interval.h @@ -0,0 +1,32 @@ +/* + * Bed6Interval.h + * + * Created on: Nov 13, 2012 + * Author: nek3d + */ + +#ifndef BED6INTERVAL_H_ +#define BED6INTERVAL_H_ + + +#include "Bed3Interval.h" + +class SingleLineDelimTextFileReader; + +class Bed6Interval : public Bed3Interval { +public: + friend class FreeList<Bed6Interval>; + + Bed6Interval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED6_RECORD_TYPE; } + +protected: + virtual ~Bed6Interval(); +}; + +#endif /* BED6INTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e089782c1499956a6b94c25e070b133da40ecc8d --- /dev/null +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp @@ -0,0 +1,50 @@ +#include "BedGraphInterval.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstring> + +BedGraphInterval::BedGraphInterval() +{ + +} + +BedGraphInterval::~BedGraphInterval() +{ + +} + +bool BedGraphInterval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + bool baseRetFlag = Bed3Interval::initFromFile(fileReader); + + fileReader->getField(3, _name); + return baseRetFlag; +} + +void BedGraphInterval::print(QuickString &outBuf) const +{ + Bed3Interval::print(outBuf); + outBuf.append('\t'); + outBuf.append(_name); +} + +void BedGraphInterval::print(QuickString &outBuf, int start, int end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); +} + +void BedGraphInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed3Interval::print(outBuf, start, end); + outBuf.append('\t'); + outBuf.append(_name); +} + + +void BedGraphInterval::printNull(QuickString &outBuf) const +{ + Bed3Interval::printNull(outBuf); + outBuf.append("\t.", 2); +} + diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h new file mode 100644 index 0000000000000000000000000000000000000000..f3d612e899981a895e94a0518c098144f05fcca4 --- /dev/null +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h @@ -0,0 +1,32 @@ +/* + * Bed6Interval.h + * + * Created on: Nov 13, 2012 + * Author: nek3d + */ + +#ifndef BEDGRAPHINTERVAL_H_ +#define BEDGRAPHINTERVAL_H_ + + +#include "Bed3Interval.h" + +class SingleLineDelimTextFileReader; + +class BedGraphInterval : public Bed3Interval { +public: + friend class FreeList<BedGraphInterval>; + + BedGraphInterval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE; } + +protected: + virtual ~BedGraphInterval(); +}; + +#endif /* BEDGRAPHINTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f062075361957ba655a84218ecc69b183697d724 --- /dev/null +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp @@ -0,0 +1,145 @@ +#include "BedPlusInterval.h" +#include "SingleLineDelimTextFileReader.h" + +BedPlusInterval::BedPlusInterval() +: _numPrintFields(0) +{ +} + +BedPlusInterval::~BedPlusInterval() +{ + for (int i=0; i < (int)_otherIdxs.size(); i++) { + delete _otherIdxs[i]; + } +} + +const BedPlusInterval &BedPlusInterval::operator=(const BedPlusInterval &other) { + Bed6Interval::operator=(other); + + int otherSize = other._otherIdxs.size(); + int mySize = _otherIdxs.size(); + + _numPrintFields = other._numPrintFields; + int numMatchingFields = min(mySize, otherSize); + for (int i=0; i < numMatchingFields; i++) { + (*(_otherIdxs[i])) = (*(other._otherIdxs[i])); + } + if (mySize < otherSize) { + for (int i = mySize; i < otherSize; i++) { + QuickString *pqs = new QuickString(*(other._otherIdxs[i])); + _otherIdxs.push_back(pqs); + } + } else if (mySize > otherSize) { + for (int i= otherSize; i < mySize; i++) { + delete _otherIdxs[i]; + } + _otherIdxs.resize(otherSize); + } + return *this; +} + +bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + return (Bed6Interval::initFromFile(fileReader) && initOtherFieldsFromFile(fileReader)); +} + +bool BedPlusInterval::initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader) +{ + int numFields = fileReader->getNumFields() - startOtherIdx; + + if ((int)_otherIdxs.size() != numFields) { + if ((int)_otherIdxs.size() > 0) { + return false; //file had a number of fields not matching what was expected. + } + for (int i=0; i < numFields; i++) { + _otherIdxs.push_back(new QuickString()); + } + } + + for (int i=0; i < numFields; i++) { + fileReader->getField(i + startOtherIdx, (*(_otherIdxs[i]))); + } + return true; +} + +void BedPlusInterval::clear() { + Bed6Interval::clear(); + _numPrintFields = 0; + for (int i=0; i < (int)_otherIdxs.size(); i++) { + _otherIdxs[i]->clear(); + } +} + +void BedPlusInterval::print(QuickString &outBuf) const +{ + Bed6Interval::print(outBuf); + + for (int i=0; i < (int)_otherIdxs.size(); i++) { + outBuf.append('\t'); + outBuf.append(*(_otherIdxs[i])); + } +} + +void BedPlusInterval::print(QuickString &outBuf, int start, int end) const +{ + Bed6Interval::print(outBuf, start, end); + for (int i=0; i < (int)_otherIdxs.size(); i++) { + outBuf.append('\t'); + outBuf.append(*(_otherIdxs[i])); + } +} + +void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + Bed6Interval::print(outBuf, start, end); + for (int i=0; i < (int)_otherIdxs.size(); i++) { + outBuf.append('\t'); + outBuf.append(*(_otherIdxs[i])); + } +} + + +void BedPlusInterval::printNull(QuickString &outBuf) const +{ + Bed6Interval::printNull(outBuf); + for (int i=startOtherIdx; i < _numPrintFields; i++) { + outBuf.append("\t."); + } + +} + +QuickString BedPlusInterval::getField(int fieldNum) const +{ + //a request for any of the first six fields will retrieve + //chrom, start, end, name, score, and strand, in that order. + //A request for field 6+ will go to the otherIdxs. + + QuickString buffer; + + switch (fieldNum) { + case 0: + return _chrName; + break; //redundant after a return, but good practice anyway. + case 1: + int2str(_startPos, buffer); + return buffer; + break; + case 2: + int2str(_endPos, buffer); + return buffer; + break; + case 3: + return _name; + break; + case 4: + return _score; + break; + case 5: + buffer.append(getStrandChar()); + return buffer; + break; + default: + return (*(_otherIdxs[fieldNum - startOtherIdx])); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h new file mode 100644 index 0000000000000000000000000000000000000000..4ebd3b67a88fa757e833e95bfddd18a897bcff8d --- /dev/null +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h @@ -0,0 +1,52 @@ +/* + * BedPlusInterval.h + * + * Created on: Nov 13, 2012 + * Author: nek3d + */ + +#ifndef BEDPLUSINTERVAL_H_ +#define BEDPLUSINTERVAL_H_ + +#include "Bed6Interval.h" +#include <vector> + +class SingleLineDelimTextFileReader; + +class BedPlusInterval : public Bed6Interval { +public: + friend class FreeList<BedPlusInterval>; + + BedPlusInterval(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void clear(); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; } + + //Note: using the assignment operator in a BedPlusInterval can potentially be a performance hit, + //if the number of fields frequently differ between this object and the one being copied. + const BedPlusInterval &operator=(const BedPlusInterval &other); + + virtual QuickString getField(int fieldNum) const; + virtual void setField(int fieldNum, const QuickString &str) { (*(_otherIdxs[fieldNum])) = str; } + virtual void setField(int fieldNum, const string &str) { (*(_otherIdxs[fieldNum])) = str; } + virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; } + virtual void setNumPrintFields(int num) { _numPrintFields = num; } + virtual int getNumPrintFields() const { return _numPrintFields; } + +protected: + virtual ~BedPlusInterval(); + bool initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader); + + + vector<QuickString *> _otherIdxs; + static const int startOtherIdx = 6; //first six fields have names, and are not stored in otherIdxs. + int _numPrintFields; +}; + + + +#endif /* BEDPLUSINTERVAL_H_ */ diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6182fcd7f4fac3c32b9b01713f1af8572b1fe561 --- /dev/null +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -0,0 +1,209 @@ +/* + * BlockMgr.cpp + * + * Created on: May 14, 2013 + * Author: nek3d + */ + +#include "BlockMgr.h" +#include "RecordMgr.h" +#include "Context.h" +#include "Bed12Interval.h" +#include "BamRecord.h" +#include "ParseTools.h" +#include "api/BamAlignment.h" +#include "api/BamAux.h" + +BlockMgr::BlockMgr() +: _blockRecordsMgr(NULL), + _breakOnDeletionOps(false), + _breakOnSkipOps(true) +{ + _blockRecordsMgr = new RecordMgr(_blockRecordsType); +} + + + +BlockMgr::~BlockMgr() +{ + delete _blockRecordsMgr; +} + +void BlockMgr::getBlocks(RecordKeyList &keyList, bool &mustDelete) +{ + switch (keyList.getKey()->getType()) { + case FileRecordTypeChecker::BED12_RECORD_TYPE: + getBlocksFromBed12(keyList, mustDelete); + break; //not necessary after return, but here in case code is later modified. + + case FileRecordTypeChecker::BAM_RECORD_TYPE: + getBlocksFromBam(keyList, mustDelete); + break; + + default: + keyList.push_back(keyList.getKey()); + mustDelete = false; + break; + } +} + + + +void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) +{ + const Bed12Interval *keyRecord = static_cast<const Bed12Interval *>(keyList.getKey()); + int blockCount = keyRecord->getBlockCount(); + + if ( blockCount <= 0 ) { + mustDelete = false; + return; + } + + vector<QuickString> sizes; + vector<QuickString> starts; + + int sizeCount = Tokenize(keyRecord->getBlockSizes(), sizes, ',', blockCount); + int startCount = Tokenize(keyRecord->getBlockStarts(), starts, ',', blockCount); + + if (blockCount != sizeCount || sizeCount != startCount) { + fprintf(stderr, "Error: found wrong block counts while splitting entry.\n"); + exit(-1); + } + + for (int i=0; i < blockCount; i++) { + int startPos = keyRecord->getStartPos() + str2chrPos(starts[i].c_str()); + int endPos = startPos + str2chrPos(sizes[i].c_str()); + + const Record *record = allocateAndAssignRecord(keyRecord, startPos, endPos); + keyList.push_back(record); + } + mustDelete = true; +} + +void BlockMgr::getBlocksFromBam(RecordKeyList &keyList, bool &mustDelete) +{ + const BamRecord *keyRecord = static_cast<const BamRecord *>(keyList.getKey()); + const vector<BamTools::CigarOp> &cigarData = keyRecord->getAlignment().CigarData; + + int currPos = keyRecord->getStartPos(); + int blockLength = 0; + + for (int i=0; i < (int)cigarData.size(); i++) { + char opType = cigarData[i].Type; + int opLen = (int)(cigarData[i].Length); + + switch(opType) { + case 'I': + case 'S': + case 'P': + case 'H': + break; + case 'M': + blockLength += opLen; + break; + case 'D': + case 'N' : + if ((opType == 'D' && !_breakOnDeletionOps) || + (opType == 'N' && !_breakOnSkipOps)) { + blockLength += opLen; + } else { + keyList.push_back(allocateAndAssignRecord(keyRecord, currPos, currPos + blockLength)); + currPos += opLen + blockLength; + blockLength = 0; + } + break; + default: + fprintf(stderr, "ERROR: Found invalid Cigar operation: %c.\n", opType); + exit(1); + break; + } + } + if (blockLength > 0) { + keyList.push_back(allocateAndAssignRecord(keyRecord, currPos, currPos + blockLength)); + } + mustDelete = true; +} + +Record *BlockMgr::allocateAndAssignRecord(const Record *keyRecord, int startPos, int endPos) +{ + Record *record = _blockRecordsMgr->allocateRecord(); + record->setChrName(keyRecord->getChrName()); + record->setChromId(keyRecord->getChromId()); + record->setStartPos(startPos); + record->setEndPos(endPos); + record->setName(keyRecord->getName()); + record->setScore(keyRecord->getScore()); + record->setStrand(keyRecord->getStrand()); + + return record; +} + +int BlockMgr::getTotalBlockLength(RecordKeyList &keyList) { + int sum = 0; + for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + const Record *record = iter->value(); + sum += record->getEndPos() - record->getStartPos(); + } + return sum; +} + +void BlockMgr::deleteBlocks(RecordKeyList &keyList) +{ + for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + _blockRecordsMgr->deleteRecord(iter->value()); + } + keyList.clearList(); + +} + + +int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList) +{ + bool deleteKeyBlocks = false; + if (keyList.empty()) { + getBlocks(keyList, deleteKeyBlocks); + } + int keyBlocksSumLength = getTotalBlockLength(keyList); + for (RecordKeyList::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { + RecordKeyList hitBlocks(hitListIter->value()); + bool deleteHitBlocks = false; + getBlocks(hitBlocks, deleteHitBlocks); + int hitBlockSumLength = getTotalBlockLength(hitBlocks); + int totalHitOverlap = 0; + bool hitHasOverlap = false; + for (RecordKeyList::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { + for (RecordKeyList::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { + const Record *keyBlock = keyListIter->value(); + const Record *hitBlock = hitBlockIter->value(); + + int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); + int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); + int overlap = minEnd - maxStart; + if (overlap > 0) { + hitHasOverlap = true; + totalHitOverlap += overlap; + } + + } + } + if (hitHasOverlap) { + if ((float) totalHitOverlap / (float)keyBlocksSumLength > _context->getOverlapFraction()) { + if (_context->getReciprocal() && + ((float)totalHitOverlap / (float)hitBlockSumLength > _context->getOverlapFraction())) { + resultList.push_back(hitListIter->value()); + } else if (!_context->getReciprocal()) { + resultList.push_back(hitListIter->value()); + } + } + } + if (deleteHitBlocks) { + deleteBlocks(hitBlocks); + } + } + if (deleteKeyBlocks) { + deleteBlocks(keyList); + } + resultList.setKey(keyList.getKey()); + return (int)resultList.size(); +} + diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..c2aab25f75d56639a8965db2ee934cbbae36586e --- /dev/null +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -0,0 +1,62 @@ +/* + * BlockMgr.h + * + * Created on: May 14, 2013 + * Author: nek3d + */ + +#ifndef BLOCKMGR_H_ +#define BLOCKMGR_H_ + +//This class handles blocks inside of a larger record, such as BED12 and BAM records. +//Produce and manage seperate records for the sub-intervals inside the + +using namespace std; + +#include "FileRecordTypeChecker.h" +#include "RecordKeyList.h" + +class Context; +class RecordMgr; + +class BlockMgr { +public: + BlockMgr(); + ~BlockMgr(); + void setContext(Context *context) { _context = context; } + + // Return value is the number of blocks this main record has been split into. + void getBlocks(RecordKeyList &keyList, bool &mustDelete); + void deleteBlocks(RecordKeyList &keyList); + + // Get the sum of the lengths of all blocks for a record. + int getTotalBlockLength(RecordKeyList &keyList); + + // Determine which hits in the hitList intersect the hits in the keyList by comparing all blocks in each + // and checking that their total intersection meets any overlapFraction and reciprocal criteria compared to + // the total block lengths of the hitList and keyList. All hits that pass will be in the resultList. + // Return value is the number of hits in the result set. + int findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList); + + //these are setting options for splitting BAM records + void setBreakOnDeletionOps(bool val) { _breakOnDeletionOps = val; } + void setBreakOnSkipOps(bool val) { _breakOnSkipOps = val; } + +private: + RecordMgr *_blockRecordsMgr; + Context *_context; + bool _breakOnDeletionOps; + bool _breakOnSkipOps; + + // For now, all records will be split into Bed6 records. + const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; + void getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete); + void getBlocksFromBam(RecordKeyList &keyList, bool &mustDelete); + + Record *allocateAndAssignRecord(const Record *keyRecord, int startPos, int endPos); + + +}; + + +#endif /* BLOCKMGR_H_ */ diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5a6cc6fa5459f4e7aa9be7632399d994fddf26c7 --- /dev/null +++ b/src/utils/FileRecordTools/Records/GffRecord.cpp @@ -0,0 +1,124 @@ +#include "GffRecord.h" +#include "SingleLineDelimTextFileReader.h" +#include <cstring> + +GffRecord::GffRecord() +{ + +} + +GffRecord::~GffRecord() +{ + +} + +void GffRecord::clear() +{ + Bed6Interval::clear(); + _source.clear(); + _frame.clear(); + _group.clear(); + _numFields = 8; +} + +bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + fileReader->getField(0, _chrName); + _chrId = fileReader->getCurrChromdId(); + fileReader->getField(3, _startPosStr); + _startPos = str2chrPos(_startPosStr); + _startPos--; // VCF is one-based. Here we intentionally don't decrement the string version, + //because we'll still want to output the one-based number in the print methods, even though + //internally we decrement the integer to comply with the 0-based format common to other records. + fileReader->getField(4, _endPosStr); + //endPos is just the startPos plus the length of the variant + _endPos = str2chrPos(_endPosStr); + + fileReader->getField(2, _name); + fileReader->getField(1, _source); + fileReader->getField(5, _score); + + //GFF allows a '.' for the strandChar, signifying it is not known. + char strandChar = 0; + fileReader->getField(6, strandChar); + setStrand(strandChar); + + fileReader->getField(7, _frame); + _numFields = fileReader->getNumFields(); + if (_numFields == 9) { + fileReader->getField(8, _group); + } + + + return true; +} + +void GffRecord::print(QuickString &outBuf) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_source); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_startPosStr); + outBuf.append('\t'); + outBuf.append(_endPosStr); + outBuf.append('\t'); + + printRemainingFields(outBuf); +} + +void GffRecord::print(QuickString &outBuf, int start, int end) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_source); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(start +1); + outBuf.append('\t'); + outBuf.append(end); + outBuf.append('\t'); + + printRemainingFields(outBuf); +} + +void GffRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const +{ + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_source); + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(start); + outBuf.append('\t'); + outBuf.append(end); + outBuf.append('\t'); + + printRemainingFields(outBuf); +} + +void GffRecord::printRemainingFields(QuickString &outBuf) const +{ + outBuf.append(_score); + outBuf.append('\t'); + outBuf.append(getStrandChar()); + outBuf.append('\t'); + outBuf.append(_frame); + if (_numFields == 9) { + outBuf.append('\t'); + outBuf.append(_group); + } +} + +void GffRecord::printNull(QuickString &outBuf) const +{ + outBuf.append(".\t.\t.\t-1\t-1\t.\t.\t.", 17); + if (_numFields > 8) { + outBuf.append("\t.", 2); + } +} + diff --git a/src/utils/FileRecordTools/Records/GffRecord.h b/src/utils/FileRecordTools/Records/GffRecord.h new file mode 100644 index 0000000000000000000000000000000000000000..195d4beaf5453f946922b2d5ad3ba60d6da8aec5 --- /dev/null +++ b/src/utils/FileRecordTools/Records/GffRecord.h @@ -0,0 +1,50 @@ +/* + * GffRecord.h + * + * Created on: Nov 13, 2012 + * Author: nek3d + */ + +#ifndef GFF_RECORD_H_ +#define GFF_RECORD_H_ + +#include "Bed6Interval.h" + +class SingleLineDelimTextFileReader; + +class GffRecord : public Bed6Interval { +public: + friend class FreeList<GffRecord>; + + GffRecord(); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void clear(); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; } + virtual const QuickString &getSource() const { return _source; } + virtual const QuickString &getFrame() const { return _frame; } + virtual const QuickString &getGroup() const { return _group; } + virtual const int getNumFields() const { return _numFields; } + virtual void setNumFields(int val) { _numFields = val; } + + //Note: using the assignment operator in a GffRecord can potentially be a performance hit, + //if the number of fields frequently differ between this object and the one being copied. + const GffRecord &operator=(const GffRecord &other); + +protected: + virtual ~GffRecord(); + void printRemainingFields(QuickString &outbuf) const; + + int _numFields; + QuickString _source; + QuickString _frame; + QuickString _group; + +}; + + + +#endif /* GFF_RECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..757739c7d5a8ca595c8240bb4c6ecb74a7e342d3 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Makefile @@ -0,0 +1,42 @@ +OBJ_DIR = ../../../../obj/ +BIN_DIR = ../../../../bin/ +UTILITIES_DIR = ../../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= RecordMgr.cpp RecordMgr.h Record.h Record.cpp Bed3Interval.h Bed3Interval.cpp \ + Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \ + Bed6Interval.h Bed6Interval.cpp \ + BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \ + GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp +OBJECTS= RecordMgr.o Record.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ + VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o +_EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ + $(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \ + $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f5351636aec2107915d9e89627d235bf570ef23 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -0,0 +1,179 @@ + +#include "Record.h" +#include <cstdio> + +Record::Record() +: _chrId(-1), + _startPos(-1), + _endPos(-1), + _strand(UNKNOWN) +{ +} + +Record::~Record() { +} + +const Record &Record::operator=(const Record &other) +{ + _chrName = other._chrName; + _chrId = other._chrId; + _startPos = other._startPos; + _endPos = other._endPos; + _strand = other._strand; + _name = other._name; + return *this; +} + +void Record::clear() { + _chrName.clear(); + _chrId = -1; + _startPos = -1; + _endPos = -1; + _name.clear(); + _score.clear(); + _strand = UNKNOWN; + _startPosStr.clear(); + _endPosStr.clear(); +} + +void Record::setStrand(char val) +{ + switch (val) { + case '+': + _strand = FORWARD; + break; + case '-': + _strand = REVERSE; + break; + default: + _strand = UNKNOWN; + break; + } +} + +char Record::getStrandChar() const +{ + switch (_strand) { + case FORWARD: + return '+'; + break; + case REVERSE: + return '-'; + break; + case UNKNOWN: + default: + return '.'; + } +// return '.'; +} + +bool Record::operator < (const Record &other) const +{ + + if (!sameChrom(&other)) { + return chromBefore(&other); + } + if (_startPos != other._startPos) { + return _startPos < other._startPos; + } + if (_endPos != other._endPos) { + return _endPos < other._endPos; + } + return false; +} + +bool Record::operator > (const Record &other) const { + if (!sameChrom(&other)) { + return chromAfter(&other); + } + if (_startPos != other._startPos) { + return _startPos > other._startPos; + } + if (_endPos != other._endPos) { + return _endPos > other._endPos; + } + return false; +} + +bool Record::sameChrom(const Record *other) const { + return (_chrId == -1 || other->_chrId == -1) ? ( _chrName == other->_chrName) : (_chrId == other->_chrId); +} + +bool Record::chromBefore(const Record *other) const +{ + return (_chrId == -1 || other->_chrId == -1) ? ( _chrName < other->_chrName) : (_chrId < other->_chrId); +} + +bool Record::chromAfter(const Record *other) const +{ + return (_chrId == -1 || other->_chrId == -1) ? ( _chrName > other->_chrName) : (_chrId > other->_chrId); +} + + + +bool Record::after(const Record *other) const +{ + return (_chrId == other->_chrId && _startPos >= other->_endPos); +} + +bool Record::intersects(const Record *record, + bool wantSameStrand, bool wantDiffStrand, float overlapFraction, bool reciprocal) const +{ + //must be on same chromosome + if (!sameChrom(record)) { + return false; + } + return sameChromIntersects(record, wantSameStrand, wantDiffStrand, overlapFraction, reciprocal); +} + +bool Record::sameChromIntersects(const Record *record, + bool wantSameStrand, bool wantDiffStrand, float overlapFraction, bool reciprocal) const +{ + + //If user requested hits only on same strand, or only on different strands, + //rule out different strandedness first. + //If the strand is unknown in either case, then queries regarding strandedness + //can not be answered, so we return false; + bool isSameStrand = (_strand == record->_strand && _strand != UNKNOWN); + bool isDiffStrand = ( _strand != UNKNOWN && record->_strand != UNKNOWN && _strand != record->_strand); + + if (wantSameStrand && !isSameStrand) { + return false; //want same, but they're not same. + } + if (wantDiffStrand && !isDiffStrand) { + return false; //want different, but they're not different. + } + + int otherStart = record->getStartPos(); + int otherEnd = record->getEndPos(); + + int maxStart = max(_startPos, otherStart); + int minEnd = min(_endPos, otherEnd); + + //rule out all cases of no intersection at all + if (minEnd < maxStart) { + return false; + } + + + int overlapBases = minEnd - maxStart; + int len = _endPos - _startPos; + int otherLen = otherEnd - otherStart; + + if ((float)overlapBases / (float)len < overlapFraction) { + return false; + } + + //at this point, we've ruled out strandedness, non-intersection, + //and query coverage (overlapFraction). The only thing left is + //database coverage. + if (!reciprocal) { + return true; + } + + if ((float)overlapBases / (float)otherLen >= overlapFraction) { + return true; + } + + return false; +} diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h new file mode 100644 index 0000000000000000000000000000000000000000..9dc1ae4e555fa7eb8e5f3a5a814296e09f351748 --- /dev/null +++ b/src/utils/FileRecordTools/Records/Record.h @@ -0,0 +1,117 @@ +/* + * Record.h + * + * Created on: Nov 8, 2012 + * Author: nek3d + */ + +using namespace std; + +#ifndef RECORD_H_ +#define RECORD_H_ + +#include "FreeList.h" +#include "QuickString.h" +#include "FileRecordTypeChecker.h" + +class FileRecordMgr; +class FileReader; +class ChromIdLookup; + +class Record { +public: + friend class RecordMgr; + friend class RecordOutputMgr; + + friend class FreeList<Record>; + + typedef enum { FORWARD, REVERSE, UNKNOWN } strandType; + Record(); + virtual bool initFromFile(FileReader *) =0; + virtual void clear(); + virtual void print(QuickString &outBuf) const {} + virtual void print(QuickString &outBuf, int start, int end) const {} + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const {} + virtual void printNull(QuickString &outBuf) const {} + virtual const Record & operator=(const Record &); + + virtual const QuickString &getChrName() const { return _chrName; } + virtual void setChrName(const QuickString &chr) { _chrName = chr; } + virtual void setChrName(const string &chr) { _chrName = chr; } + virtual void setChrName(const char *chr) { _chrName = chr; } + + + virtual int getChromId() const { return _chrId; } + virtual void setChromId(int id) { _chrId = id; } + + virtual int getStartPos() const { return _startPos; } + virtual void setStartPos(int startPos) { _startPos = startPos; } + virtual const QuickString &getStartPosStr() const { return _startPosStr; } + virtual void setStartPosStr(const QuickString &str) { _startPosStr = str; } + + virtual int getEndPos() const { return _endPos; } + virtual void setEndPos(int endPos) { _endPos = endPos; } + virtual const QuickString &getEndPosStr() const { return _endPosStr; } + virtual void setEndPosStr(const QuickString &str) { _endPosStr = str; } + + + virtual strandType getStrand() const { return _strand; } + virtual void setStrand(strandType val) { _strand = val; } + virtual void setStrand(char val); + virtual char getStrandChar() const; + + //And we have a similar problem with name and score + virtual const QuickString &getName() const { return _name; } + virtual void setName(const QuickString &chr) { _name = chr; } + virtual void setName(const string &chr) { _name = chr; } + virtual void setName(const char *chr) { _name = chr; } + + virtual const QuickString &getScore() const { return _score; } + virtual void setScore(const QuickString &chr) { _score = chr; } + virtual void setScore(const string &chr) { _score = chr; } + virtual void setScore(const char *chr) { _score = chr; } + + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; } + + virtual bool operator < (const Record &other) const; + virtual bool operator > (const Record &other) const; + + + //is this on the same chromosome as another record? + bool sameChrom(const Record *other) const; + bool chromBefore(const Record *other) const; + bool chromAfter(const Record *other) const; + + //is this record after the other one? + virtual bool after(const Record *other) const; + + //does this record intersect with another record? + virtual bool intersects(const Record *otherRecord, + bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) const; + + // *** WARNING !!! ** sameChromIntersects is a faster version of the intersects method, + // BUT the caller MUST ensure that the records are on the same + //chromosome. If you're not absolutely sure, use the regular intersects method. + virtual bool sameChromIntersects(const Record *otherRecord, + bool sameStrand, bool diffStrand, float overlapFraction, bool reciprocal) const; + + +protected: + virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record. + + QuickString _chrName; + int _chrId; + int _startPos; + int _endPos; + //It is actually faster to also store the start and end positions as their original strings than to + //have to convert their integer representations back to strings when printing them. + QuickString _startPosStr; + QuickString _endPosStr; + QuickString _name; + QuickString _score; + strandType _strand; +}; + + + +#endif /* RECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/RecordKeyList.cpp b/src/utils/FileRecordTools/Records/RecordKeyList.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6f25dacd3501852fdea29dc235ee8ed972b82468 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordKeyList.cpp @@ -0,0 +1,68 @@ +#include "RecordKeyList.h" + +RecordKeyList::RecordKeyList() +: _key(NULL) +{ +} + +RecordKeyList::RecordKeyList(const Record * item) +: _key(item) +{ +} + +RecordKeyList::RecordKeyList(const Record * item, const listType &list) +: _key(item) +{ + _list = list; +} + +RecordKeyList::~RecordKeyList() { +} + +const RecordKeyList &RecordKeyList::operator=(const RecordKeyList &other) +{ + setKey(other._key); + _list = other._list; + return *this; +} + +const RecordKeyList::const_iterator_type RecordKeyList::begin() { + return _list.begin(); +} + +const RecordKeyList::const_iterator_type RecordKeyList::next() { + return _list.next(); +} + + +const RecordKeyList::const_iterator_type RecordKeyList::end() { + return _list.end(); +} + +size_t RecordKeyList::size() const { + return _list.size(); +} + +bool RecordKeyList::empty() const { + return _list.empty(); +} + +void RecordKeyList::push_back(elemType item) { + _list.push_back(item); +} + +const Record *RecordKeyList::getKey() const { + return _key; +} + +void RecordKeyList::setKey(elemType key) { + _key = key; +} + +void RecordKeyList::setListNoCopy(listType &list) { + _list.assignNoCopy(list); +} + +void RecordKeyList::clearList() { + _list.clear(); +} diff --git a/src/utils/FileRecordTools/Records/RecordKeyList.h b/src/utils/FileRecordTools/Records/RecordKeyList.h new file mode 100644 index 0000000000000000000000000000000000000000..f5d1251427d16143d57c8744dc5263782e1af646 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordKeyList.h @@ -0,0 +1,60 @@ +/* + * RecordKeyList.h + * + * Created on: Dec 14, 2012 + * Author: nek3d + */ + +#ifndef KEYLIST_H_ +#define KEYLIST_H_ + + + + +using namespace std; + +#include "Record.h" +#include "BTlist.h" + +class RecordKeyList { +public: + typedef const Record * elemType; + typedef BTlist<elemType> listType; + typedef const BTlistNode<elemType> *const_iterator_type; + RecordKeyList(); + RecordKeyList(elemType item); + RecordKeyList(elemType item, const listType &list); + ~RecordKeyList(); + + const RecordKeyList &operator=(const RecordKeyList &other); + const const_iterator_type begin(); + const const_iterator_type next(); + const const_iterator_type end(); + size_t size() const ; + bool empty() const ; //only checks whether list is empty. Doesn't check key. + void push_back(elemType item); + elemType getKey() const ; + void setKey(elemType key); + + //setListNoCopy will make our list share the nodes of another + //list, not copy them. + void setListNoCopy(listType &list); + + // WARNING! clearList will NOT delete records pointed to by list nodes. Caller must do that separately, since the RecordKeyList + // does not have it's own RecordMgr member. + void clearList(); + + void clearAll() { + setKey(NULL); + clearList(); + } + bool allClear() { return (_key == NULL && empty()); } + + +private: + elemType _key; + listType _list; +}; + + +#endif /* RECORDKEYLIST_H_ */ diff --git a/src/utils/FileRecordTools/Records/RecordMgr.cpp b/src/utils/FileRecordTools/Records/RecordMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9799fedc325d208fa502cee814a1f29f6ca471e6 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordMgr.cpp @@ -0,0 +1,277 @@ +#include "RecordMgr.h" + +#include "FreeList.h" + +RecordMgr::RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize) +: _recordType(recType), + _freeList(NULL), + _freeListBlockSize(blockSize) +{ + switch(_recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + { + _freeList = new FreeList<Bed3Interval>(_freeListBlockSize); + break; + } + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + { + _freeList = new FreeList<Bed4Interval>(_freeListBlockSize); + break; + } + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + { + _freeList = new FreeList<Bed5Interval>(_freeListBlockSize); + break; + } + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + { + _freeList = new FreeList<BedGraphInterval>(_freeListBlockSize); + break; + } + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + { + _freeList = new FreeList<Bed6Interval>(_freeListBlockSize); + break; + } + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + { + _freeList = new FreeList<BedPlusInterval>(_freeListBlockSize); + break; + } + case FileRecordTypeChecker::BED12_RECORD_TYPE: + { + _freeList = new FreeList<Bed12Interval>(_freeListBlockSize); + break; + } + case FileRecordTypeChecker::BAM_RECORD_TYPE: + { + _freeList = new FreeList<BamRecord>(_freeListBlockSize); + break; + } + case FileRecordTypeChecker::VCF_RECORD_TYPE: + { + _freeList = new FreeList<VcfRecord>(_freeListBlockSize); + break; + } + case FileRecordTypeChecker::GFF_RECORD_TYPE: + { + _freeList = new FreeList<GffRecord>(_freeListBlockSize); + break; + } + + + default: + break; + } +} + +RecordMgr::~RecordMgr() +{ + switch(_recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + { + delete (FreeList<Bed3Interval> *)_freeList; + break; + } + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + { + delete (FreeList<Bed4Interval> *)_freeList; + break; + } + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + { + delete (FreeList<Bed5Interval> *)_freeList; + break; + } + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + { + delete (FreeList<BedGraphInterval> *)_freeList; + break; + } + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + { + delete (FreeList<Bed6Interval> *)_freeList; + break; + } + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + { + delete (FreeList<BedPlusInterval> *)_freeList; + break; + } + case FileRecordTypeChecker::BED12_RECORD_TYPE: + { + delete (FreeList<Bed12Interval> *)_freeList; + break; + } + case FileRecordTypeChecker::BAM_RECORD_TYPE: + { + delete (FreeList<BamRecord> *)_freeList; + break; + } + case FileRecordTypeChecker::VCF_RECORD_TYPE: + { + delete (FreeList<VcfRecord> *)_freeList; + break; + } + + case FileRecordTypeChecker::GFF_RECORD_TYPE: + { + delete (FreeList<GffRecord> *)_freeList; + break; + } + + + default: + break; + } +} + +Record *RecordMgr::allocateRecord() +{ + Record *record = NULL; + switch(_recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + { + Bed3Interval *b3i = ((FreeList<Bed3Interval> *)_freeList)->newObj(); + record = b3i; + break; + } + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + { + Bed4Interval *b4i = ((FreeList<Bed4Interval> *)_freeList)->newObj(); + record = b4i; + break; + } + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + { + Bed5Interval *b5i = ((FreeList<Bed5Interval> *)_freeList)->newObj(); + record = b5i; + break; + } + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + { + BedGraphInterval *bgi = ((FreeList<BedGraphInterval> *)_freeList)->newObj(); + record = bgi; + break; + } + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + { + Bed6Interval *b6i = ((FreeList<Bed6Interval> *)_freeList)->newObj(); + record = b6i; + break; + } + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + { + BedPlusInterval *bPi = ((FreeList<BedPlusInterval> *)_freeList)->newObj(); + record = bPi; + break; + } + case FileRecordTypeChecker::BED12_RECORD_TYPE: + { + Bed12Interval *b12i = ((FreeList<Bed12Interval> *)_freeList)->newObj(); + record = b12i; + break; + } + case FileRecordTypeChecker::BAM_RECORD_TYPE: + { + BamRecord *bamRec = ((FreeList<BamRecord> *)_freeList)->newObj(); + record = bamRec; + break; + } + + case FileRecordTypeChecker::VCF_RECORD_TYPE: + { + VcfRecord *vcfRec = ((FreeList<VcfRecord> *)_freeList)->newObj(); + record = vcfRec; + break; + } + + case FileRecordTypeChecker::GFF_RECORD_TYPE: + { + GffRecord *gfr = ((FreeList<GffRecord> *)_freeList)->newObj(); + record = gfr; + break; + } + + + default: + break; + } + return record; +} + +void RecordMgr::deleteRecord(const Record *record) +{ + switch(_recordType) { + case FileRecordTypeChecker::BED3_RECORD_TYPE: + { + ((FreeList<Bed3Interval> *)_freeList)->deleteObj(static_cast<const Bed3Interval *>(record)); + break; + } + + case FileRecordTypeChecker::BED4_RECORD_TYPE: + { + ((FreeList<Bed4Interval> *)_freeList)->deleteObj(static_cast<const Bed4Interval *>(record)); + break; + } + + case FileRecordTypeChecker::BED5_RECORD_TYPE: + { + ((FreeList<Bed5Interval> *)_freeList)->deleteObj(static_cast<const Bed5Interval *>(record)); + break; + } + + case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: + { + ((FreeList<BedGraphInterval> *)_freeList)->deleteObj(static_cast<const BedGraphInterval *>(record)); + break; + } + + case FileRecordTypeChecker::BED6_RECORD_TYPE: + { + ((FreeList<Bed6Interval> *)_freeList)->deleteObj(static_cast<const Bed6Interval *>(record)); + break; + } + case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: + { + ((FreeList<BedPlusInterval> *)_freeList)->deleteObj(static_cast<const BedPlusInterval *>(record)); + break; + } + case FileRecordTypeChecker::BED12_RECORD_TYPE: + { + ((FreeList<Bed12Interval> *)_freeList)->deleteObj(static_cast<const Bed12Interval *>(record)); + break; + } + case FileRecordTypeChecker::BAM_RECORD_TYPE: + { + ((FreeList<BamRecord> *)_freeList)->deleteObj(static_cast<const BamRecord *>(record)); + break; + } + + case FileRecordTypeChecker::VCF_RECORD_TYPE: + { + ((FreeList<VcfRecord> *)_freeList)->deleteObj(static_cast<const VcfRecord *>(record)); + break; + } + case FileRecordTypeChecker::GFF_RECORD_TYPE: + { + ((FreeList<GffRecord> *)_freeList)->deleteObj(static_cast<const GffRecord *>(record)); + break; + } + + default: + break; + } +} diff --git a/src/utils/FileRecordTools/Records/RecordMgr.h b/src/utils/FileRecordTools/Records/RecordMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..1d73f9055877fc42f76cc2cf0d10efd498101a51 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordMgr.h @@ -0,0 +1,47 @@ +/* + * RecordMgr.h + * + * Created on: Jan 9, 2013 + * Author: nek3d + */ + +#ifndef RECORDMGR_H_ +#define RECORDMGR_H_ + +//include headers for all Records and derivative classes +#include "Record.h" +#include "Bed3Interval.h" +#include "Bed4Interval.h" +#include "Bed5Interval.h" +#include "BedGraphInterval.h" +#include "Bed6Interval.h" +#include "BedPlusInterval.h" +#include "Bed12Interval.h" +#include "BamRecord.h" +#include "VcfRecord.h" +#include "GffRecord.h" + +#include "FileRecordTypeChecker.h" + +class Record; + +class RecordMgr { +public: + RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize = 512); + ~RecordMgr(); + + Record *allocateRecord(); + + //WARNING: Even though the deleteRecord method takes a const pointer, user should only pass objects + //that are safe to delete! Const-ness will be cast away internally. + void deleteRecord(const Record *); + +private: + FileRecordTypeChecker::RECORD_TYPE _recordType; + void *_freeList; + int _freeListBlockSize; +}; + + + +#endif /* RECORDMGR_H_ */ diff --git a/src/utils/FileRecordTools/Records/VcfRecord.cpp b/src/utils/FileRecordTools/Records/VcfRecord.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1da6e9ad897ccb28474e6da9ea381160e37043b0 --- /dev/null +++ b/src/utils/FileRecordTools/Records/VcfRecord.cpp @@ -0,0 +1,82 @@ +/* + * VcfRecord.cpp + * + * Created on: May 1, 2013 + * Author: nek3d + */ + +#include "VcfRecord.h" +#include "SingleLineDelimTextFileReader.h" + +bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + fileReader->getField(0, _chrName); + _chrId = fileReader->getCurrChromdId(); + fileReader->getField(1, _startPosStr); + _startPos = str2chrPos(_startPosStr); + _startPos--; // VCF is one-based. Here we intentionally don't decrement the string version, + //because we'll still want to output the one-based number in the print methods, even though + //internally we decrement the integer to comply with the 0-based format common to other records. + fileReader->getField(3, _varAlt); + //endPos is just the startPos plus the length of the variant + _endPos = _startPos + _varAlt.size(); + int2str(_endPos, _endPosStr); + + fileReader->getField(2, _name); + fileReader->getField(4, _varRef); + fileReader->getField(5, _score); + + return initOtherFieldsFromFile(fileReader); +} + +void VcfRecord::clear() +{ + BedPlusInterval::clear(); + _varAlt.clear(); + _varRef.clear(); +} + +void VcfRecord::print(QuickString &outBuf) const { + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_startPosStr); + printOtherFields(outBuf); +} + +void VcfRecord::print(QuickString &outBuf, int start, int end) const { + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_startPosStr); + printOtherFields(outBuf); +} + +void VcfRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const { + outBuf.append(_chrName); + outBuf.append('\t'); + outBuf.append(_startPosStr); + printOtherFields(outBuf); + +} + +void VcfRecord::printNull(QuickString &outBuf) const { + outBuf.append(".\t-1\t.\t.\t.\t-1"); + for (int i= startOtherIdx; i < _numPrintFields; i++) { + outBuf.append("\t."); + } +} + +void VcfRecord::printOtherFields(QuickString &outBuf) const { + outBuf.append('\t'); + outBuf.append(_name); + outBuf.append('\t'); + outBuf.append(_varAlt); + outBuf.append('\t'); + outBuf.append(_varRef); + outBuf.append('\t'); + outBuf.append(_score); + for (int i= 0; i < (int)_otherIdxs.size(); i++) { + outBuf.append('\t'); + outBuf.append(*(_otherIdxs[i])); + } + +} diff --git a/src/utils/FileRecordTools/Records/VcfRecord.h b/src/utils/FileRecordTools/Records/VcfRecord.h new file mode 100644 index 0000000000000000000000000000000000000000..c4325f948144d8a6077c7afbf6c7fc16d5a91776 --- /dev/null +++ b/src/utils/FileRecordTools/Records/VcfRecord.h @@ -0,0 +1,39 @@ +/* + * VcfRecord.h + * + * Created on: Apr 30, 2013 + * Author: nek3d + */ + +#ifndef VCFRECORD_H_ +#define VCFRECORD_H_ + +#include "BedPlusInterval.h" + +class SingleLineDelimTextFileReader; + +class VcfRecord : public BedPlusInterval { +public: + friend class FreeList<VcfRecord>; + + VcfRecord() {} + virtual bool initFromFile(SingleLineDelimTextFileReader *); + virtual void clear(); + virtual void print(QuickString &outBuf) const; + virtual void print(QuickString &outBuf, int start, int end) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; + virtual void printNull(QuickString &outBuf) const; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::VCF_RECORD_TYPE; } + + //Note: using the assignment operator in a BedPlusInterval can potentially be a performance hit, + //if the number of fields frequently differ between this object and the one being copied. + const BedPlusInterval &operator=(const VcfRecord &other); + + +protected: + QuickString _varAlt; + QuickString _varRef; + void printOtherFields(QuickString &outBuf) const; +}; + +#endif /* VCFRECORD_H_ */ diff --git a/src/utils/fileType/FileRecordTypeChecker.cpp b/src/utils/fileType/FileRecordTypeChecker.cpp new file mode 100644 index 0000000000000000000000000000000000000000..09a4d9eb6d0e7da7110e05493c88e5158f27bc60 --- /dev/null +++ b/src/utils/fileType/FileRecordTypeChecker.cpp @@ -0,0 +1,387 @@ + +#include "FileRecordTypeChecker.h" +#include "api/BamReader.h" +#include "ParseTools.h" + +FileRecordTypeChecker::FileRecordTypeChecker() +{ + _fileType = UNKNOWN_FILE_TYPE; + _recordType = UNKNOWN_RECORD_TYPE; + _numFields = 0; + _isBinary = false; + _isText = false; + _isBed = false; + _isDelimited = false; + _delimChar = '\0'; + _lines.clear(); + _firstValidDataLineIdx = -1; + _isVCF = false; + _isBAM = false; + _isGFF = false; + _isGzipped = false; + _insufficientData = false; + _fourthFieldNumeric = false; + + _hasName[UNKNOWN_RECORD_TYPE] = false; + _hasName[BED3_RECORD_TYPE] = false; + _hasName[BED6_RECORD_TYPE] = true; + _hasName[BED12_RECORD_TYPE] = true; + _hasName[BED_PLUS_RECORD_TYPE] = true; + _hasName[BAM_RECORD_TYPE] = true; + _hasName[VCF_RECORD_TYPE] = true; + _hasName[GFF_RECORD_TYPE] = true; + + _hasScore[UNKNOWN_RECORD_TYPE] = false; + _hasScore[BED3_RECORD_TYPE] = false; + _hasScore[BED6_RECORD_TYPE] = true; + _hasScore[BED12_RECORD_TYPE] = true; + _hasScore[BED_PLUS_RECORD_TYPE] = true; + _hasScore[BAM_RECORD_TYPE] = true; + _hasScore[VCF_RECORD_TYPE] = true; + _hasScore[GFF_RECORD_TYPE] = true; + + _hasStrand[UNKNOWN_RECORD_TYPE] = false; + _hasStrand[BED3_RECORD_TYPE] = false; + _hasStrand[BED6_RECORD_TYPE] = true; + _hasStrand[BED12_RECORD_TYPE] = true; + _hasStrand[BED_PLUS_RECORD_TYPE] = true; + _hasStrand[BAM_RECORD_TYPE] = true; + _hasStrand[VCF_RECORD_TYPE] = true; + _hasStrand[GFF_RECORD_TYPE] = true; + + _recordTypeNames[UNKNOWN_RECORD_TYPE] = "Unknown record type"; + _recordTypeNames[BED3_RECORD_TYPE] = "Bed3 record type"; + _recordTypeNames[BED6_RECORD_TYPE] = "Bed6 record type"; + _recordTypeNames[BED12_RECORD_TYPE] = "Bed12 record type"; + _recordTypeNames[BED_PLUS_RECORD_TYPE] = "BedPlus record type"; + _recordTypeNames[BAM_RECORD_TYPE] = "BAM record type"; + _recordTypeNames[VCF_RECORD_TYPE] = "VCF record type"; + _recordTypeNames[GFF_RECORD_TYPE] = "GFF record type"; + +// UNKNOWN_FILE_TYPE, SINGLE_LINE_DELIM_TEXT_FILE_TYPE, +// MULTI_LINE_ENTRY_TEXT_FILE_TYPE, +// GFF_FILE_TYPE, GZIP_FILE_TYPE, BAM_FILE_TYPE + + _fileTypeNames[UNKNOWN_FILE_TYPE] = "Unknown file type"; + _fileTypeNames[SINGLE_LINE_DELIM_TEXT_FILE_TYPE] = "Delimited text file type"; + _fileTypeNames[GZIP_FILE_TYPE] = "Gzip file type"; + _fileTypeNames[BAM_FILE_TYPE] = "BAM file type"; + _fileTypeNames[VCF_FILE_TYPE] = "VCF file type"; +} + + +bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len) +{ + if (len == 0) { + len = strlen(buffer); + } + _numBytesInBuffer = len; + if (_numBytesInBuffer == 0) { + cerr << "Error: Type checker received empty buffer." << endl; + exit(1); + } + + //special: the first thing we do is look for a gzipped file. + if (!_isGzipped && ((unsigned char)(buffer[0]) == 0x1f)) { + _isGzipped = true; + return true; + } + //scan the first 8K block of the streamBuf. + + //now we have a buffer from the file. + //first, test to see if it's binary or text. + if (isBinaryBuffer(buffer, len)) { + _isText = false; + _isBinary = true; + return true; + } else { + _isText = true; + _isBinary = false; + return handleTextFormat(buffer, len); + } +} + +bool FileRecordTypeChecker::isBinaryBuffer(const char *buffer, size_t len) +{ + if (isBAM(buffer)) { + return true; + } + + //Let's say that in a text file, at least 90% of the characters + //should be alphanumeric, whitespace, or punctuation. + + int alphaNumCount = 0; + int whiteSpaceCount = 0; + int punctuationCount = 0; + + for (int i=0; i < (int)len; i++) { + char currChar = buffer[i]; + if (isalnum(currChar)) { + alphaNumCount++; + } else if (isspace(currChar)) { + whiteSpaceCount++; + } else if (ispunct(currChar)) { + punctuationCount++; + } + } + + if ((float)(alphaNumCount + whiteSpaceCount + punctuationCount) / (float)(_numBytesInBuffer) < PERCENTAGE_PRINTABLE) { + return true; + } + return false; +} + + +bool FileRecordTypeChecker::isBAM(const char *buffer) +{ + //check for BAM. The Bam Magic String is "BAM\1", and should be the first 4 characters of the file. + + if (strncmp(buffer, "BAM\1", 4) == 0) { + _isBAM = true; + _fileType = BAM_FILE_TYPE; + _recordType = BAM_RECORD_TYPE; + return true; + } + + //TBD: Handle other binary formats + return false; +} + +bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) +{ + if (isVCFformat(buffer)) { + return isTextDelimtedFormat(buffer, len); + } else if (isTextDelimtedFormat(buffer, len)) { + + //At this point, _isText and _isDelimited are set. _numFields and _delimChar are + //set. + _fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE; + + //Tokenize the first line of valid data into fields. + const QuickString &line = _lines[_firstValidDataLineIdx]; + _currLineElems.clear(); + if (Tokenize(line, _currLineElems, _delimChar, _numFields) != _numFields) { + cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl; + exit(1); + } + + if (isBedFormat()) { + _isBed = true; + if (_numFields == 3) { + _recordType = BED3_RECORD_TYPE; + } else if (_numFields == 4) { + if (isNumeric(_currLineElems[3])) { + _recordType = BEDGRAPH_RECORD_TYPE; + _fourthFieldNumeric = true; + } else { + _fourthFieldNumeric = false; + _recordType = BED4_RECORD_TYPE; + } + } else if (_numFields == 5) { + _recordType = BED5_RECORD_TYPE; + } else if (_numFields == 6) { + _recordType = BED6_RECORD_TYPE; + } else if (_numFields == 12) { + _recordType = BED12_RECORD_TYPE; + } else if (_numFields >6) { + _recordType = BED_PLUS_RECORD_TYPE; + } + return true; + } + if (isGFFformat()) { + _isGFF = true; + _recordType = GFF_RECORD_TYPE; + return true; + } + return false; + } + return false; +} + +bool FileRecordTypeChecker::isVCFformat(const char *buffer) +{ + if (_isVCF) { + return true; //previous pass through this method has determined file is VCF. + } + if (memcmp(buffer, "##fileformat=VCF", 16) == 0) { + _isVCF = true; + _fileType = VCF_FILE_TYPE; + _recordType = VCF_RECORD_TYPE; + return true; + } + return false; +} + +bool FileRecordTypeChecker::isBedFormat() { + + //test that the file has at least three fields. + //2nd and 3rd fields of first valid data line must be integers. 3rd must not be less than 2nd. + if (_numFields < 3) { + return false; + } + //the 2nd and 3rd fields must be numeric. + if (!isNumeric(_currLineElems[1]) || !isNumeric(_currLineElems[2])) { + return false; + } + + int start = str2chrPos(_currLineElems[1]); + int end = str2chrPos(_currLineElems[2]); + if (end < start) { + return false; + } + return true; +} + +bool FileRecordTypeChecker::isGFFformat() +{ + //a GFF file may have 8 or 9 fields. + if (_numFields < 7 || _numFields > 9) { + return false; + } + //the 4th and 5th fields must be numeric. + if (!isNumeric(_currLineElems[3]) || !isNumeric(_currLineElems[4])) { + return false; + } + int start = str2chrPos(_currLineElems[3]); + int end = str2chrPos(_currLineElems[4]); + if (end < start) { + return false; + } + return true; +} + +bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) +{ + //Break single string buffer into vector of QuickStrings. Delimiter is newline. + _lines.clear(); + int numLines = Tokenize(buffer, _lines, '\n', len); + + //anticipated delimiter characters are tab, comma, and semi-colon. + //If we need new ones, they must be added in this method. + //search each line for delimiter characters. + + vector<int> tabCounts; + vector<int> commaCounts; + vector<int> semicolonCounts; + + tabCounts.reserve(numLines); + commaCounts.reserve(numLines); + semicolonCounts.reserve(numLines); + + //loop through the lines, ignoring headers. Count potential delimiters, + //see if we can find a few lines with the same number of a given delimiter. + //delims are tested in hierarchical order, starting with tab,then comma, then semi-colon. + + int validLinesFound=0; + int headerCount = 0; + int emptyLines = 0; + for (int i=0; i < numLines; i++ ) { + + if (validLinesFound >=4) { + break; //really only need to look at like 4 lines of data, max. + } + QuickString &line = _lines[i]; + int len =line.size(); + //skip over any empty line + if (len == 0) { + emptyLines++; + continue; + } + + //skip over any header line + if (isHeaderLine(line)) { + //clear any previously found supposedly valid data lines, because valid lines can only come after header lines. + if (_firstValidDataLineIdx > -1 && _firstValidDataLineIdx < i) { + _firstValidDataLineIdx = -1; + validLinesFound--; + headerCount++; + } + headerCount++; + continue; + } + //a line must have some alphanumeric characters in order to be valid. + bool hasAlphaNum = false; + for (int j=0; j < len; j++) { + if(isalnum(line[j])) { + hasAlphaNum = true; + break; + } + } + if (!hasAlphaNum) { + continue; + } + + validLinesFound++; + if (_firstValidDataLineIdx == -1) { + _firstValidDataLineIdx = i; + } + + int lineTabCount = 0, lineCommaCount=0, lineSemicolonCount =0; + for (int j=0; j < len; j++) { + char currChar = line[j]; + if (currChar == '\t') { + lineTabCount++; + } else if (currChar == ',') { + lineCommaCount++; + } else if (currChar == ';') { + lineSemicolonCount++; + } + } + tabCounts.push_back(lineTabCount); + commaCounts.push_back(lineCommaCount); + semicolonCounts.push_back(lineSemicolonCount); + } + + if (headerCount + emptyLines == numLines) { + _insufficientData = true; + } + if (validLinesFound == 0) { + return false; + } + _insufficientData = false; + + if (delimiterTesting(tabCounts, '\t')) { + return true; + } + if (delimiterTesting(commaCounts, ',')) { + return true; + } + if (delimiterTesting(semicolonCounts, ';')) { + return true; + } + + return false; //unable to detect delimited file. +} + +bool FileRecordTypeChecker::delimiterTesting(vector<int> &counts, char suspectChar) +{ + //check to see if we found the same number of tabs in every line. + int numDelims = counts[0]; + if (numDelims != 0) { + bool countsMatch = true; + for (int i=1; i < (int)counts.size(); i++) { + if (counts[i] != numDelims) { + countsMatch = false; + } + } + if (countsMatch) { + //Hurray!! We have successfully found a delimited file. + _isDelimited = true; + _delimChar = suspectChar; + _numFields = numDelims +1; + return true; + } else { + return false; + } + } + return false; +} + + +void FileRecordTypeChecker::setBam() +{ + _fileType = BAM_FILE_TYPE; + _recordType = BAM_RECORD_TYPE; + _isBinary = true; + _isBAM = true; +} diff --git a/src/utils/fileType/FileRecordTypeChecker.h b/src/utils/fileType/FileRecordTypeChecker.h new file mode 100644 index 0000000000000000000000000000000000000000..6ed3eff05dbc25dfb2117494c69fe88410066f17 --- /dev/null +++ b/src/utils/fileType/FileRecordTypeChecker.h @@ -0,0 +1,125 @@ +/* + * FileRecordTypeChecker.h + * + * Created on: Nov 19, 2012 + * Author: nek3d + */ + +#ifndef FILERECORDTYPECHECKER_H_ +#define FILERECORDTYPECHECKER_H_ + +using namespace std; + +#include <string> +#include <cstring> +#include <cstdio> +#include <cctype> +#include <cstdlib> +#include <vector> +#include <map> +#include "PushBackStreamBuf.h" + +class FileRecordTypeChecker { +public: + + FileRecordTypeChecker(); + ~FileRecordTypeChecker() {} + + typedef enum { UNKNOWN_FILE_TYPE, SINGLE_LINE_DELIM_TEXT_FILE_TYPE, + MULTI_LINE_ENTRY_TEXT_FILE_TYPE, + GFF_FILE_TYPE, GZIP_FILE_TYPE, BAM_FILE_TYPE, VCF_FILE_TYPE} FILE_TYPE; + + typedef enum { UNKNOWN_RECORD_TYPE, BED3_RECORD_TYPE, BED4_RECORD_TYPE, BEDGRAPH_RECORD_TYPE, BED5_RECORD_TYPE, + BED6_RECORD_TYPE, BED12_RECORD_TYPE, BED_PLUS_RECORD_TYPE, BAM_RECORD_TYPE, VCF_RECORD_TYPE, GFF_RECORD_TYPE} RECORD_TYPE; + + bool scanBuffer(const char *buf, size_t len=0); + bool needsMoreData() const { return _insufficientData; } + + bool recordTypeHasName(RECORD_TYPE type) const { return _hasName.find(type) != _hasName.end(); } + bool recordTypeHasScore(RECORD_TYPE type) const { return _hasScore.find(type) != _hasScore.end(); } + bool recordTypeHasStrand(RECORD_TYPE type) const { return _hasStrand.find(type) != _hasStrand.end(); } + + FILE_TYPE getFileType() const { return _fileType; } + RECORD_TYPE getRecordType() const { return _recordType; } + const string &getFileTypeName() const { + return _fileTypeNames.find(_fileType)->second; + } + const string &getRecordTypeName() const { + return _recordTypeNames.find(_recordType)->second; + } + + bool isBinary() const { return _isBinary; } + bool isBam() const { return _isBAM; } + bool isGzipped() const { return _isGzipped; } + + void setBam(); //call only if you're SURE the file is BAM! + + + bool isText() const { return _isText; } + bool isDelimited() const { return _isDelimited; } + char getDelimChar() const { return _delimChar; } + int getNumFields() const { return _numFields; } + + bool isVcf() const; + bool isBed() const { return _isBed; } + bool isBed3() const { return (_isBed && _numFields == 3); } + bool isBed4() const { return (_isBed && _numFields == 4 && !_fourthFieldNumeric); } + bool isBedGraph() const { return (_isBed && _numFields == 4 && _fourthFieldNumeric); } + bool isBed5() const { return (_isBed && _numFields == 3); } + bool isBed6() const { return (_isBed && _numFields == 6); } + bool isBedPlus() const { return (_isBed && _numFields > 6 && _numFields != 12); } + bool isBed12() const { return (_isBed && _numFields == 12); } + bool isGFF() const { return _isGFF; } + + + + + + +private: + FILE_TYPE _fileType; + RECORD_TYPE _recordType; + + vector<QuickString> _lines; + vector<QuickString> _currLineElems; + int _firstValidDataLineIdx; + static const int SCAN_BUFFER_SIZE = 8192; //8 KB buffer + int _numBytesInBuffer; //this will hold the length of the buffer after the scan. + + int _numFields; + bool _isBinary; + bool _isText; + bool _isBed; + bool _isDelimited; + char _delimChar; + bool _isVCF; + bool _isBAM; + bool _isGFF; + bool _isGzipped; + bool _insufficientData; //set to true if scan buffer had only header lines. + bool _fourthFieldNumeric; //this is just to distinguish between Bed4 and BedGraph files. + + map<RECORD_TYPE, string> _recordTypeNames; + map<FILE_TYPE, string> _fileTypeNames; + + map<RECORD_TYPE, bool> _hasName; + map<RECORD_TYPE, bool> _hasScore; + map<RECORD_TYPE, bool> _hasStrand; + + //this will be used in determining whether we are looking at a binary or text file. + static const float PERCENTAGE_PRINTABLE = .9; + bool isBinaryBuffer(const char *buffer, size_t len); + bool isBAM(const char *buffer); + bool handleTextFormat(const char *buffer, size_t len); + bool isTextDelimtedFormat(const char *buffer, size_t len); + bool isBedFormat(); + bool isVCFformat(const char *buffer); + bool isGFFformat(); + bool delimiterTesting(vector<int> &counts, char suspectChar); + + + + +}; + +#endif /* FILERECORDTYPECHECKER_H_ */ diff --git a/src/utils/general/BTlist.h b/src/utils/general/BTlist.h new file mode 100644 index 0000000000000000000000000000000000000000..0a5a57646bf4d346d5934aa8de5833ba39f8c1a3 --- /dev/null +++ b/src/utils/general/BTlist.h @@ -0,0 +1,234 @@ +/* + * BTlist.h + * + * Created on: Jan 18, 2013 + * Author: nek3d + */ + +#ifndef BT_LIST_H_ +#define BT_LIST_H_ + +// +// Writing our own singly linked list template class because: +// +// 1) STL doesn't currently support one that I can find without +// C++0x support or SGI extensions. +// +// 2) A normal singly linked list doesn't support constant time +// insertion or deletion at the end. It also doesn't track +// your current position or let you delete at that position, +// but this does. +// +// 3) A normal STL list is doubly-linked, which would do all +// these things, but more slowly and with more memory. +// +// 4) I can. + + +#include "FreeList.h" +#include "QuickString.h" +#include <cstring> //for memset + +template <class T> class BTlist; + +template <class T> +class BTlistNode { +public: + friend class BTlist<T>; + + BTlistNode() : _next(NULL){} + BTlistNode(const T &val) : _val(val), _next(NULL) {} + const T &value() const { return _val; } + const BTlistNode<T> *next() const { return _next; } + BTlistNode<T> *next() { return _next; } + bool hasNext() const { return _next != NULL; } + void clear() { + _next = NULL; + //for the value, set numbers to 0, ptrs to NULL, etc. + memset((void *)_val, 0, sizeof(T)); + } + +private: + + T _val; + BTlistNode *_next; +}; + +template <class T> class BTlist { +public: + BTlist() : + _begin(NULL), + _currEnd(NULL), + _prevCursor(NULL), + _size(0), + _dontDelete(false) + { + } + //This version of the constructor will convert a QuickString into a BTlist. + //It is effectively the opposite of the toStr method. + BTlist(const QuickString &str) : + _begin(NULL), + _currEnd(NULL), + _prevCursor(NULL), + _size(0), + _dontDelete(false) + { + for (int i=0; i < (int)str.size(); i++) { + push_back((T)(str[i])); + } + } + + ~BTlist() { + clear(); + } + + typedef const BTlistNode<T> * const_iterator_type; + + bool empty() const { return _begin == NULL; } + size_t size() const { return _size; } + + const BTlistNode<T> *front() const { return _begin;} + BTlistNode<T> *back() const { return _currEnd; } + void pop_front() { + if (empty()) { + return; + } + BTlistNode<T> *killNode = _begin; + if (_begin->_next == NULL) { //this is the only item. List is 1. + delete killNode; + _begin = NULL; + _currEnd = NULL; + _prevCursor = NULL; + _size = 0; + return; + } + if (_prevCursor == _begin) { + _prevCursor = _begin->_next; + } + _begin = _begin->next(); + delete killNode; + _size--; + } + + const BTlistNode<T> *begin() { + _prevCursor = NULL; + return _begin; + } + + const BTlistNode<T> *begin() const { + return _begin; + } + const BTlistNode<T> *next() { + if (_prevCursor == NULL) { + _prevCursor = _begin; + return _begin->_next; + } + _prevCursor = _prevCursor->_next; + return _prevCursor->_next; + } + const BTlistNode<T> *end() const { return NULL; } + + BTlistNode<T> * deleteCurrent() { + //delete what the current cursor is pointing to, + //meaning whatever the prevCursor's next member points to. + BTlistNode<T> *returnNode = NULL; + BTlistNode<T> *killNode = NULL; + if (_prevCursor == NULL) { + //deleting first item in list. + killNode = _begin; + _begin = _begin->_next; + returnNode = _begin; + + } else { + killNode = _prevCursor->_next; + _prevCursor->_next = _prevCursor->_next->_next; + returnNode = _prevCursor->_next; + } + if (_currEnd == killNode) { + //deleting last item in list + _currEnd = _prevCursor; //back up the current end. + } + delete killNode; + _size--; + + return returnNode; + } + + void push_back(const T &val) { + BTlistNode<T> *newNode = new BTlistNode<T>(val); + if (empty()) { + _begin = newNode; + _currEnd = newNode; + _prevCursor = NULL; + } else { + _currEnd->_next = newNode; + _prevCursor = _currEnd; + _currEnd = newNode; + } + _size++; + } + + void clear() { + if (_dontDelete) { + return; + } + //delete all nodes, set cursors to NULL, set size to 0. + BTlistNode<T> *killNode = _begin; + while (killNode != NULL) { + BTlistNode<T> *nextNode = killNode->_next; + delete killNode; + killNode = nextNode; + } + _begin = NULL; + _currEnd = NULL; + _prevCursor = NULL; + _size = 0; + } + + const BTlist<T> &operator=(const BTlist<T> &other) { + //need to make copies of all nodes and assign values + clear(); + + BTlistNode<T> *node = other._begin; + while (node != NULL) { + push_back(node->_val); + node = node->_next; + } + return *this; + } + void assignNoCopy(BTlist<T> &other) { + _begin = other._begin; + _size = other._size; + _currEnd = other._currEnd; + _prevCursor = other._prevCursor; + _dontDelete = true; + } + + //this method will convert the contents of the list into a QuickString. + //It assumes that the templated type of the listNode can be converted to a + //char using the (char) cast. The user must ensure that this is the case. + void toStr(QuickString &str, bool append = false) const { + if (!append) { + str.clear(); + } + str.reserve(_size + str.size()); + for (const BTlistNode<T> *iter = begin(); iter != end(); iter = iter->next()) { + str.append((char)iter->value()); + } + } + +private: + BTlistNode<T> *_begin; + //keep a cursor to the last element for constant time push_back and pop_back functions. + BTlistNode<T> *_currEnd; + BTlistNode<T> *_prevCursor; + size_t _size; + + bool _dontDelete; //this will usally be false, but set to true when + //calling assignNoCopy, as our list will actually just be a pointer + //to another list. + + +}; + +#endif /* BT_LIST_H_ */ diff --git a/src/utils/general/BedtoolsTypes.h b/src/utils/general/BedtoolsTypes.h new file mode 100644 index 0000000000000000000000000000000000000000..46aef49cdadef84fa0b961a847f69ee4a9b2aaea --- /dev/null +++ b/src/utils/general/BedtoolsTypes.h @@ -0,0 +1,33 @@ +/* + * BedtoolsTypes.h + * + * Created on: Feb 13, 2013 + * Author: nek3d + */ + +#ifndef BEDTOOLSTYPES_H_ +#define BEDTOOLSTYPES_H_ + +using namespace std; + +#include <stdint.h> +#include <climits> +#include <string> +#include <cstdlib> +#include <cstring> +#include <cstdio> +#include <iostream> +#include "QuickString.h" + +//STL headers +#include <utility> //include pair and make_pair +#include <vector> +#include <set> +#include <map> + + +typedef uint32_t CHRPOS; + + + +#endif /* BEDTOOLSTYPES_H_ */ diff --git a/src/utils/general/CompressionTools.cpp b/src/utils/general/CompressionTools.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a29d35ad03afa59c48fe431327dae0be9186274f --- /dev/null +++ b/src/utils/general/CompressionTools.cpp @@ -0,0 +1,45 @@ +/* + * CompressionTools.cpp + * + * Created on: Apr 10, 2013 + * Author: nek3d + */ + +#include "CompressionTools.h" +#include <cstring> //for strlen + +unsigned long inflateGzippedArray(const BTlist<int> &inList, unsigned char* outbuf, size_t outbufSize, size_t inbufSize) +{ + if (inbufSize == 0) { + inbufSize = inList.size(); + } + Bytef *inbuf = new Bytef[inbufSize +1]; + memset(inbuf, 0, inbufSize +1); + + int i=0; + for (BTlist<int>::const_iterator_type iter = inList.begin(); iter != inList.end(); iter = iter->next()) { + inbuf[i] = iter->value(); + i++; + } + // zlib struct + z_stream infstream; + infstream.zalloc = Z_NULL; + infstream.zfree = Z_NULL; + infstream.opaque = Z_NULL; + // setup "b" as the input and "c" as the compressed output + infstream.avail_in = inbufSize; // size of input + infstream.next_in = (Bytef *)inbuf; // input char array + infstream.avail_out = outbufSize; // size of output + infstream.next_out = (Bytef *)outbuf; // output char array + + // the actual DE-compression work. + inflateInit2(&infstream, 16+MAX_WBITS); //decompresses gzipped data instead of zlib compressed data. + inflate(&infstream, Z_NO_FLUSH); + inflateEnd(&infstream); + + delete [] inbuf; + + return infstream.total_out; + +} + diff --git a/src/utils/general/CompressionTools.h b/src/utils/general/CompressionTools.h new file mode 100644 index 0000000000000000000000000000000000000000..8b41336a7c3607a678fb694b9a4b5c48f4fb7115 --- /dev/null +++ b/src/utils/general/CompressionTools.h @@ -0,0 +1,25 @@ +/* + * CompressionTools.h + * + * Created on: Apr 10, 2013 + * Author: nek3d + */ + +#ifndef COMPRESSIONTOOLS_H_ +#define COMPRESSIONTOOLS_H_ + +using namespace std; + +#include <zlib.h> +#include "BTlist.h" + +//pass an array of gzipped data, and an empty, PRE-ALLOCATED buffer to store the unzipped results. +//return value is the length of the uncompressed data. +unsigned long inflateGzippedArray(const BTlist<int> &inbuf, //the input. Gzipped data. + unsigned char* outbuf, //user must allocate enough space for uncompressed output! + size_t outbufSize, //capacity of output buffer + size_t inbufSize =0); //save time on call to strlen by providing size of inbuf. + + + +#endif /* COMPRESSIONTOOLS_H_ */ diff --git a/src/utils/general/DualQueue.h b/src/utils/general/DualQueue.h new file mode 100644 index 0000000000000000000000000000000000000000..ccc8b7b0e7dd5a78561851fc519e82912100c335 --- /dev/null +++ b/src/utils/general/DualQueue.h @@ -0,0 +1,123 @@ +/* + * DualQueue.h + * + * Created on: Jan 29, 2013 + * Author: nek3d + */ + +#ifndef DUALQUEUE_H_ +#define DUALQUEUE_H_ + +using namespace std; + +#include <vector> +#include <queue> +#include <cstdio> +#include <cstdlib> + +template <class T> class DualQueueAscending { +public: + bool operator() ( const T &item1, const T &item2) const { + + printf("\n\nIn comparison method:\n item1=\n"); +// item1->print(); + printf("\nitem2=\n"); +// item2->print(); + printf("\n"); + + if( *(item1) < *(item2) ) { + printf("Item1 less than item2. Returning false.\n"); + return false; + } + printf("Item1 not less than item2. Returning true.\n"); + return true; + } +}; + +template <class T> class DualQueueDescending { +public: + bool operator() ( const T &item1, const T &item2) const { + if( *(item2) < *(item1) ) { + return false; + } + return true; + } +}; + + +template <class T, template<class T> class CompareFunc> class DualQueue { +public: + DualQueue() {} + ~DualQueue() {} + + const T & top() const { + if (empty()) { + fprintf(stderr, "ERROR. Tried to top from empty dualQueue.\n"); + exit(1); + } + if (emptyForward()) { + return topReverse(); + } + if (emptyReverse()) { + return topForward(); + } + + return (topFowardHigherPriorityThanTopReverse() ? topForward() : topReverse()); + } + void pop() { + if (empty()) { + fprintf(stderr, "ERROR. Tried to pop from empty dualQueue.\n"); + exit(1); + } + if (emptyForward()) { + popReverse(); + return; + } + if (emptyReverse()) { + popForward(); + return; + } + topFowardHigherPriorityThanTopReverse() ? popForward() : popReverse(); + } + void push(const T &item) { item->getStrand() ? pushForward(item) : pushReverse(item); } + size_t size() const { return sizeForward() + sizeReverse(); } + bool empty() const { return _forwardQueue.empty() && _reverseQueue.empty(); } + + const T & topForward() const { return _forwardQueue.top(); } + void popForward() { _forwardQueue.pop(); } + void pushForward(const T &item) { _forwardQueue.push(item); } + size_t sizeForward() const { return _forwardQueue.size(); } + bool emptyForward() const { return _forwardQueue.empty(); } + + + const T & topReverse() const { return _reverseQueue.top(); } + void popReverse() { _reverseQueue.pop(); } + void pushReverse(const T &item) { _reverseQueue.push(item); } + size_t sizeReverse() const { return _reverseQueue.size(); } + bool emptyReverse() const { return _reverseQueue.empty(); } + + +private: + typedef priority_queue<T, vector<T>, CompareFunc<T> > queueType; + queueType _forwardQueue; + queueType _reverseQueue; + + bool topFowardHigherPriorityThanTopReverse() const { + printf("\n\nIn priority method:\n TopForward=\n"); +// topForward()->print(); + printf("\nTopReverse=\n"); +// topReverse()->print(); + printf("\n"); + if (CompareFunc<T>()(topForward(), topReverse())) { + printf("Forward higher priority than reverse.\n"); + return true; + } else { + printf("Reverse higher priority than forward.\n"); + return false; + } + } + +}; + + +#endif /* DUALQUEUE_H_ */ diff --git a/src/utils/general/ErrorMsg.h b/src/utils/general/ErrorMsg.h new file mode 100644 index 0000000000000000000000000000000000000000..e595402df4e16ba81416700687c6fbf1e841ae9c --- /dev/null +++ b/src/utils/general/ErrorMsg.h @@ -0,0 +1,14 @@ +/* + * ErrorMsg.h + * + * Created on: Feb 13, 2013 + * Author: nek3d + */ + +#ifndef ERRORMSG_H_ +#define ERRORMSG_H_ + + + + +#endif /* ERRORMSG_H_ */ diff --git a/src/utils/general/FreeList.h b/src/utils/general/FreeList.h new file mode 100644 index 0000000000000000000000000000000000000000..4aaa6f551ed96964a8034ea74954e4296de05c20 --- /dev/null +++ b/src/utils/general/FreeList.h @@ -0,0 +1,83 @@ +/* + * FreeList.h + * + * Created on: Nov 30, 2012 + * Author: nek3d + */ + +#ifndef FREELIST_H_ +#define FREELIST_H_ + +using namespace std; + +#include <cstddef> //defines NULL +#include <deque> +#include <vector> + +template <class T> +class FreeList { +public: + FreeList(int blockSize=512) + : _nextPos(0), + _blockSize(blockSize) + { + _freeList.reserve(_blockSize); + } + + ~FreeList() { + for (int i=0; i < (int)_buffer.size(); i++) { + delete _buffer[i]; + } + } + + void clear() { + _nextPos = 0; + _freeList.clear(); + } + + T *newObj() { + T *ptr = NULL; + + if (_freeList.size() > 0) { + ptr = _freeList.back(); + _freeList.pop_back(); + } else { + if (_nextPos == (int)_buffer.size()) { + growBuffer(); + } + ptr = _buffer[_nextPos]; + _nextPos ++; + } + + return ptr; + } + + void deleteObj(const T *ptr) { + if (ptr != NULL) { + T *volatilePtr = const_cast<T *>(ptr); //remove const-ness. + volatilePtr->clear(); + _freeList.push_back(volatilePtr); + } + } + + int capacity() const { return _buffer.size(); } + +private: + deque<T *> _buffer; + vector<T *> _freeList; + int _nextPos; + int _blockSize; + void growBuffer() { + for (int i=0; i < _blockSize; ++i) { + _buffer.push_back(new T()); + } + } + + //this number determines how many times larger than the blockSize the freeList + //is allowed to grow. If speed performance is poor, we may need to increase this number. + //If memory performance is poor, decrease it. + static const int MAX_MULTIPLE_FREE_LIST_SIZE = 10; +}; + + +#endif /* FREELIST_H_ */ diff --git a/src/utils/general/InflateStreamBuf.h b/src/utils/general/InflateStreamBuf.h new file mode 100644 index 0000000000000000000000000000000000000000..7c37339d252a58c1be3c3192362f78a19addc32b --- /dev/null +++ b/src/utils/general/InflateStreamBuf.h @@ -0,0 +1,130 @@ +/* + * InflateStreamBuf.h + * + * Created on: Apr 11, 2013 + * Author: nek3d + */ + +#ifndef INFLATESTREAMBUF_H_ +#define INFLATESTREAMBUF_H_ + +using namespace std; + +#include <iostream> +#include <cstdio> +#include <cerrno> +#include <cstring> +#include <zlib.h> +#include <cstdlib> + +#define GZBUFSIZ BUFSIZ + +class InflateStreamBuf:public std::streambuf +{ +public: + InflateStreamBuf(std::istream* in):in(in),status_flag(0) { + if(in==NULL) throw ("Null pointer"); + std::memset((void*)&strm,0,sizeof(z_stream)); + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + strm.avail_in = 0; + strm.next_in = Z_NULL; + + buffer=(char*)std::malloc(GZBUFSIZ*sizeof(char)); + if(buffer==NULL) { + throw ("out of memory"); + } + if ( inflateInit2(&strm, 16+MAX_WBITS) != Z_OK) { + throw ("::inflateInit failed"); + } + setg(&buffer[0], &buffer[GZBUFSIZ], &buffer[GZBUFSIZ]); + } + + virtual void close() { + if(in!=NULL) { + in=NULL; + (void)inflateEnd(&strm); + free(buffer); + buffer=NULL; + } + in=NULL; + } + + virtual ~InflateStreamBuf() { + close(); + } + + virtual int underflow () { + + if(in==NULL) return EOF; + if(status_flag == Z_STREAM_END) { + close(); + return EOF; + } + in->read((char*)buffin, GZBUFSIZ); + + strm.avail_in = in->gcount(); + + if(strm.avail_in == 0) { + close(); + return EOF; + } + strm.next_in=buffin; + /* run inflate() on input until output buffer not full */ + unsigned int total=0; + _currentOutBufSize = 0; + do { + strm.avail_out = GZBUFSIZ; + strm.next_out = buffout; + status_flag = ::inflate(&strm, Z_NO_FLUSH); + + switch (status_flag) { + case 0:break; + case Z_STREAM_END:break; + case Z_NEED_DICT: + status_flag=Z_DATA_ERROR; + case Z_DATA_ERROR: + case Z_MEM_ERROR: + + close(); + throw status_flag; + break; + default: + { + close(); + throw status_flag; + break; + } + } + + unsigned int have = GZBUFSIZ - strm.avail_out; + buffer=(char*)std::realloc(buffer,sizeof(char)*(total+have)); + if(buffer==NULL) { + throw ("out of memory"); + } + + memcpy((void*)&buffer[total], &buffout[0], sizeof(char)*have); + total+=have; + } while (strm.avail_out == 0 && strm.avail_in > 0); + + + setg(buffer, buffer, &buffer[total]); + _currentOutBufSize = total; + + return total==0?EOF:this->buffer[0]; + } + +protected: + std::istream* in; + unsigned char buffin[GZBUFSIZ]; + unsigned char buffout[GZBUFSIZ]; + char* buffer; + z_stream strm; + int status_flag; + int _currentOutBufSize; +}; + + + +#endif /* INFLATESTREAMBUF_H_ */ diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..43dcfba076d000829c4f2842e16136f24e0ede32 --- /dev/null +++ b/src/utils/general/Makefile @@ -0,0 +1,28 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp +OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o + +.PHONY: clean diff --git a/src/utils/general/ParseTools.cpp b/src/utils/general/ParseTools.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad5764b416e8a76566c3b7210a0f97fb66b55325 --- /dev/null +++ b/src/utils/general/ParseTools.cpp @@ -0,0 +1,143 @@ +#include "ParseTools.h" +#include <climits> +#include <cctype> +#include <cstring> + +//This functions recognizes only numbers with digits, plus sign, minus sign, decimal point, e, or E. Hexadecimal and pointers not currently supported. +bool isNumeric(const QuickString &str) { + for (int i=0; i < (int)str.size(); i++) { + char currChar = str[i]; + if (!(isdigit(currChar) || currChar == '-' || currChar == '.' || currChar == '+' || currChar == 'e' || currChar == 'E')) { + return false; + } + } + return true; +} + +int str2chrPos(const QuickString &str) { + return str2chrPos(str.c_str(), str.size()); +} + +int str2chrPos(const char *str, size_t ulen) { + int len=(int)ulen; + if (len < 1 || len > 10) { + return INT_MIN; //can't do more than 9 digits and a minus sign + } + + register int sum=0; + int startPos =0; + bool isNegative = false; + + //check for negative numbers + if (str[0] == '-') { + isNegative = true; + startPos = 1; + } + + for (int i=startPos; i < len; i++) { + char currChar = str[i]; + if (!isdigit(currChar)) { + return INT_MIN; + } + int dig = currChar - 48; //ascii code for zero. + int power = len -i -1; + + switch (power) { + case 0: + sum += dig; + break; + case 1: + sum += dig * 10; + break; + case 2: + sum += dig *100; + break; + case 3: + sum += dig *1000; + break; + case 4: + sum += dig *10000; + break; + case 5: + sum += dig *100000; + break; + case 6: + sum += dig *1000000; + break; + case 7: + sum += dig *10000000; + break; + case 8: + sum += dig *100000000; + break; + case 9: + sum += dig *1000000000; + break; + default: + return INT_MIN; + break; + } + } + return isNegative ? 0 - sum : sum; +} + +string vectorIntToStr(const vector<int> &vec) { + string str; + str.reserve(vec.size()); + for (int i=0; i < (int)vec.size(); i++) { + str += (char)(vec[i]); + } + return str; +} + +// TBD: Could be better optimized. I'm allocating 8KB for every call, then destroying it. +// That memory needs to stay in scope. +// Also, is this handling subsequent delimiters? +int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter, int numExpectedItems) { + + elems.reserve(numExpectedItems); + int strLen = (int)str.size(); + + int startPos = 0; + int currPos = 0; + + char elemBuf[8192]; + + while (startPos < strLen) { + memset(elemBuf, 0, 8192); + while (str[currPos] != delimiter && currPos < strLen) { + currPos++; + } + if (currPos > startPos) { + memcpy(elemBuf, str.c_str() + startPos, min(currPos, strLen) - startPos); + elems.push_back(elemBuf); + } + startPos = currPos +1; + currPos = startPos; + } + return (int)elems.size(); +} + +bool isHeaderLine(QuickString &line) { + if (line[0] == '>') { + return true; + } + if (line[0] == '!') { + return true; + } + if (line[0] == '#') { + return true; + } + //GFF file headers can also start with the words "browser" or "track", followed by a whitespace character. + if (memcmp(line.c_str(), "browser", 7) == 0 && isspace(line[7])) { + return true; + } + if (memcmp(line.c_str(), "track", 5) == 0 && isspace(line[5])) { + return true; + } + if (memcmp(line.c_str(), "visibility", 10) == 0) { + return true; + } + return false; +} + diff --git a/src/utils/general/ParseTools.h b/src/utils/general/ParseTools.h new file mode 100644 index 0000000000000000000000000000000000000000..2132d0e4c6be8994bbc2a579fbf951f3b4bc323d --- /dev/null +++ b/src/utils/general/ParseTools.h @@ -0,0 +1,97 @@ +/* + * parseTools.h + * + * Created on: Dec 3, 2012 + * Author: nek3d + */ + +#ifndef PARSETOOLS_H_ +#define PARSETOOLS_H_ + +using namespace std; + +#include <cstring> //for memset +#include <string> +#include <vector> +#include "QuickString.h" + +bool isNumeric(const QuickString &str); + +//This method is a faster version of atoi, but is limited to a maximum of +//9 digit numbers in Base 10 only. The string may begin with a negative. +//Empty strings, too long strings, or strings containing anything other than +//digits (with the excpetion of a minus sign in the first position) +//will result in error. Errors return INT_MIN. +int str2chrPos(const char *str, size_t len); +int str2chrPos(const QuickString &str); + + +//int2str is faster but less flexible version of the ToString method in +//lineFileUtilities. Unlike ToString, which uses streams, this method +//can only handle integers. The buffer, which is templated, needs to support the +//assignment operater for char *, meaning it needs a T::operator = (const char *) method. +//strings, QuickStrings, stringbuffers, and the like are acceptable. + +template<class T> +void int2str(int number, T& buffer, bool appendToBuf = false) +{ + + register int useNum = number; + if (useNum == 0) { + if (appendToBuf) { + buffer.append("0"); + } else { + buffer = "0"; + } + return; + } + //check for negative numbers. + bool isNegative = useNum < 0; + if (isNegative) { + useNum = 0 - useNum; //convert to positive. + } + + //figure out how many digits we have + register int power = 10; + int numChars = 2 + (isNegative ? 1: 0); + while (power <= useNum) { + power *= 10; + numChars++; + } + + char tmpBuf[numChars]; + memset(tmpBuf, 0, numChars); + + int bufIdx=0; + if (isNegative) { + tmpBuf[0] = '-'; + bufIdx = 1; + } + register int currDig=0; + + power /= 10; + while (power) { + + currDig = useNum / power; + useNum -= currDig * power; + tmpBuf[bufIdx] = currDig + 48; //48 is ascii for zero. + bufIdx++; + power /= 10; + } + if (!appendToBuf) { + buffer = tmpBuf; + } else { + buffer.append(tmpBuf, numChars-1); + } + +} + +bool isHeaderLine(QuickString &line); + +string vectorIntToStr(const vector<int> &vec); + + +//This is a faster version of tokenize that doesn't use strings. Return value is final size of elems vector. +int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter = '\t', int numExpectedItems = 0); + +#endif /* PARSETOOLS_H_ */ diff --git a/src/utils/general/PushBackStreamBuf.cpp b/src/utils/general/PushBackStreamBuf.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a4c12857bf6b4e287406bb992cc7eaade0554e07 --- /dev/null +++ b/src/utils/general/PushBackStreamBuf.cpp @@ -0,0 +1,51 @@ +/* + * PushBackStream.cpp + * + * Created on: Mar 18, 2013 + * Author: nek3d + */ +#include <cstdio> + +#include "PushBackStreamBuf.h" +#include <cstring> +#include <iostream> + +PushBackStreamBuf::PushBackStreamBuf(streambuf* primary_stream) +: streambuf(), + _primary_stream(primary_stream) + { + +} + +PushBackStreamBuf::~PushBackStreamBuf() +{ +} + +int PushBackStreamBuf::sbumpc() + { + int c = 0; + if (!_buffer.empty()) { + c = _buffer.front()->value(); + _buffer.pop_front(); + return c; + } + c = _primary_stream->sbumpc(); + return c; +} + +void PushBackStreamBuf::pushBack(const BTlist<int> &newBuf) +{ + _buffer = newBuf; +} + +int PushBackStreamBuf::underflow() +{ + int c = 0; + + if(!_buffer.empty()) { + c=_buffer.front()->value(); + return c; + } + c= _primary_stream->sgetc(); + return c; +} diff --git a/src/utils/general/PushBackStreamBuf.h b/src/utils/general/PushBackStreamBuf.h new file mode 100644 index 0000000000000000000000000000000000000000..b39a1eb1dbdafec0114a53344576548a6f79e620 --- /dev/null +++ b/src/utils/general/PushBackStreamBuf.h @@ -0,0 +1,36 @@ +/* + * PushBackStream.h + * + * Created on: Mar 18, 2013 + * Author: nek3d + */ + +#ifndef PUSHBACKSTREAM_H_ +#define PUSHBACKSTREAM_H_ + +using namespace std; + +#include <iostream> +#include "BTlist.h" + +class PushBackStreamBuf: public std::streambuf { +public: + PushBackStreamBuf(streambuf* primary_stream); + ~PushBackStreamBuf(); + void pushBack(const BTlist<int> &vec); + + int sbumpc(); +protected: + int uflow() { return sbumpc(); } + + int underflow(); + +private: + streambuf* _primary_stream; + BTlist<int> _buffer; + static const int SCAN_BUFFER_SIZE = 4096; //4 KB buffer + static const int MAIN_BUFFER_SIZE = 128 * 1024; //128K buffer +}; + + +#endif /* PUSHBACKSTREAM_H_ */ diff --git a/src/utils/general/QuickString.cpp b/src/utils/general/QuickString.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1229a245547ade6ab08f491d9fc7062c2e226845 --- /dev/null +++ b/src/utils/general/QuickString.cpp @@ -0,0 +1,229 @@ +#include "QuickString.h" +#include <cstring> +#include <cstdlib> +#include <cstdio> +#include "ParseTools.h" + +QuickString::QuickString(size_t capacity) +: _buffer(NULL), + _currCapacity(capacity), + _currSize(0) +{ + build(); +} + +QuickString::QuickString(const QuickString &qs) +: _buffer(NULL), + _currCapacity(qs._currCapacity), + _currSize(0) +{ + build(); + set(qs._buffer, qs._currSize); +} + +QuickString::QuickString(const char *inBuf) +{ + size_t len = strlen(inBuf); + _currCapacity = len +1; + + build(); + set(inBuf, len); +} + +QuickString::QuickString(const string &inString) +{ + size_t len = (int)inString.size(); + _currCapacity = len +1; + + build(); + set(inString.c_str(), len); +} + +QuickString::QuickString(char c) +{ + _currCapacity =2; + + build(); + + char buffer[2]; + buffer[0] = c; + buffer[1] = 0; + + set(buffer, 1); +} + +void QuickString::build() { + _buffer = (char *)malloc(_currCapacity); + clear(); +} + +QuickString::~QuickString(){ + free(_buffer); +} + +void QuickString::clear() { + memset(_buffer, 0, _currCapacity); + _currSize = 0; +} + + +QuickString &QuickString::operator = (const char *inBuf){ + set(inBuf, strlen(inBuf)); + return *this; +} + +QuickString &QuickString::operator = (const string & inBuf){ + set(inBuf.c_str(), (int)inBuf.size()); + return *this; +} + +QuickString &QuickString::operator = (const QuickString & inBuf){ + set(inBuf._buffer, (int)inBuf._currSize); + return *this; +} + + +QuickString &QuickString::operator += (const QuickString & inBuf) +{ + append(inBuf._buffer, (int)inBuf._currSize); + return *this; +} + +QuickString &QuickString::operator +=(const string &inBuf) +{ + append(inBuf.c_str(), (int)inBuf.size()); + return *this; +} + +QuickString &QuickString::operator +=(char c) { + + append(c); + return *this; +} + +QuickString &QuickString::operator += (const char *inBuf) +{ + append(inBuf, strlen(inBuf)); + return *this; +} + +bool QuickString::operator == (const QuickString &qs) const { + if ( _currSize != qs._currSize) { + return false; + } + for (int i= _currSize-1; i > -1; i--) { + if (_buffer[i] != qs._buffer[i]) return false; + } + return true; +} + +bool QuickString::operator == (const string &str) const { + if ( _currSize != str.size()) { + return false; + } + for (int i= _currSize-1; i > -1; i--) { + if (_buffer[i] != str[i]) return false; + } + return true; + +} + +bool QuickString::operator == (const char *str) const { + size_t inLen = strlen(str); + if (inLen != _currSize) { + return false; + } + for (int i= _currSize-1; i > -1; i--) { + if (_buffer[i] != str[i]) return false; + } + return true; +} + + +bool QuickString::operator != (const QuickString &qs) const { + return !(*this == qs); +} + +bool QuickString::operator < (const QuickString &qs) const { + return (memcmp(_buffer, qs._buffer, max(_currSize, qs._currSize)) < 0); +} + +bool QuickString::operator > (const QuickString &qs) const { + return (memcmp(_buffer, qs._buffer, max(_currSize, qs._currSize))> 0); +} + +void QuickString::set(const char *inBuf, size_t newLen) { + reserve(newLen); + clear(); + memcpy(_buffer, inBuf, newLen); + _currSize = newLen; +} + +void QuickString::reserve(size_t newLen) { + if (_currCapacity <= newLen) { + while (_currCapacity <= newLen) { + _currCapacity = _currCapacity << 1; + } + _buffer = (char *)realloc(_buffer, _currCapacity ); + if (_buffer == NULL) { + fprintf(stderr, "Error: failed to reallocate string.\n"); + _currSize = 0; + _currCapacity = 0; + exit(1); + } + //initialize newly reserved memory. + memset(_buffer + _currSize, 0, _currCapacity - _currSize); + } +} + +void QuickString::append(char c) +{ + reserve(_currSize +1); + _buffer[_currSize] = c; + _currSize++; +} + +void QuickString::append(const char *inBuf, size_t inBufLen) +{ + reserve(_currSize + inBufLen); + memcpy(_buffer + _currSize, inBuf, inBufLen); + _currSize += inBufLen; +} + +void QuickString::append(int num) { + int2str(num, *this, true); +} +QuickString &QuickString::assign(const char *inBuf, size_t inBufLen) +{ + clear(); + append(inBuf, inBufLen); + return *this; +} + +void QuickString::resize(size_t newSize, char fillChar) +{ + if (newSize > _currSize) { //grow the string, pad with fillChar + reserve(newSize); + memset(_buffer + _currSize, fillChar, newSize -_currSize); + } else if (newSize < _currSize) { //cut off characters from the end + memset(_buffer + newSize, 0, _currSize - newSize); + } + _currSize = newSize; +} + + +void QuickString::substr (QuickString &newStr, size_t pos, size_t len) const +{ + if (pos >= _currSize) { + return; + } + if (pos + len >= _currSize) { + len = _currSize - pos; + } + newStr.set(_buffer + pos, len); +} + +ostream &operator << (ostream &out, const QuickString &str) { + out << str._buffer; + return out; +} diff --git a/src/utils/general/QuickString.h b/src/utils/general/QuickString.h new file mode 100644 index 0000000000000000000000000000000000000000..87ee03f4cb632966a9ef1354117182ba37ace739 --- /dev/null +++ b/src/utils/general/QuickString.h @@ -0,0 +1,71 @@ +/* + * QuickString.h + * + * Created on: Dec 3, 2012 + * Author: nek3d + */ + +#ifndef QUICKSTRING_H_ +#define QUICKSTRING_H_ + +using namespace std; +#include <string> +#include <climits> +#include <ostream> + +class QuickString { +public: + QuickString(size_t capacity = DEFAULT_CAPACITY); + QuickString(const QuickString &); //need explicit copy constructor + //so address of _buffer member is not copied! Each QuickString object + //needs to build it's own buffer. + QuickString(const char *); + QuickString(const string &); + QuickString(char c); + ~QuickString(); + size_t size() const { return _currSize; } + size_t capacity() const { return _currCapacity; } + void reserve(size_t newCapacity=0); + bool empty() const { return _currSize == 0; } + + void clear(); //only clears buffer, doesn't delete it. + QuickString &operator = (const string &); + QuickString &operator = (const char *); + QuickString &operator = (const QuickString &); + QuickString &operator += (const QuickString &); + QuickString &operator += (const string &); + QuickString &operator += (const char *); + QuickString &operator += (char); + + friend ostream &operator << (ostream &out, const QuickString &str); + bool operator == (const QuickString &) const; + bool operator == (const string &) const; + bool operator == (const char *) const; + bool operator != (const QuickString &) const; + bool operator < (const QuickString &) const; + bool operator > (const QuickString &) const; + const char *c_str() const { return _buffer; } + const char &operator [] (int pos) const { return _buffer[pos]; } + char &operator [] (int pos) { return _buffer[pos]; } + + void append(const QuickString &str) { append(str.c_str(), str.size()); } + void append(const char *buf, size_t bufLen); + void append(char c); + void append(int num); + + QuickString &assign(const char *str, size_t n); + void resize(size_t n, char c = '\0'); + void substr(QuickString &newStr, size_t pos = 0, size_t len = UINT_MAX) const; + +private: + char *_buffer; + size_t _currCapacity; + size_t _currSize; + + static const int DEFAULT_CAPACITY = 256; + void build(); + void set(const char *len, size_t size); +}; + + +#endif /* QUICKSTRING_H_ */