diff --git a/src/mapFile/mapMain.cpp b/src/mapFile/mapMain.cpp index 235c1fb72a31362ea755a56fa467492c58a608d1..49d1b270431d1be7800b669159b0a3d32dc2421f 100644 --- a/src/mapFile/mapMain.cpp +++ b/src/mapFile/mapMain.cpp @@ -48,29 +48,8 @@ void map_help(void) { cerr << "Options: " << endl; - cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl; - cerr << "\t\tDefault: 5." << endl; - cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl; - - cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl; - cerr << "\t\tValid operations:" << endl; - cerr << "\t\t sum, min, max, absmin, absmax," << endl; - cerr << "\t\t mean, median," << endl; - cerr << "\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl; - cerr << "\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl; - cerr << "\t\t count" << endl; - cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl; - cerr << "\t\tDefault: sum" << endl; - cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl; - - cerr << "\t\tIf there is only column, but multiple operations, all operations will be" << endl; - cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl; - cerr << "multiple columns, that operation will be applied to all columns." << endl; - cerr << "\t\tOtherwise, the number of columns must match the the number of operations," << endl; - cerr << "and will be applied in respective order." << endl; - cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl; - cerr << "the mean of column 4, and the count of column 6." << endl; - cerr << "\t\tThe order of output columns will match the ordering given in the command." 
<< endl << endl<<endl; + KeyListOpsHelp(); + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; diff --git a/src/mergeFile/mergeMain.cpp b/src/mergeFile/mergeMain.cpp index 3f443e27ba21b39b9ed28a4f91f3121017abf1ff..0318725a09ec8af9c71446eb31f3e12a93f3c5db 100644 --- a/src/mergeFile/mergeMain.cpp +++ b/src/mergeFile/mergeMain.cpp @@ -56,29 +56,13 @@ void merge_help(void) { cerr << "\t\tthat are the same strand." << endl; cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; - cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; - cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; - cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; cerr << "\t\tto be merged." << endl; cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; cerr << "\t\t- (INTEGER)" << endl << endl; - cerr << "\t-nms\t" << "Report the names of the merged features separated by commas." << endl; - cerr << "\t\tChange delim. with -delim." << endl << endl; - - cerr << "\t-scores\t" << "Report the scores of the merged features. Specify one of " << endl; - cerr << "\t\tthe following options for reporting scores:" << endl; - cerr << "\t\t sum, min, max," << endl; - cerr << "\t\t mean, median, mode, antimode," << endl; - cerr << "\t\t collapse (i.e., print a semicolon-separated list)," << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-delim\t" << "Specify a custom delimiter for the -nms and -scores concat options" << endl; - cerr << "\t\t- Example: -delim \"|\"" << endl; - cerr << "\t\t- Default: \",\"." 
<< endl << endl; - + KeyListOpsHelp(); cerr << "Notes: " << endl; cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 55d5b6ad3b25965e26b6872cedc6fc7b637fbe69..f05c5b186ab71b1b3c4821650da715c0bb518d15 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -52,6 +52,7 @@ ContextBase::ContextBase() _forwardOnly(false), _reverseOnly(false), _hasColumnOpsMethods(false), + _keyListOps(NULL), _desiredStrand(FileRecordMergeMgr::ANY_STRAND), _maxDistance(0), _useMergedIntervals(false) @@ -459,10 +460,11 @@ bool ContextBase::handle_delim() void ContextBase::setColumnOpsMethods(bool val) { - _hasColumnOpsMethods = val; - if (val) { + if (val && !_hasColumnOpsMethods) { + //was off, but we're turning it on. _keyListOps = new KeyListOps(); } + _hasColumnOpsMethods = val; } const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { diff --git a/src/utils/Contexts/ContextMerge.cpp b/src/utils/Contexts/ContextMerge.cpp index d17d3bdb6823041c497d5f596a758756e7463a1b..917072d6897a344c54b4faea1a0f4887a27caddf 100644 --- a/src/utils/Contexts/ContextMerge.cpp +++ b/src/utils/Contexts/ContextMerge.cpp @@ -103,7 +103,7 @@ bool ContextMerge::isValidState() } //column operations not allowed with BAM input - if ((!_keyListOps->getColumns().empty() || !_keyListOps->getOperations().empty()) && + if (hasColumnOpsMethods() && getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { - _errorMsg = "\n***** ERROR: stranded merge not supported for VCF files.
*****"; + _errorMsg = "\n***** ERROR: column operations not supported for BAM input. *****"; return false; diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp index 38074edf3513817e7413cf4426269fe618d84b7d..4beaccf2ace7a6d16c8a476ba73c09ad9f0ee9ae 100644 --- a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp @@ -104,7 +104,11 @@ void SingleLineDelimTextFileReader::appendField(int fieldNum, QuickString &str) bool SingleLineDelimTextFileReader::detectAndHandleHeader() { - if (!isHeaderLine(_sLine)) { + //not sure why the linker is giving me a hard time about + //passing a non-const QuickString to isHeaderLine, but + //this const ref is a workaround. + const QuickString &sLine2 = _sLine; + if (!isHeaderLine(sLine2)) { return false; } if (!_fullHeaderFound) { diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp index 21b060df73b71af677bde159f1a980c73b48bbf8..d63687130be1ba5e2ea62fe5f0d9187be8571104 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.cpp +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -60,11 +60,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) return; } - vector<QuickString> sizes; - vector<QuickString> starts; - - int sizeCount = Tokenize(keyRecord->getBlockSizes(), sizes, ',', blockCount); - int startCount = Tokenize(keyRecord->getBlockStarts(), starts, ',', blockCount); + int sizeCount = _blockSizeTokens.tokenize(keyRecord->getBlockSizes(), ','); + int startCount = _blockStartTokens.tokenize(keyRecord->getBlockStarts(), ','); if (blockCount != sizeCount || sizeCount != startCount) { fprintf(stderr, "Error: found wrong block counts while splitting entry.\n"); @@ -72,8 +69,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) for (int i=0; i < blockCount; i++) { - int startPos = keyRecord->getStartPos() +
str2chrPos(starts[i].c_str()); - int endPos = startPos + str2chrPos(sizes[i].c_str()); + int startPos = keyRecord->getStartPos() + str2chrPos(_blockStartTokens.getElem(i).c_str()); + int endPos = startPos + str2chrPos(_blockSizeTokens.getElem(i).c_str()); const Record *record = allocateAndAssignRecord(keyRecord, startPos, endPos); keyList.push_back(record); diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h index c83b1e0125284bb5743c95ff5d3cc91708a06d04..bf6f116f5a0f060a912aec16d4fb6910102f9ea8 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.h +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -16,6 +16,7 @@ using namespace std; #include "FileRecordTypeChecker.h" #include "RecordKeyList.h" + class RecordMgr; class BlockMgr { @@ -50,6 +51,8 @@ private: float _overlapFraction; bool _hasReciprocal; + Tokenizer _blockSizeTokens; + Tokenizer _blockStartTokens; // For now, all records will be split into Bed6 records. const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp index d100759707837bd0b1426437a42477b2778098ff..84e44cbed97dcdbd7c0fdb760dc02edca634bad8 100644 --- a/src/utils/GenomeFile/NewGenomeFile.cpp +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -11,6 +11,7 @@ ******************************************************************************/ #include "NewGenomeFile.h" #include "ParseTools.h" +#include "Tokenizer.h" NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) : _maxId(-1) @@ -44,21 +45,20 @@ void NewGenomeFile::loadGenomeFileIntoMap() { exit(1); } string sLine; - vector<QuickString> fields; + Tokenizer fieldTokens; CHRPOS chrSize = 0; QuickString chrName; while (!genFile.eof()) { sLine.clear(); - fields.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); - Tokenize(sLine.c_str(), fields); - if (fields.size() != 2) { + int 
numFields = fieldTokens.tokenize(sLine.c_str()); + if (numFields != 2) { continue; } - chrName = fields[0]; - chrSize = str2chrPos(fields[1]); + chrName = fieldTokens.getElem(0); + chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp index 657635000e5313c200567d25ea36a29623bd4056..a3b25131aaff6845bcbda3496a359d232f572de1 100644 --- a/src/utils/KeyListOps/KeyListOps.cpp +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -98,25 +98,28 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { //member of each pair is a column number, and the second member is the code for the //operation to perform on that column. - vector<QuickString> columnsVec; - vector<QuickString> opsVec; - int numCols = Tokenize(_columns, columnsVec, ','); - int numOps = Tokenize(_operations, opsVec, ','); + Tokenizer colTokens; + Tokenizer opsTokens; + + int numCols = colTokens.tokenize(_columns, ','); + int numOps = opsTokens.tokenize(_operations, ','); if (numOps < 1 || numCols < 1) { cerr << endl << "*****" << endl << "***** ERROR: There must be at least one column and at least one operation named." << endl; return false; } - if (numOps > 1 && numCols != numOps) { + if (numOps > 1 && numCols > 1 && numCols != numOps) { cerr << endl << "*****" << endl << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl; cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl; + cerr << "\ta single column to which all operations will be applied," << endl; cerr << "\tor an operation for each column." 
<< endl; return false; } - for (int i=0; i < (int)columnsVec.size(); i++) { - int col = str2chrPos(columnsVec[i]); + int loop = max(numCols, numOps); + for (int i=0; i < loop; i++) { + int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0)); //check that the column number is valid if (col < 1 || col > dbFile->getNumFields()) { @@ -124,7 +127,7 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl; return false; } - const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0]; + const QuickString &operation = opsTokens.getElem(numOps > 1 ? i : 0); OP_TYPES opCode = getOpCode(operation); if (opCode == INVALID) { cerr << endl << "*****" << endl @@ -361,4 +364,33 @@ const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) return _outVals; } +void KeyListOpsHelp() { + cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl; + cerr << "\t\tDefault: 5." << endl; + cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl; + + cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl; + cerr << "\t\tValid operations:" << endl; + cerr << "\t\t sum, min, max, absmin, absmax," << endl; + cerr << "\t\t mean, median," << endl; + cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl; + cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl; + cerr << "\t\t count" << endl; + cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl; + cerr << "\t\tDefault: sum" << endl; + cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl; + + cerr << "\t\tIf there is only one column, but multiple operations, all operations will be" << endl; + cerr << "\t\tapplied on that column.
Likewise, if there is only one operation, but" << endl; + cerr << "multiple columns, that operation will be applied to all columns." << endl; + cerr << "\t\tOtherwise, the number of columns must match the number of operations," << endl; + cerr << "and will be applied in respective order." << endl; + cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl; + cerr << "the mean of column 4, and the count of column 6." << endl; + cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl; + + cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl; + cerr << "\t\t- Example: -delim \"|\"" << endl; + cerr << "\t\t- Default: \",\"." << endl << endl; +} diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h index 5046ec1883ae92e00183dc232da8e67ca8cc164b..5c5ea63c6780d0012b03bc510c7e11188898be55 100644 --- a/src/utils/KeyListOps/KeyListOps.h +++ b/src/utils/KeyListOps/KeyListOps.h @@ -12,6 +12,9 @@ class FileRecordMgr; +//print help message +void KeyListOpsHelp(); + class KeyListOps { public: diff --git a/src/utils/fileType/FileRecordTypeChecker.cpp b/src/utils/fileType/FileRecordTypeChecker.cpp index fb81f09c53e489d62c83a88fb421527532fc41fd..5b3eb97e7b36cd73ff54315e49c66d2ea6d2f138 100644 --- a/src/utils/fileType/FileRecordTypeChecker.cpp +++ b/src/utils/fileType/FileRecordTypeChecker.cpp @@ -13,7 +13,6 @@ FileRecordTypeChecker::FileRecordTypeChecker() _isBed = false; _isDelimited = false; _delimChar = '\t'; //tab by default - _lines.clear(); _firstValidDataLineIdx = -1; _isVCF = false; _isBAM = false; @@ -161,9 +160,14 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) _fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE; //Tokenize the first line of valid data into fields.
- const QuickString &line = _lines[_firstValidDataLineIdx]; - _currLineElems.clear(); - if (Tokenize(line, _currLineElems, _delimChar, _numFields) != _numFields) { + //Need to make a copy so next call to tokenizer doesn't overwrite the line. + + QuickString line(_tokenizer.getElem(_firstValidDataLineIdx)); + + _tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW); + _tokenizer.setNumExpectedItems(_numFields); + + if (_tokenizer.tokenize(line, _delimChar) != _numFields) { cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl; exit(1); } @@ -173,7 +177,7 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) if (_numFields == 3) { _recordType = BED3_RECORD_TYPE; } else if (_numFields == 4) { - if (isNumeric(_currLineElems[3])) { + if (isNumeric(_tokenizer.getElem(3))) { _recordType = BEDGRAPH_RECORD_TYPE; _fourthFieldNumeric = true; } else { @@ -223,12 +227,12 @@ bool FileRecordTypeChecker::isBedFormat() { return false; } //the 2nd and 3rd fields must be numeric. - if (!isNumeric(_currLineElems[1]) || !isNumeric(_currLineElems[2])) { + if (!isNumeric(_tokenizer.getElem(1)) || !isNumeric(_tokenizer.getElem(2))) { return false; } - int start = str2chrPos(_currLineElems[1]); - int end = str2chrPos(_currLineElems[2]); + int start = str2chrPos(_tokenizer.getElem(1)); + int end = str2chrPos(_tokenizer.getElem(2)); if (end < start) { return false; } @@ -242,11 +246,11 @@ bool FileRecordTypeChecker::isGFFformat() return false; } //the 4th and 5th fields must be numeric. 
- if (!isNumeric(_currLineElems[3]) || !isNumeric(_currLineElems[4])) { + if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) { return false; } - int start = str2chrPos(_currLineElems[3]); - int end = str2chrPos(_currLineElems[4]); + int start = str2chrPos(_tokenizer.getElem(3)); + int end = str2chrPos(_tokenizer.getElem(4)); if (end < start) { return false; } @@ -256,8 +260,8 @@ bool FileRecordTypeChecker::isGFFformat() bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) { //Break single string buffer into vector of QuickStrings. Delimiter is newline. - _lines.clear(); - int numLines = Tokenize(buffer, _lines, '\n', len); + _tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE); + int numLines = _tokenizer.tokenize(buffer, '\n'); //anticipated delimiter characters are tab, comma, and semi-colon. //If we need new ones, they must be added in this method. @@ -283,7 +287,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) if (validLinesFound >=4) { break; //really only need to look at like 4 lines of data, max. } - QuickString &line = _lines[i]; + const QuickString &line = _tokenizer.getElem(i); int len =line.size(); //skip over any empty line if (len == 0) { diff --git a/src/utils/fileType/FileRecordTypeChecker.h b/src/utils/fileType/FileRecordTypeChecker.h index 4af5a80206ff3d53582fb0526465b4a6dd44413c..15f31ca30c57ab11e83abbea02f05cd910de7bc1 100644 --- a/src/utils/fileType/FileRecordTypeChecker.h +++ b/src/utils/fileType/FileRecordTypeChecker.h @@ -18,6 +18,7 @@ using namespace std; #include <vector> #include <map> #include "PushBackStreamBuf.h" +#include "Tokenizer.h" class FileRecordTypeChecker { public: @@ -87,8 +88,8 @@ private: RECORD_TYPE _recordType; QuickString _filename; //useful for reporting errors with file. 
- vector<QuickString> _lines; - vector<QuickString> _currLineElems; + Tokenizer _tokenizer; + int _firstValidDataLineIdx; int _numBytesInBuffer; //this will hold the length of the buffer after the scan. diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile index 0361fab41acde1e99e8268075d35b533dc3bfc8a..20d7aeead018248fc8d954d4eb982c66c4009701 100644 --- a/src/utils/general/Makefile +++ b/src/utils/general/Makefile @@ -9,8 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp -OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o +SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp Tokenizer.h Tokenizer.cpp +OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) all: $(BUILT_OBJECTS) @@ -23,6 +23,6 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o + @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o .PHONY: clean diff --git a/src/utils/general/ParseTools.cpp b/src/utils/general/ParseTools.cpp index bef426a4ed7d7b3625f75f62861e874747bc2672..04782a717630a0c0457df8cfa3b80427b0170002 100644 --- a/src/utils/general/ParseTools.cpp +++ b/src/utils/general/ParseTools.cpp @@ -93,35 +93,7 @@ string vectorIntToStr(const vector<int> &vec) { return str; } -// TBD: Could be better optimized. I'm allocating 8KB for every call, then destroying it. -// That memory needs to stay in scope.
-int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter, int numExpectedItems) { - - elems.reserve(numExpectedItems); - int strLen = (int)str.size(); - - int startPos = 0; - int currPos = 0; - - char elemBuf[8192]; - - while (startPos < strLen) { - memset(elemBuf, 0, 8192); - while (str[currPos] != delimiter && currPos < strLen) { - currPos++; - } - if (currPos > startPos) { - memcpy(elemBuf, str.c_str() + startPos, min(currPos, strLen) - startPos); - elems.push_back(elemBuf); - } - startPos = currPos +1; - currPos = startPos; - } - return (int)elems.size(); -} - -bool isHeaderLine(QuickString &line) { +bool isHeaderLine(const QuickString &line) { if (line[0] == '>') { return true; } @@ -143,4 +115,3 @@ bool isHeaderLine(QuickString &line) { } return false; } - diff --git a/src/utils/general/ParseTools.h b/src/utils/general/ParseTools.h index 405d631a88ae067dfd121ebfd0ad890c857c5690..871f53bfea150ad615c2416c53701d4e94600fdc 100644 --- a/src/utils/general/ParseTools.h +++ b/src/utils/general/ParseTools.h @@ -86,7 +86,7 @@ void int2str(int number, T& buffer, bool appendToBuf = false) } -bool isHeaderLine(QuickString &line); +bool isHeaderLine(const QuickString &line); string vectorIntToStr(const vector<int> &vec); diff --git a/src/utils/general/Tokenizer.cpp b/src/utils/general/Tokenizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b88e013409421074790395e2fc0dbf88b92d9c10 --- /dev/null +++ b/src/utils/general/Tokenizer.cpp @@ -0,0 +1,93 @@ +/* + * Tokenizer.cpp + * + * Created on: Apr 15, 2014 + * Author: nek3d + */ +#include "Tokenizer.h" +#include <cstring> + +Tokenizer::Tokenizer() +: _numExpectedElems(0), + _keepFinalIncElem(USE_NOW), + _numValidElems(0) { + + _elems.resize(INITIAL_NUM_ELEMS, NULL); + for (int i = 0; i < INITIAL_NUM_ELEMS; i++) { + _elems[i] = new QuickString(); + } +} + +Tokenizer::~Tokenizer() { + resize(0); //easy way to delete elems without repeating code. 
+} + +void Tokenizer::setNumExpectedItems(int newSize) { + _numExpectedElems = newSize; + resize(newSize); +} + +int Tokenizer::tokenize(const QuickString &str, char delimiter) { + + int strLen = (int)str.size(); + + int startPos = 0; + int currPos = 0; + + int currIdx = 0; + + while (startPos < strLen) { + while (currPos < strLen && str[currPos] != delimiter) { + currPos++; + } + if (currPos > startPos) { + if (currPos == strLen && _keepFinalIncElem != USE_NOW) { + //we found an incomplete final element. + // if we're ignoring incomplete elems, do nothing with it. + currIdx--; //make sure it's not included in the final count of valid elems. + + } else { + QuickString *newStr = fetchElem(currIdx); + newStr->assign(str.c_str() + startPos, min(currPos, strLen) - startPos); + } + } + startPos = currPos +1; + currPos = startPos; + currIdx++; + } + _numValidElems = currIdx; + return currIdx; +} + +void Tokenizer::setKeepFinalIncompleteElem(lastElemCode code) { + _keepFinalIncElem = code; +} + +QuickString *Tokenizer::fetchElem(int idx) +{ + if (idx >= (int)_elems.size()) { + resize(idx +1); + } + return _elems[idx]; +} + + +void Tokenizer::resize(int newSize) { + int oldSize = (int)_elems.size(); + + if (newSize > oldSize) { //need to add items. + _elems.resize(newSize); + for (int i=oldSize; i < newSize; i++) { + _elems[i] = new QuickString(); + } + } else if (oldSize > newSize) { + //need to remove items. + for (int i = oldSize - 1; i >= newSize; i--) { + delete _elems[i]; + _elems[i] = NULL; + } + _elems.resize(newSize); + } + //if oldSize is the same as newSize, do nothing.
+} + diff --git a/src/utils/general/Tokenizer.h b/src/utils/general/Tokenizer.h new file mode 100644 index 0000000000000000000000000000000000000000..df49f11408b5d7d14d08cdaff7a6b9c2237f2d05 --- /dev/null +++ b/src/utils/general/Tokenizer.h @@ -0,0 +1,57 @@ +/* + * Tokenizer.h + * + * Created on: Apr 15, 2014 + * Author: nek3d + */ + +#ifndef TOKENIZER_H_ +#define TOKENIZER_H_ + +using namespace std; + +#include "QuickString.h" +#include <vector> + +class Tokenizer { +public: + Tokenizer(); + ~Tokenizer(); + + // If you know the expected number of items, set this. + // If not, don't worry about it. + void setNumExpectedItems(int val); + + int tokenize(const QuickString &str, char delimiter = '\t'); + + // If the final element ends before a delim char, that means + // the buffer passed in ends mid-element. The last, incomplete + // element found can either be: + // 1) Used now. We want it whether it's complete or not. + // 2) Ignored altogether. + typedef enum { USE_NOW, IGNORE } lastElemCode; + void setKeepFinalIncompleteElem(lastElemCode code); + + //final number of valid elems may be less than total number of elems, + //because elems are not necessarily deleted between subsequent calls + //to tokenizer.
+ int getNumValidElems() const { return _numValidElems; } + int getNumTotalElems() const { return (int)_elems.size(); } + const QuickString &getElem(int i) const { return (*(_elems[i])); } + + + +private: + static const int DEFAULT_PARSE_BUFFER_SIZE = 4096; // 8Kb + static const int INITIAL_NUM_ELEMS = 10; + vector<QuickString *> _elems; + int _numExpectedElems; + lastElemCode _keepFinalIncElem; + int _numValidElems; + + QuickString *fetchElem(int idx); + void resize(int newSize); +}; + + +#endif /* TOKENIZER_H_ */ diff --git a/test/map/test-map.sh b/test/map/test-map.sh index 70d550a92fcc929fccc04e09fbeea7fe86ce93cb..0a188bec9d5acf6aa6b36e7b6d7fc648d6f24088 100644 --- a/test/map/test-map.sh +++ b/test/map/test-map.sh @@ -691,10 +691,13 @@ rm obs exp ############################################################ echo " map.t46...\c" echo \ -" -***** -***** ERROR: There are 1 columns given, but there are 2 operations." > exp -$BT map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs +"chr1 0 100 3 30 +chr1 100 200 1 1 +chr2 0 100 0 . +chr2 100 200 0 . +chr3 0 100 3 6 +chr3 100 200 1 4" > exp +$BT map -a ivls.bed -b values.bed -o count,sum > obs check obs exp rm obs exp diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh index 165e1e863e485477eb65ddbceb965596777b0e33..d22caab6b87584ad03c608cc7935657836111ebe 100644 --- a/test/merge/test-merge.sh +++ b/test/merge/test-merge.sh @@ -18,7 +18,6 @@ check() # chr1 45 100 ########################################################### -# Test #1 # Test a basic merge; one interval should be un-merged, # the other two should be merged. ########################################################### @@ -31,71 +30,49 @@ check obs exp rm obs exp ########################################################### -# -# NOTE: Testing for sorted input is now deprecated, as the -# FileRecordMgr is already testing for that. 
-# -########################################################### -# Test #2 -# Enforce coordinate sorted input. -########################################################### -#echo " merge.t2...\c" -#command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'" -#tac a.bed | $BT merge -i - 2> obs -#echo "ERROR: input file: (-) is not sorted by chrom then start. -# The start coordinate at line 3 is less than the start at line 2" > exp -#check obs exp -#rm obs exp +# Test that -n option is shown as deperecated +########################################################### +echo " merge.t2...\c" +echo "***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp +$BT merge -i a.bed -n 2>&1 > /dev/null | head -2 | tail -1 > obs +check obs exp +rm obs exp ########################################################### -# Test #3 # Test the counting of merged intervals. (-n) ########################################################### echo " merge.t3...\c" echo \ "chr1 10 20 1 chr1 30 100 3" > exp -$BT merge -i a.bed -n > obs +$BT merge -i a.bed -c 1 -o count > obs check obs exp rm obs exp ########################################################### -# Test #4 -# Test the listing of names from merged intervals. (-nms) -# a.bed should fail, as there is no name field +# Test that -nms option is deprecated ########################################################### echo " merge.t4...\c" -echo \ -"***** -***** ERROR: Requested column 4, but database file a.bed only has fields 1 - 3." > exp -$BT merge -i a.bed -nms 2>&1 > /dev/null | head -3 | tail -2 > obs +echo "***** ERROR: -nms option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp +$BT merge -i a.bed -nms 2>&1 > /dev/null | head -2 | tail -1 > obs check obs exp rm obs exp - ########################################################### -# Test #5 -# Test the listing of names from merged intervals. 
(-nms) -# a.named.bed should work, as there are name fields -# -# cat a.names.bed -# chr1 10 20 a1 -# chr1 30 40 a2 -# chr1 40 50 a3 -# chr1 45 100 a4 +# Test the listing of names from merged intervals. ########################################################### echo " merge.t5...\c" echo \ "chr1 10 20 a1 chr1 30 100 a2,a3,a4" > exp -$BT merge -i a.names.bed -nms > obs +$BT merge -i a.names.bed -c 4 -o collapse > obs check obs exp rm obs exp ########################################################### -# -nms and -scores sum +# collapsed list of the names, and sum of the scores ########################################################### echo " merge.t6...\c" echo \ @@ -104,12 +81,12 @@ chr1 30 100 a2,a3,a4 9 chr2 10 20 a1 5 chr2 30 40 a2 6 chr2 42 100 a3,a4 15" > exp -$BT merge -i a.full.bed -nms -scores sum> obs +$BT merge -i a.full.bed -c 4,5 -o collapse,sum > obs check obs exp rm obs exp ########################################################### -# -n and -scores sum +# count intervals and sum of scores ########################################################### echo " merge.t7...\c" echo \ @@ -118,12 +95,12 @@ chr1 30 100 3 9 chr2 10 20 1 5 chr2 30 40 1 6 chr2 42 100 2 15" > exp -$BT merge -i a.full.bed -n -scores sum> obs +$BT merge -i a.full.bed -c 5 -o count,sum> obs check obs exp rm obs exp ########################################################### -# -n, -nms, and -scores sum +# count, collapsed names, and sum of scores ########################################################### echo " merge.t8...\c" echo \ @@ -132,12 +109,13 @@ chr1 30 100 a2,a3,a4 9 3 chr2 10 20 a1 5 1 chr2 30 40 a2 6 1 chr2 42 100 a3,a4 15 2" > exp -$BT merge -i a.full.bed -nms -scores sum -n> obs +$BT merge -i a.full.bed -c 4,5,4 -o collapse,sum,count > obs check obs exp rm obs exp ########################################################### -# -s, -n, -nms, and -scores sum +# stranded merge, show sign, collapsed names, sum of +# scores, and count 
########################################################### echo " merge.t9...\c" echo \ @@ -149,24 +127,17 @@ chr2 10 20 + a1 5 1 chr2 30 40 + a2 6 1 chr2 42 50 + a3 7 1 chr2 45 100 - a4 8 1" > exp -$BT merge -i a.full.bed -s -nms -scores sum -n> obs +$BT merge -i a.full.bed -s -c 6,4,5,6 -o distinct,collapse,sum,count > obs check obs exp rm obs exp ########################################################### -# Test #10 # Test the use of a custom delimiter for -nms -# -# cat a.names.bed -# chr1 10 20 a1 -# chr1 30 40 a2 -# chr1 40 50 a3 -# chr1 45 100 a4 ########################################################### echo " merge.t10...\c" echo \ "chr1 10 20 a1 chr1 30 100 a2|a3|a4" > exp -$BT merge -i a.names.bed -nms -delim "|" > obs +$BT merge -i a.names.bed -delim "|" -c 4 -o collapse > obs check obs exp rm obs exp