diff --git a/src/mapFile/mapFile.h b/src/mapFile/mapFile.h index fbb431ac930bf68d7475f03ac8850438dd6d8302..6009bf88081a2bae4bf8e22efbe0227726b805dc 100644 --- a/src/mapFile/mapFile.h +++ b/src/mapFile/mapFile.h @@ -18,7 +18,6 @@ using namespace std; #include <iomanip> #include "VectorOps.h" #include "RecordKeyList.h" -#include "KeyListOps.h" #include "ContextMap.h" using namespace std; diff --git a/src/mapFile/mapMain.cpp b/src/mapFile/mapMain.cpp index 49d1b270431d1be7800b669159b0a3d32dc2421f..933fe0c1312bb47e6e66af72ea44ecce0e0fb098 100644 --- a/src/mapFile/mapMain.cpp +++ b/src/mapFile/mapMain.cpp @@ -48,6 +48,10 @@ void map_help(void) { cerr << "Options: " << endl; + cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl; + cerr << "\t\tDefault: 5." << endl; + cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl; + KeyListOpsHelp(); diff --git a/src/mergeFile/mergeMain.cpp b/src/mergeFile/mergeMain.cpp index cf6804e08211b47ffe86fdd42f0ef5a31c378264..2bee4e32ebb9a34118bf60f23b84dfda2fedb845 100644 --- a/src/mergeFile/mergeMain.cpp +++ b/src/mergeFile/mergeMain.cpp @@ -66,6 +66,9 @@ void merge_help(void) { cerr << "\t\t- Note: negative values enforce the number of b.p. required for overlap." << endl << endl; cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl; + cerr << "\t-c\t" << "Specify columns from the input file to operate upon (see -o option, below)." << endl; + cerr << "\t\tMultiple columns can be specified in a comma-delimited list." 
<< endl << endl; + KeyListOpsHelp(); cerr << "Notes: " << endl; cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index b5bf5959cbfea429ee30c4dcd41ed2f2e4e005e4..fb23f4d5412fedcd3599f3951fd5e24bd76dbfb0 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -9,7 +9,6 @@ #define CONTEXTMAP_H_ #include "ContextIntersect.h" -#include "KeyListOps.h" class ContextMap : public ContextIntersect { public: diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp index 373b8d15a040471a40cb8f42b26ff527c296151f..1e0837105580db9ec1a51465876ff47f0aa4892a 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp @@ -104,6 +104,7 @@ bool BufferedStreamMgr::getLine(QuickString &line) return false; } } + bool retVal = true; while (1) { int searchPos = _mainBufCurrStartPos; while (searchPos < _mainBufCurrLen && _mainBuf[searchPos] != '\n') { @@ -111,15 +112,25 @@ bool BufferedStreamMgr::getLine(QuickString &line) } line.append((char *)_mainBuf + _mainBufCurrStartPos, searchPos - _mainBufCurrStartPos); + _mainBufCurrStartPos = searchPos +1; if (searchPos == _mainBufCurrLen) { //hit end of buffer, but no newline yet if (!readFileChunk()) { //hit eof - return true; + retVal = true; + break; } } else if (_mainBuf[searchPos] == '\n') { - return true; + retVal = true; + break; } } + //strip any whitespace characters, such as DOS newline characters or extra tabs, + //from the end of the line; the lastPos > 0 guard keeps empty or all-whitespace lines in bounds + int lastPos = line.size(); + while (lastPos > 0 && isspace(static_cast<unsigned char>(line[lastPos-1]))) lastPos--; + line.resize(lastPos); + + return retVal; } bool BufferedStreamMgr::readFileChunk() diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp index 
4a8ec61484f74129e5c7b013694c95fac865159a..c71e2425dd3008e6c9952556d9eef1df588c3407 100644 --- a/src/utils/KeyListOps/KeyListOps.cpp +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -118,6 +118,12 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { return false; } int loop = max(numCols, numOps); + + // If there is only one column, all ops are performed on it. + // Otherwise, if there is only op, it is performed on all columns. + // Besides that, ops are performed on columns in their respective + // ordering. + for (int i=0; i < loop; i++) { int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0)); @@ -138,70 +144,70 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { } - //The final step we need to do is check that for each column/operation pair, - //if the operation is numeric, see if the database's record type supports - //numeric operations for that column. For instance, we can allow the mean - //of column 4 for a BedGraph file, because that's numeric, but not for Bed4, - //because that isn't. 
- - for (int i = 0; i < (int)_colOps.size(); i++) { - int col = _colOps[i].first; - OP_TYPES opCode = _colOps[i].second; - FileRecordTypeChecker::RECORD_TYPE recordType = dbFile->getRecordType(); - - if (isNumericOp(opCode)) { - bool isValidNumOp = false; - switch(recordType) { - case FileRecordTypeChecker::BED3_RECORD_TYPE: - isValidNumOp = Bed3Interval::isNumericField(col); - break; - - case FileRecordTypeChecker::BED4_RECORD_TYPE: - isValidNumOp = Bed4Interval::isNumericField(col); - break; - - case FileRecordTypeChecker::BED5_RECORD_TYPE: - isValidNumOp = Bed5Interval::isNumericField(col); - break; - - case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: - isValidNumOp = BedGraphInterval::isNumericField(col); - break; - - case FileRecordTypeChecker::BED6_RECORD_TYPE: - isValidNumOp = Bed6Interval::isNumericField(col); - break; - - case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: - isValidNumOp = BedPlusInterval::isNumericField(col); - break; - - case FileRecordTypeChecker::BED12_RECORD_TYPE: - isValidNumOp = Bed12Interval::isNumericField(col); - break; - - case FileRecordTypeChecker::BAM_RECORD_TYPE: - isValidNumOp = BamRecord::isNumericField(col); - break; - - case FileRecordTypeChecker::VCF_RECORD_TYPE: - isValidNumOp = VcfRecord::isNumericField(col); - break; - - case FileRecordTypeChecker::GFF_RECORD_TYPE: - isValidNumOp = GffRecord::isNumericField(col); - break; - - default: - break; - } - if (!isValidNumOp) { - cerr << endl << "*****" << endl << "***** ERROR: Column " << col << " is not a numeric field for database file " - << dbFile->getFileName() << "." << endl; - return false; - } - } - } +// //The final step we need to do is check that for each column/operation pair, +// //if the operation is numeric, see if the database's record type supports +// //numeric operations for that column. For instance, we can allow the mean +// //of column 4 for a BedGraph file, because that's numeric, but not for Bed4, +// //because that isn't. 
+// +// for (int i = 0; i < (int)_colOps.size(); i++) { +// int col = _colOps[i].first; +// OP_TYPES opCode = _colOps[i].second; +// FileRecordTypeChecker::RECORD_TYPE recordType = dbFile->getRecordType(); +// +// if (isNumericOp(opCode)) { +// bool isValidNumOp = false; +// switch(recordType) { +// case FileRecordTypeChecker::BED3_RECORD_TYPE: +// isValidNumOp = Bed3Interval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BED4_RECORD_TYPE: +// isValidNumOp = Bed4Interval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BED5_RECORD_TYPE: +// isValidNumOp = Bed5Interval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE: +// isValidNumOp = BedGraphInterval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BED6_RECORD_TYPE: +// isValidNumOp = Bed6Interval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE: +// isValidNumOp = BedPlusInterval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BED12_RECORD_TYPE: +// isValidNumOp = Bed12Interval::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::BAM_RECORD_TYPE: +// isValidNumOp = BamRecord::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::VCF_RECORD_TYPE: +// isValidNumOp = VcfRecord::isNumericField(col); +// break; +// +// case FileRecordTypeChecker::GFF_RECORD_TYPE: +// isValidNumOp = GffRecord::isNumericField(col); +// break; +// +// default: +// break; +// } +// if (!isValidNumOp) { +// cerr << endl << "*****" << endl << "***** ERROR: Column " << col << " is not a numeric field for database file " +// << dbFile->getFileName() << "." << endl; +// return false; +// } +// } +// } return true; } @@ -361,13 +367,15 @@ const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) _outVals.append('\t'); } } + if (_methods.nonNumErrFlagSet()) { + //asked for a numeric op on a column in which a non numeric value was found. 
+ cerr << _methods.getErrMsg() << endl; + _methods.resetNonNumErrFlag(); + } return _outVals; } void KeyListOpsHelp() { - cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl; - cerr << "\t\tDefault: 5." << endl; - cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl; cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl; cerr << "\t\tValid operations:" << endl; diff --git a/src/utils/KeyListOps/KeyListOpsMethods.cpp b/src/utils/KeyListOps/KeyListOpsMethods.cpp index 0b0013582ee059f3f564a2788a455b4527ddef46..757739e951ae0b34b03c213fec373ab3bb14a0ef 100644 --- a/src/utils/KeyListOps/KeyListOpsMethods.cpp +++ b/src/utils/KeyListOps/KeyListOpsMethods.cpp @@ -9,13 +9,15 @@ #include <cfloat> #include <cmath> #include <algorithm> +#include "ParseTools.h" //to get the isNumeric function KeyListOpsMethods::KeyListOpsMethods() : _keyList(&_nullKeyList), _column(1), _nullVal("."), _delimStr(","), - _iter(_nullKeyList.begin()) + _iter(_nullKeyList.begin()), + _nonNumErrFlag(false) { } @@ -314,7 +316,17 @@ const QuickString &KeyListOpsMethods::getColVal() { } double KeyListOpsMethods::getColValNum() { - return atof(_iter->value()->getField(_column).c_str()); + const QuickString &strVal = _iter->value()->getField(_column); + if (!isNumeric(strVal)) { + _nonNumErrFlag = true; + _errMsg = " ***** WARNING: Non numeric value "; + _errMsg.append(strVal); + _errMsg.append(" in "); + _errMsg.append(_column); + _errMsg.append("."); + return NAN; + } + return atof(strVal.c_str()); } void KeyListOpsMethods::toArray(bool useNum, SORT_TYPE sortVal) { diff --git a/src/utils/KeyListOps/KeyListOpsMethods.h b/src/utils/KeyListOps/KeyListOpsMethods.h index 0cac9c87a196cd3107dd953e7710e637205d2125..4c114ce1038d2b7ef1ca47bf9aff177d856b95ba 100644 --- a/src/utils/KeyListOps/KeyListOpsMethods.h +++ b/src/utils/KeyListOps/KeyListOpsMethods.h @@ -73,6 +73,13 @@ public: // return the last value in 
the list const QuickString & getLast(); + bool nonNumErrFlagSet() const { return _nonNumErrFlag; } + const QuickString &getErrMsg() const { return _errMsg; } + void resetNonNumErrFlag() { + _nonNumErrFlag = false; + _errMsg.clear(); + } + private: RecordKeyList *_keyList; int _column; @@ -93,6 +100,9 @@ private: typedef enum { UNSORTED, ASC, DESC} SORT_TYPE; + bool _nonNumErrFlag; + QuickString _errMsg; + typedef multimap<int, QuickString, less<int> > histAscType; typedef multimap<int, QuickString, greater<int> > histDescType; void init(); diff --git a/src/utils/general/QuickString.h b/src/utils/general/QuickString.h index a76e5ff223c9554ec42c09599cdb50e4c6b46aff..ebdfb714324382ea1dfa5529c37abed096681c6e 100644 --- a/src/utils/general/QuickString.h +++ b/src/utils/general/QuickString.h @@ -58,6 +58,7 @@ public: const string str() const { return _buffer; } const char &operator [] (int pos) const { return _buffer[pos]; } char &operator [] (int pos) { return _buffer[pos]; } + char &at(size_t pos) { return _buffer[pos]; } void append(const QuickString &str) { append(str.c_str(), str.size()); } void append(const char *buf, size_t bufLen); diff --git a/src/utils/general/Tokenizer.cpp b/src/utils/general/Tokenizer.cpp index b88e013409421074790395e2fc0dbf88b92d9c10..84d2b2bf40e080da4dc587ac6731f11470e67dd8 100644 --- a/src/utils/general/Tokenizer.cpp +++ b/src/utils/general/Tokenizer.cpp @@ -6,7 +6,7 @@ */ #include "Tokenizer.h" #include <cstring> - +#include <cstdio> Tokenizer::Tokenizer() : _numExpectedElems(0), _keepFinalIncElem(USE_NOW), @@ -49,6 +49,14 @@ int Tokenizer::tokenize(const QuickString &str, char delimiter) { } else { QuickString *newStr = fetchElem(currIdx); newStr->assign(str.c_str() + startPos, min(currPos, strLen) - startPos); + + // If splitting lines, strip any white space from the end of the line + // including DOS newline characters and excess tabs. 
+ if (delimiter == '\n') { + int lastPos = newStr->size(); + while (lastPos > 0 && isspace(static_cast<unsigned char>(newStr->at(lastPos-1)))) lastPos--; //guard: token may be empty or all whitespace + newStr->resize(lastPos); + } } } startPos = currPos +1; diff --git a/test/intersect/dosLineCharWithExtraTab_b.bed b/test/intersect/dosLineCharWithExtraTab_b.bed new file mode 100644 index 0000000000000000000000000000000000000000..f3c4a76750be2a774a7cc51f237a65b7f3d356eb --- /dev/null +++ b/test/intersect/dosLineCharWithExtraTab_b.bed @@ -0,0 +1,4 @@ +chr1 1 789 +chr1 882 1033 +chr1 1034 1369 +chr1 18746 18972 diff --git a/test/intersect/dosLineChar_a.bed b/test/intersect/dosLineChar_a.bed new file mode 100644 index 0000000000000000000000000000000000000000..80db1e118589cd68ec11633112d8b0a06ae71fcc --- /dev/null +++ b/test/intersect/dosLineChar_a.bed @@ -0,0 +1,3 @@ +chr1 11323785 11617177 +chr1 12645605 13926923 +chr1 14750216 15119039 diff --git a/test/intersect/new_test-intersect.sh b/test/intersect/new_test-intersect.sh index 3b7722239ba0217ad98e55f35466845b25430d01..2732785ef0f30019f57a33e4f99d523d3eddb898 100755 --- a/test/intersect/new_test-intersect.sh +++ b/test/intersect/new_test-intersect.sh @@ -742,8 +742,16 @@ check obs headerOnly.vcf rm obs - - - - +########################################################### +# Test that files with DOS newline characters, '\r', +# and/or extra tabs at end of line are handled +############################################################ +echo " intersect.new.t63...\c" +echo \ +"chr1 11323785 11617177 +chr1 12645605 13926923 +chr1 14750216 15119039" >exp +$BT intersect -a dosLineChar_a.bed -b dosLineCharWithExtraTab_b.bed -v > obs +check exp obs +rm exp obs diff --git a/test/map/test-map.sh b/test/map/test-map.sh index 0a188bec9d5acf6aa6b36e7b6d7fc648d6f24088..67a0b79049bea6c165accf2d81e150c3337852dd 100644 --- a/test/map/test-map.sh +++ b/test/map/test-map.sh @@ -717,14 +717,16 @@ rm obs exp ########################################################### -# Test that 
numeric ops for non-numeric columns aren't allowed +# Test that numeric ops for non-numeric columns are +# allowed, but give a warning ############################################################ echo " map.t48...\c" echo \ -" -***** -***** ERROR: Column 1 is not a numeric field for database file values.bed." > exp -$BT map -a ivls.bed -b values.bed -c 1 -o sum 2>&1 > /dev/null | head -3 > obs +" ***** WARNING: Non numeric value chr1 in 1. + ***** WARNING: Non numeric value chr1 in 1. + ***** WARNING: Non numeric value chr3 in 1. + ***** WARNING: Non numeric value chr3 in 1." > exp +$BT map -a ivls.bed -b values.bed -c 1 -o sum 2>&1 > /dev/null | cat - > obs check obs exp rm obs exp @@ -792,4 +794,3 @@ chr3 100 200 0.5" > exp $BT map -a ivls.bed -b values4.bed -c 7 -o sample_stddev > obs check obs exp rm obs exp - diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh index 9cde0cdda4f3b9a6688efcba3b80e55056fc0d95..172738f126489c4ffed2ce84a408e58bff7736ed 100644 --- a/test/merge/test-merge.sh +++ b/test/merge/test-merge.sh @@ -132,7 +132,7 @@ check obs exp rm obs exp ########################################################### -# Test the use of a custom delimiter for -nms +# Test the use of a custom delimiter for -delim option ########################################################### echo " merge.t10...\c" echo \ @@ -191,8 +191,8 @@ check exp obs rm obs exp ########################################################### -# Test that stranded merge with unknown records works -# correctly +# Test that stranded merge where some records have +# unknown strand works correctly ########################################################### echo " merge.t15...\c" echo \ @@ -293,3 +293,29 @@ chr2L 577 635 -0.24" > exp $BT merge -i precisionTest.bed -c 5 -o mean > obs check obs exp rm obs exp + +########################################################### +# Test that numeric ops on non-numeric columns +# are allowed, but produce a warning and null +# value result. 
+########################################################### +echo " merge.t23a...\c" +echo \ +"chr1 10 20 . +chr1 30 100 ." > expOut +$BT merge -i a.names.bed -c 4 -o sum 2>&1 > obsOut | cat - > obsErr +check obsOut expOut +rm expOut obsOut + + +########################################################### +# Just check that the warning message from the previous +# test was correct. +########################################################### +echo " merge.t23b...\c" +echo \ +" ***** WARNING: Non numeric value a1 in 4. + ***** WARNING: Non numeric value a4 in 4." > expErr +check obsErr expErr +rm obsErr expErr +