diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 3721f9616e6fff9128fad5c643e9c5b644f6b30c..0657e1ebbdf6ec493759dc2d945d850fdcbc2515 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -44,10 +44,10 @@ FileRecordMgr::~FileRecordMgr(){ bool FileRecordMgr::open(){ - const QuickString &filename = _context->getInputFileName(_contextFileIdx); - _bufStreamMgr = new BufferedStreamMgr(filename); + _filename = _context->getInputFileName(_contextFileIdx); + _bufStreamMgr = new BufferedStreamMgr(_filename); if (!_bufStreamMgr->init()) { - fprintf(stderr, "Error: unable to open file or unable to determine types for file %s.\n", filename.c_str()); + cerr << "Error: unable to open file or unable to determine types for file " << _filename << endl; delete _bufStreamMgr; _bufStreamMgr = NULL; exit(1); @@ -56,7 +56,7 @@ bool FileRecordMgr::open(){ _fileType = _bufStreamMgr->getTypeChecker().getFileType(); _recordType = _bufStreamMgr->getTypeChecker().getRecordType(); if (_fileType == FileRecordTypeChecker::UNKNOWN_FILE_TYPE || _recordType == FileRecordTypeChecker::UNKNOWN_RECORD_TYPE) { - fprintf(stderr, "Error: Unable to determine type for file %s.\n", filename.c_str()); + cerr << "Error: Unable to determine type for file " << _filename << endl; delete _bufStreamMgr; _bufStreamMgr = NULL; exit(1); @@ -64,11 +64,11 @@ bool FileRecordMgr::open(){ allocateFileReader(); _recordMgr = new RecordMgr(_recordType, _freeListBlockSize); - _fileReader->setFileName(filename.c_str()); + _fileReader->setFileName(_filename.c_str()); _fileReader->setInputStream(_bufStreamMgr); _fileReader->setContext(_context); if (!_fileReader->open()) { - fprintf(stderr, "Error: Types determined but can't open file %s.\n", filename.c_str()); + cerr << "Error: Types determined but can't open file " << _filename << endl; delete _bufStreamMgr; _bufStreamMgr = NULL; exit(1); @@ -125,6 +125,11 @@ Record *FileRecordMgr::allocateAndGetNextRecord() return NULL; } + if (!record->coordsValid()) { + cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *record << endl; + exit(1); + } + //test for sorted order, if necessary. if (_context->getSortedInput()) { testInputSortOrder(record); @@ -184,17 +189,16 @@ void FileRecordMgr::testInputSortOrder(Record *record) void FileRecordMgr::sortError(const Record *record, bool genomeFileError) { if (genomeFileError) { - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following record with a different sort order than the genomeFile %s:\n", - _context->getInputFileName(_contextFileIdx).c_str(), _context->getGenomeFile()->getGenomeFileName().c_str()); + cerr << "Error: Sorted input specified, but the file " << _filename << " has the following record with a different sort order than the genomeFile " << + _context->getGenomeFile()->getGenomeFileName() << endl; } else { - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); + cerr << "Error: Sorted input specified, but the file " << _filename << " has the following out of order record" << endl; } - QuickString errBuf; - record->print(errBuf); - fprintf(stderr, "%s\n", errBuf.c_str()); + cerr << *record << endl; exit(1); - } + + void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index 4c73d0faf25745c40e8ad705bcd96ea2cac92de3..cc176890616b58f019db498b7c37f99e1ce7f95c 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -141,6 +141,7 @@ public: void setFullBamFlags(bool flag) { _useFullBamTags = flag; } private: + QuickString _filename; BufferedStreamMgr *_bufStreamMgr; int _contextFileIdx; diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp index 2ef905345ca1cdfd5558b3786225df2cb4853a08..984d6abadd2d33132d23f27885a869f9a3bc9cde 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.cpp +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -113,6 +113,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); _context->setPrintHeader(false); } + const_cast<Record *>(keyList.getKey())->undoZeroLength(); _currBlockList = blockList; @@ -157,15 +158,67 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord) { //get the max start and min end as strings. - const QuickString &startStr = keyRecord->getStartPos() > hitRecord->getStartPos() ? keyRecord->getStartPosStr() : hitRecord->getStartPosStr(); - const QuickString &endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? keyRecord->getEndPosStr() : hitRecord->getEndPosStr(); + const_cast<Record *>(hitRecord)->undoZeroLength(); - int maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos()); - int minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos()); + const QuickString *startStr = NULL; + const QuickString *endStr = NULL; + int maxStart = 0; + int minEnd = 0; + + int keyStart = keyRecord->getStartPos(); + int keyEnd = keyRecord->getEndPos(); + int hitStart = hitRecord->getStartPos(); + int hitEnd = hitRecord->getEndPos(); + + if ( keyStart>= hitStart) { + //the key start is after the hit start, but we need to check and make sure the hit end is at least after the keyStart. + //The reason for this is that, in some rare cases, such as both the key and hit having been zero length intervals, + //the normal process for intersection that allows us to simply report the maxStart and minEnd do not necessarily apply. + if (hitEnd >= keyStart) { + //this is ok. We have a normal intersection where the key comes after the hit. + + maxStart = keyStart; + startStr = &(keyRecord->getStartPosStr()); + + minEnd = min(keyEnd, hitEnd); + endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? &(keyRecord->getEndPosStr()) : &(hitRecord->getEndPosStr()); + + } else { + //this is the weird case of not a "real" intersection. The keyStart is greater than the hitEnd. So just report the key as is. + maxStart = keyStart; + minEnd = keyEnd; + startStr = &(keyRecord->getStartPosStr()); + endStr = &(keyRecord->getEndPosStr()); + } + + } else { + //all of the above, but backwards. keyStart is before hitStart. + if (keyEnd >= hitStart) { + //normal intersection, key first + maxStart = hitStart; + startStr = &(hitRecord->getStartPosStr()); + minEnd = min(keyEnd, hitEnd); + endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? &(keyRecord->getEndPosStr()) : &(hitRecord->getEndPosStr()); + } else { + //this is the weird case of not a "real" intersection. The hitStart is greater than the keyEnd. So just report the hit as is. + maxStart = hitStart; + minEnd = hitEnd; + startStr = &(hitRecord->getStartPosStr()); + endStr = &(hitRecord->getEndPosStr()); + + } + } + +// const QuickString &startStr = keyRecord->getStartPos() > hitRecord->getStartPos() ? keyRecord->getStartPosStr() : hitRecord->getStartPosStr(); +// const QuickString &endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? keyRecord->getEndPosStr() : hitRecord->getEndPosStr(); +// +// int maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos()); +// int minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos()); +// if (!_context->getWriteA() && !_context->getWriteB() && !_context->getWriteOverlap() && !_context->getLeftJoin()) { - printKey(keyRecord, startStr, endStr); + printKey(keyRecord, *startStr, *endStr); newline(); } else if ((_context->getWriteA() && _context->getWriteB()) || _context->getLeftJoin()) { @@ -179,7 +232,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record newline(); } else if (_context->getWriteB()) { - printKey(keyRecord, startStr, endStr); + printKey(keyRecord, *startStr, *endStr); tab(); hitRecord->print(_outBuf); newline(); diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp index eef359138455e71d6ac2f7cd233759fbfc8fab2a..13f5265e0c2310d6791b0e2c5dedbe0572b30795 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -23,6 +23,7 @@ bool Bed4Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) void Bed4Interval::print(QuickString &outBuf) const { Bed3Interval::print(outBuf); + outBuf.append('\t'); outBuf.append(_name); } diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 1f5351636aec2107915d9e89627d235bf570ef23..96e4eab004c41ca23193ab0384c43c1004d63c45 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -6,7 +6,8 @@ Record::Record() : _chrId(-1), _startPos(-1), _endPos(-1), - _strand(UNKNOWN) + _strand(UNKNOWN), + _zeroLength(false) { } @@ -177,3 +178,37 @@ bool Record::sameChromIntersects(const Record *record, return false; } + +bool Record::coordsValid() { + if (_startPos < 0 || _endPos < 0 || _endPos < _startPos) { + return false; + } + adjustZeroLength(); + return true; +} + +void Record::adjustZeroLength() +{ + if (_startPos == _endPos) { + _zeroLength = true; + _startPos--; + _endPos++; + } +} + +void Record::undoZeroLength() +{ + if (_zeroLength) { + _startPos++; + _endPos--; + _zeroLength = false; + } +} + +ostream &operator << (ostream &out, const Record &record) +{ + QuickString errBuf; + record.print(errBuf); + out << errBuf; + return out; +} diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 9dc1ae4e555fa7eb8e5f3a5a814296e09f351748..b2004231c14f41a4b316bfef155baaeeb5c96168 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -33,6 +33,8 @@ public: virtual void print(QuickString &outBuf, int start, int end) const {} virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const {} virtual void printNull(QuickString &outBuf) const {} + friend ostream &operator << (ostream &out, const Record &record); + virtual const Record & operator=(const Record &); virtual const QuickString &getChrName() const { return _chrName; } @@ -54,13 +56,14 @@ public: virtual const QuickString &getEndPosStr() const { return _endPosStr; } virtual void setEndPosStr(const QuickString &str) { _endPosStr = str; } + virtual bool getZeroLength() const { return _zeroLength; } + virtual void setZeroLength(bool val) { _zeroLength = val; } virtual strandType getStrand() const { return _strand; } virtual void setStrand(strandType val) { _strand = val; } virtual void setStrand(char val); virtual char getStrandChar() const; - //And we have a similar problem with name and score virtual const QuickString &getName() const { return _name; } virtual void setName(const QuickString &chr) { _name = chr; } virtual void setName(const string &chr) { _name = chr; } @@ -73,6 +76,18 @@ public: virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; } + virtual bool coordsValid(); //test that no coords negative, end not less than start, check zeroLength (see below). + + //Some files can have insertions of the form 2,2. If found this should translate to cover the base before and after, + //thus meaning the startPos is decremented and the endPos is incremented. This method will find and handle that case. + //Don't adjust the startPosStr and endPosStr strings because they aren't used in + //calculation. They're only used in output, and it would be slower to change them + //and then change them back. + virtual void adjustZeroLength(); + virtual void undoZeroLength(); //change it back just before output; + virtual bool isZeroLength() const { return _zeroLength; } + + virtual bool operator < (const Record &other) const; virtual bool operator > (const Record &other) const; @@ -110,6 +125,7 @@ protected: QuickString _name; QuickString _score; strandType _strand; + bool _zeroLength; };