diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 7eb46d89f0a268a33b59612e93820c7ce1363024..5c78887425de2e8a7712aac3ba69e41db86a56ac 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -23,9 +23,12 @@ FileIntersect::FileIntersect(ContextIntersect *context) _blockMgr(NULL), _recordOutputMgr(NULL) { - _blockMgr = new BlockMgr(); - _blockMgr->setContext(context); _recordOutputMgr = new RecordOutputMgr(); + if (_context->getObeySplits()) { + _blockMgr = new BlockMgr(); + _blockMgr->setContext(context); + _recordOutputMgr->setSplitInfo(_blockMgr); + } } FileIntersect::~FileIntersect(void) { diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 3230f75a0f34db7b73f71967ed8efc7ecb8ef481..16ebf006d9eac5054940b4147e18833f6e66ff1e 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -127,9 +127,11 @@ Record *FileRecordMgr::allocateAndGetNextRecord() _recordMgr->deleteRecord(record); return NULL; } + // In the rare case of Bam records where both the read and it's mate failed to map, - // Ignore the record. Delete and return null - if (!(record->isUnmapped() && record->isMateUnmapped())) { + // Ignore the record. Delete and return null. + + if (!(record->isUnmapped() )) { if (!record->coordsValid()) { cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *record << endl; exit(1); diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp index ac8b37ed07b4e90af788598869d33d2154261c1f..ea8a23334ad4226b2559cae6c3234f84523ebab4 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.cpp +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -29,10 +29,10 @@ RecordOutputMgr::RecordOutputMgr() : _context(NULL), _printable(true), _bamWriter(NULL), - _currBlockList(NULL), - _blockMgr(NULL) + _currBamBlockList(NULL), + _bamBlockMgr(NULL) { - _blockMgr = new BlockMgr(); + _bamBlockMgr = new BlockMgr(); } RecordOutputMgr::~RecordOutputMgr() @@ -45,8 +45,8 @@ RecordOutputMgr::~RecordOutputMgr() delete _bamWriter; _bamWriter = NULL; } - delete _blockMgr; - _blockMgr = NULL; + delete _bamBlockMgr; + _bamBlockMgr = NULL; } @@ -102,7 +102,7 @@ RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &key if (record->isUnmapped()) { record->printUnmapped(_outBuf); } else { - static_cast<const BamRecord *>(record)->print(_outBuf, _currBlockList); + static_cast<const BamRecord *>(record)->print(_outBuf, _currBamBlockList); } } return BAM_AS_BED; @@ -121,10 +121,10 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList) { if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { RecordKeyList blockList(keyList.getKey()); bool deleteBlocks = false; - _blockMgr->getBlocks(blockList, deleteBlocks); + _bamBlockMgr->getBlocks(blockList, deleteBlocks); printRecord(keyList, &blockList); if (deleteBlocks) { - _blockMgr->deleteBlocks(blockList); + _bamBlockMgr->deleteBlocks(blockList); } return; } @@ -143,7 +143,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi checkForHeader(); } const_cast<Record *>(keyList.getKey())->undoZeroLength(); - _currBlockList = blockList; + _currBamBlockList = blockList; if (_context->getProgram() == ContextBase::INTERSECT) { if (_printable) { @@ -151,7 +151,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi if ((static_cast<ContextIntersect *>(_context))->getWriteAllOverlap()) { // -wao the user wants to force the reporting of 0 overlap if (printKeyAndTerminate(keyList)) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } tab(); @@ -163,34 +163,36 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } else if ((static_cast<ContextIntersect *>(_context))->getLeftJoin()) { if (printKeyAndTerminate(keyList)) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } tab(); null(false, true); newline(); if (needsFlush()) flush(); - _currBlockList = NULL; + _currBamBlockList = NULL; return; } } else { if (printBamRecord(keyList, true) == BAM_AS_BAM) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } + int hitIdx = 0; for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - reportOverlapDetail(keyList.getKey(), iter->value()); + reportOverlapDetail(keyList.getKey(), iter->value(), hitIdx); + hitIdx++; } } } else { // not printable reportOverlapSummary(keyList); } - _currBlockList = NULL; + _currBamBlockList = NULL; } else if (_context->getProgram() == ContextBase::SAMPLE) { if (!printKeyAndTerminate(keyList)) { newline(); } - _currBlockList = NULL; + _currBamBlockList = NULL; return; } } @@ -209,7 +211,7 @@ void RecordOutputMgr::checkForHeader() { if (needsFlush()) flush(); } -void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord) +void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx) { //get the max start and min end as strings. const_cast<Record *>(hitRecord)->undoZeroLength(); @@ -264,12 +266,6 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record } } -// const QuickString &startStr = keyRecord->getStartPos() > hitRecord->getStartPos() ? keyRecord->getStartPosStr() : hitRecord->getStartPosStr(); -// const QuickString &endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? keyRecord->getEndPosStr() : hitRecord->getEndPosStr(); -// -// int maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos()); -// int minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos()); -// if (!(static_cast<ContextIntersect *>(_context))->getWriteA() && !(static_cast<ContextIntersect *>(_context))->getWriteB() && !(static_cast<ContextIntersect *>(_context))->getWriteOverlap() && !(static_cast<ContextIntersect *>(_context))->getLeftJoin()) { @@ -298,7 +294,12 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record if (needsFlush()) flush(); } else if ((static_cast<ContextIntersect *>(_context))->getWriteOverlap()) { - int printOverlapBases = max(0, minEnd-maxStart); + int printOverlapBases = 0; + if (_context->getObeySplits()) { + printOverlapBases = _splitInfo->getOverlapBases(hitIdx); + } else { + printOverlapBases = minEnd - maxStart; + } printKey(keyRecord); tab(); hitRecord->print(_outBuf); @@ -399,7 +400,7 @@ void RecordOutputMgr::printKey(const Record *key, const QuickString & start, con if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { key->print(_outBuf, start, end); } else { - static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBlockList); + static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBamBlockList); } } @@ -408,7 +409,7 @@ void RecordOutputMgr::printKey(const Record *key) if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { key->print(_outBuf); } else { - static_cast<const BamRecord *>(key)->print(_outBuf, _currBlockList); + static_cast<const BamRecord *>(key)->print(_outBuf, _currBamBlockList); } } diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h index 8725fc0fdc54342b0d45967581a83e9fd2909921..46cca17e4e00d7e0d430ad318559ade6b69c8b4b 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.h +++ b/src/utils/FileRecordTools/RecordOutputMgr.h @@ -24,20 +24,25 @@ public: //The init method must be called after all the input files are open. bool init(ContextBase *context); -// void printHeader(const string &); void printRecord(const Record *record); void printRecord(RecordKeyList &keyList); + //where necessary, pass additional information about splits through the blockMgr. + void setSplitInfo(const BlockMgr *blockMgr) { _splitInfo = blockMgr; } + private: typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; ContextBase *_context; bool _printable; BamTools::BamWriter *_bamWriter; - RecordKeyList *_currBlockList; + RecordKeyList *_currBamBlockList; QuickString _outBuf; - BlockMgr *_blockMgr; + + // + BlockMgr *_bamBlockMgr; + const BlockMgr *_splitInfo; //some helper functions to neaten the code. void tab() { _outBuf.append('\t'); } void newline() { _outBuf.append('\n'); } @@ -49,7 +54,7 @@ private: bool printKeyAndTerminate(RecordKeyList &keyList); printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); void checkForHeader(); - void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord); + void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx = 0); void reportOverlapSummary(RecordKeyList &keyList); static const unsigned int MAX_OUTBUF_SIZE = 16384; //16 K diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp index cde07329a2dffb56905d700e532c913cb092c8fa..4c5cd8dc2fb4b3716ed1fc3de598428be747eb4e 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.cpp +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -1,3 +1,10 @@ +/* + * BamRecord.cpp + * + * Created on: Jan 14, 2014 + * Author: nek3d + */ + #include "BamRecord.h" #include "BamFileReader.h" #include "RecordKeyList.h" @@ -15,143 +22,155 @@ BamRecord::~BamRecord() const BamRecord &BamRecord::operator=(const BamRecord &other) { - Bed6Interval::operator=(other); - _bamAlignment = other._bamAlignment; - return *this; + Bed6Interval::operator=(other); + _bamAlignment = other._bamAlignment; + return *this; } bool BamRecord::initFromFile(FileReader *fileReader) { - BamFileReader *bamFileReader = static_cast<BamFileReader*>(fileReader); - return initFromFile(bamFileReader); + BamFileReader *bamFileReader = static_cast<BamFileReader*>(fileReader); + return initFromFile(bamFileReader); } bool BamRecord::initFromFile(BamFileReader *bamFileReader) { - bamFileReader->getChrName(_chrName); - - _bamChromId = bamFileReader->getCurrChromdId(); - _startPos = bamFileReader->getStartPos(); - int2str(_startPos, _startPosStr); - _endPos = bamFileReader->getEndPos(); - int2str(_endPos, _endPosStr); - bamFileReader->getName(_name); - bamFileReader->getScore(_score); - char strandChar = bamFileReader->getStrand(); - setStrand(strandChar); - - _bamAlignment = bamFileReader->getAlignment(); - _isUnmapped = !_bamAlignment.IsMapped(); - _isMateUnmapped = !_bamAlignment.IsMateMapped(); - return true; + bamFileReader->getChrName(_chrName); + + _bamChromId = bamFileReader->getCurrChromdId(); + _startPos = bamFileReader->getStartPos(); + int2str(_startPos, _startPosStr); + _endPos = bamFileReader->getEndPos(); + int2str(_endPos, _endPosStr); + bamFileReader->getName(_name); + bamFileReader->getScore(_score); + char strandChar = bamFileReader->getStrand(); + setStrand(strandChar); + + _bamAlignment = bamFileReader->getAlignment(); + _isUnmapped = !_bamAlignment.IsMapped(); + _isMateUnmapped = !_bamAlignment.IsMateMapped(); + return true; } void BamRecord::clear() { - Bed6Interval::clear(); - _bamChromId = -1; - - - //Clear the BamAlignment object. Sadly, it does not have a clear() method, - //so we have to do each member manually. - _bamAlignment.Name.clear(); - _bamAlignment.Length = 0; - _bamAlignment.QueryBases.clear(); - _bamAlignment.AlignedBases.clear(); - _bamAlignment.Qualities.clear(); - _bamAlignment.TagData.clear(); - _bamAlignment.RefID = -1; - _bamAlignment.Position = -1; - _bamAlignment.Bin = 0; - _bamAlignment.MapQuality = 0; - _bamAlignment.AlignmentFlag = 0; - _bamAlignment.CigarData.clear(); - _bamAlignment.MateRefID = -1; - _bamAlignment.MatePosition = -1; - _bamAlignment.InsertSize = -1; - _bamAlignment.Filename.clear(); - - _bamAlignment.SupportData.AllCharData.clear(); - _bamAlignment.SupportData.BlockLength = 0; - _bamAlignment.SupportData.NumCigarOperations = 0; - _bamAlignment.SupportData.QueryNameLength = 0; - _bamAlignment.SupportData.QuerySequenceLength = 0; - _bamAlignment.SupportData.HasCoreOnly = false; - - _bamAlignment.ErrorString.clear(); + Bed6Interval::clear(); + _bamChromId = -1; + + + //Clear the BamAlignment object. Sadly, it does not have a clear() method, + //so we have to do each member manually. + _bamAlignment.Name.clear(); + _bamAlignment.Length = 0; + _bamAlignment.QueryBases.clear(); + _bamAlignment.AlignedBases.clear(); + _bamAlignment.Qualities.clear(); + _bamAlignment.TagData.clear(); + _bamAlignment.RefID = -1; + _bamAlignment.Position = -1; + _bamAlignment.Bin = 0; + _bamAlignment.MapQuality = 0; + _bamAlignment.AlignmentFlag = 0; + _bamAlignment.CigarData.clear(); + _bamAlignment.MateRefID = -1; + _bamAlignment.MatePosition = -1; + _bamAlignment.InsertSize = -1; + _bamAlignment.Filename.clear(); + + _bamAlignment.SupportData.AllCharData.clear(); + _bamAlignment.SupportData.BlockLength = 0; + _bamAlignment.SupportData.NumCigarOperations = 0; + _bamAlignment.SupportData.QueryNameLength = 0; + _bamAlignment.SupportData.QuerySequenceLength = 0; + _bamAlignment.SupportData.HasCoreOnly = false; + + _bamAlignment.ErrorString.clear(); } void BamRecord::print(QuickString &outBuf, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf); + Bed6Interval::print(outBuf); printRemainingBamFields(outBuf, keyList); } void BamRecord::print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf, start, end); + Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); } void BamRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf, start, end); + Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); } void BamRecord::printNull(QuickString &outBuf) const { - Bed6Interval::printNull(outBuf); - outBuf.append("\t.\t.\t.\t.\t.\t.", 12); + Bed6Interval::printNull(outBuf); + outBuf.append("\t.\t.\t.\t.\t.\t.", 12); } void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const { - outBuf.append('\t'); - outBuf.append(_bamAlignment.Position); - outBuf.append('\t'); - outBuf.append(_endPos); - outBuf.append("\t0,0,0", 6); - outBuf.append('\t'); - - int numBlocks = (int)keyList->size(); - - if (numBlocks > 0) { - outBuf.append(numBlocks); - - vector<int> blockLengths; - vector<int> blockStarts; - for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { - const Record *block = iter->value(); - blockLengths.push_back(block->getEndPos() - block->getStartPos()); - blockStarts.push_back(block->getStartPos() - _bamAlignment.Position); - } - - outBuf.append('\t'); - for (int i=0; i < (int)blockLengths.size(); i++) { - outBuf.append(blockLengths[i]); - outBuf.append(','); - } - outBuf.append('\t'); - for (int i=0; i < (int)blockStarts.size(); i++) { - outBuf.append( blockStarts[i]); - outBuf.append(','); - } - } - else { - outBuf.append("1\t0,\t0,"); - } + outBuf.append('\t'); + outBuf.append(_bamAlignment.Position); + outBuf.append('\t'); + outBuf.append(_endPos); + outBuf.append("\t0,0,0", 6); + outBuf.append('\t'); + + int numBlocks = (int)keyList->size(); + + if (numBlocks > 0) { + outBuf.append(numBlocks); + + vector<int> blockLengths; + vector<int> blockStarts; + for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { + const Record *block = iter->value(); + blockLengths.push_back(block->getEndPos() - block->getStartPos()); + blockStarts.push_back(block->getStartPos() - _bamAlignment.Position); + } + + outBuf.append('\t'); + for (int i=0; i < (int)blockLengths.size(); i++) { + outBuf.append(blockLengths[i]); + outBuf.append(','); + } + outBuf.append('\t'); + for (int i=0; i < (int)blockStarts.size(); i++) { + outBuf.append( blockStarts[i]); + outBuf.append(','); + } + } + else { + outBuf.append("1\t0,\t0,"); + } } void BamRecord::printUnmapped(QuickString &outBuf) const { - outBuf.append(_chrName.empty() ? "." : _chrName); - outBuf.append("\t-1\t-1\t"); - outBuf.append(_name.empty() ? "." : _name); - outBuf.append('\t'); - outBuf.append(_score.empty() ? "." : _score); - outBuf.append("\t.\t-1\t-1\t-1\t0,0,0\t0\t.\t."); // dot for strand, -1 for blockStarts and blockEnd + outBuf.append(_chrName.empty() ? "." : _chrName); + outBuf.append("\t-1\t-1\t"); + outBuf.append(_name.empty() ? "." : _name); + outBuf.append('\t'); + outBuf.append(_score.empty() ? "." : _score); + outBuf.append("\t.\t-1\t-1\t-1\t0,0,0\t0\t.\t."); // dot for strand, -1 for blockStarts and blockEnd +} + +const QuickString &BamRecord::getField(int fieldNum) const +{ + //TBD: Determine what correct behavior should be. + //I.e. if users requests field 2, do they want Flag + //for Bam Records, or startPos for all records? -NEK 1/14/14. + + return Bed6Interval::getField(fieldNum); } + + + diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h index 262ef1ab9586006525857446be72bc800b0c287f..b74dbc2c92b05e83ba64e7eca1e1ddf6e59dd525 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.h +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -38,6 +38,9 @@ public: const BamTools::BamAlignment &getAlignment() const { return _bamAlignment; } int getBamChromId() const { return _bamChromId; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 12; } + protected: BamTools::BamAlignment _bamAlignment; int _bamChromId; //different from chromId, because BAM file may be in different order diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.cpp b/src/utils/FileRecordTools/Records/Bed12Interval.cpp index 648dcccda2d73dac78482719a0536c570feb9cab..867a69ec4c23e50d2f1c3588f9622a78ce992176 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed12Interval.cpp @@ -119,4 +119,30 @@ void Bed12Interval::printNull(QuickString &outBuf) const outBuf.append("\t.\t.\t.\t.\t.\t.", 12); } +const QuickString &Bed12Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 7: + return _thickStartStr; + break; + case 8: + return _thickEndStr; + break; + case 9: + return _itemRGB; + break; + case 10: + return _blockCountStr; + break; + case 11: + return _blockSizes; + break; + case 12: + return _blockStarts; + break; + default: + return Bed6Interval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed12Interval.h b/src/utils/FileRecordTools/Records/Bed12Interval.h index a902b4f4472503d3ece54509dafa7ad30b60ea35..711800c3d7560d0834f5b823d48ffa47f3ec3535 100644 --- a/src/utils/FileRecordTools/Records/Bed12Interval.h +++ b/src/utils/FileRecordTools/Records/Bed12Interval.h @@ -52,6 +52,9 @@ public: virtual void setBlockStarts(const string & blockStarts) { _blockStarts = blockStarts; } virtual void setBlockStarts(const char *blockStarts) { _blockStarts = blockStarts; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 12; } + protected: virtual ~Bed12Interval(); diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp index 742ea7b9824cece5d29fe05e9c039d4b4e9b3081..3f896be5a80614cccc8b058eb04b746f2ec19b84 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -61,3 +61,21 @@ void Bed3Interval::print(QuickString &outBuf, const QuickString & start, const Q void Bed3Interval::printNull(QuickString &outBuf) const { outBuf.append(".\t-1\t-1", 7); } + +const QuickString &Bed3Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 1: + return _chrName; + break; + case 2: + return _startPosStr; + break; + case 3: + return _endPosStr; + break; + default: + return Record::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h index a3015e0be90b12bedf4694a8ce6e7a3422032391..9f1ff1183a975fa7defda698abee05831bd5b771 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.h +++ b/src/utils/FileRecordTools/Records/Bed3Interval.h @@ -29,6 +29,9 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED3_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 3; } + protected: virtual ~Bed3Interval(); diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp index 13f5265e0c2310d6791b0e2c5dedbe0572b30795..c1ef81a3c51d91fd563c351d7ae751e80892cfe6 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -48,3 +48,15 @@ void Bed4Interval::printNull(QuickString &outBuf) const outBuf.append("\t.", 2); } +const QuickString &Bed4Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h index 4d9fda7acb574170f5aaa1b39db4efa654da5263..f42817c495ed4780dd57aa82d3c4a93718840e5b 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.h +++ b/src/utils/FileRecordTools/Records/Bed4Interval.h @@ -26,6 +26,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED4_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 4; } + + protected: virtual ~Bed4Interval(); }; diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp index 8e643608a1633929a1141c0b21d24ec1c3a81309..7307fb667ee5bbbf40fe0576fd9654bec0c9ad0f 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp @@ -55,3 +55,18 @@ void Bed5Interval::printNull(QuickString &outBuf) const outBuf.append("\t.\t-1", 5); } +const QuickString &Bed5Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + case 5: + return _score; + break; + + default: + return Bed3Interval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h index 01d6596684a9764969ee415578906755ad386d20..bc913d1df6f4b22c355d0ace56d5e0143feced12 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.h +++ b/src/utils/FileRecordTools/Records/Bed5Interval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED5_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 5; } + + protected: virtual ~Bed5Interval(); }; diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp index b486333a66757b4d36896c273d0d46d9798e9ab6..8371553a435941c0b713d64a55d198e4760a75ac 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp @@ -64,3 +64,20 @@ void Bed6Interval::printNull(QuickString &outBuf) const outBuf.append("\t.\t-1\t.", 7); } +const QuickString &Bed6Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + case 5: + return _score; + break; + case 6: + return _strand; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h index 91a00e09f131a7d922515e40974c6f1f7f3dcea5..9ad9f80b523ae5f5aef43aa3fd61e29cdc09e27f 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.h +++ b/src/utils/FileRecordTools/Records/Bed6Interval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED6_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 6; } + + protected: virtual ~Bed6Interval(); }; diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp index e089782c1499956a6b94c25e070b133da40ecc8d..e0808573c0a6cb35de787421a4a60e260d08ffa7 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp @@ -48,3 +48,15 @@ void BedGraphInterval::printNull(QuickString &outBuf) const outBuf.append("\t.", 2); } +const QuickString &BedGraphInterval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h index f3d612e899981a895e94a0518c098144f05fcca4..1bdf619a807885347ace0a55fbcaf302c4ff95a2 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.h +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 4; } + + protected: virtual ~BedGraphInterval(); }; diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp index d3e0c01ad9df1ac218cc15c310affba359e0d199..7e3c5651e470823b4b9a13f89e6f4b0887f82154 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp @@ -105,36 +105,16 @@ void BedPlusInterval::printNull(QuickString &outBuf) const for (int i=startOtherIdx; i < _numPrintFields; i++) { outBuf.append("\t."); } - } -QuickString BedPlusInterval::getField(int fieldNum) const +const QuickString &BedPlusInterval::getField(int fieldNum) const { //a request for any of the first six fields will retrieve //chrom, start, end, name, score, and strand, in that order. //A request for field 6+ will go to the otherIdxs. - switch (fieldNum) { - case 0: - return _chrName; - break; //redundant after a return, but good practice anyway. - case 1: - return _startPosStr; - break; - case 2: - return _endPosStr; - break; - case 3: - return _name; - break; - case 4: - return _score; - break; - case 5: - return _strand; - break; - default: + if (fieldNum > startOtherIdx && fieldNum < startOtherIdx + (int)_otherIdxs.size()) { return (*(_otherIdxs[fieldNum - startOtherIdx])); - break; } + return Bed6Interval::getField(fieldNum); } diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h index 4ebd3b67a88fa757e833e95bfddd18a897bcff8d..4b98b4f3d0a745ea80709f68bfd0bfa7a7e23d09 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.h +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h @@ -30,7 +30,9 @@ public: //if the number of fields frequently differ between this object and the one being copied. const BedPlusInterval &operator=(const BedPlusInterval &other); - virtual QuickString getField(int fieldNum) const; + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return startOtherIdx + _otherIdxs.size(); } + virtual void setField(int fieldNum, const QuickString &str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setField(int fieldNum, const string &str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; } diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp index 54dbd78835177f3dc26660abce7478edc27df4dd..4aede83d0de63f3259cf81e66d20fa59b7c18f05 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.cpp +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -161,17 +161,23 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList { bool deleteKeyBlocks = false; if (keyList.empty()) { + //get all the blocks for the query record, put them in it's list. getBlocks(keyList, deleteKeyBlocks); } + _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); + //Loop through every database record the query intersected with for (RecordKeyList::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { RecordKeyList hitBlocks(hitListIter->value()); bool deleteHitBlocks = false; - getBlocks(hitBlocks, deleteHitBlocks); - int hitBlockSumLength = getTotalBlockLength(hitBlocks); + getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. + int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total lentgh of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; + + //loop through every block of the database record. for (RecordKeyList::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { + //loop through every block of the query record. for (RecordKeyList::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = keyListIter->value(); const Record *hitBlock = hitBlockIter->value(); @@ -186,6 +192,7 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList } } + _overlapBases.push_back(totalHitOverlap); if (hitHasOverlap) { if ((float) totalHitOverlap / (float)keyBlocksSumLength > _context->getOverlapFraction()) { if (_context->getReciprocal() && diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h index 5de642f8cac274e4020829763750805de117b65b..86ddfd7e2b5e1b3d7de129e10b18d87b0cb13866 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.h +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -36,17 +36,20 @@ public: // and checking that their total intersection meets any overlapFraction and reciprocal criteria compared to // the total block lengths of the hitList and keyList. All hits that pass will be in the resultList. // Return value is the number of hits in the result set. + int findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList); //these are setting options for splitting BAM records void setBreakOnDeletionOps(bool val) { _breakOnDeletionOps = val; } void setBreakOnSkipOps(bool val) { _breakOnSkipOps = val; } + int getOverlapBases(int hitIdx) const { return _overlapBases[hitIdx]; } private: RecordMgr *_blockRecordsMgr; ContextIntersect *_context; bool _breakOnDeletionOps; bool _breakOnSkipOps; + vector<int> _overlapBases; // For now, all records will be split into Bed6 records. const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp index f13c8fd8c59f357eaefb4136ddb5a1a4fa8d72cb..b36a0fd899a89b87fe53e183c9b15e5d5e3b3963 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.cpp +++ b/src/utils/FileRecordTools/Records/GffRecord.cpp @@ -120,3 +120,22 @@ void GffRecord::printNull(QuickString &outBuf) const } } +const QuickString &GffRecord::getField(int fieldNum) const +{ + if (fieldNum == 9 && _numFields == 9) { + return _group; + } + switch (fieldNum) { + case 7: + return _source; + break; + case 8: + return _frame; + break; + default: + return Bed6Interval::getField(fieldNum); + break; + } +} + + diff --git a/src/utils/FileRecordTools/Records/GffRecord.h b/src/utils/FileRecordTools/Records/GffRecord.h index 195d4beaf5453f946922b2d5ad3ba60d6da8aec5..b84d96a71384750e850de0ffe4d03bb3e8272dfd 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.h +++ b/src/utils/FileRecordTools/Records/GffRecord.h @@ -27,9 +27,10 @@ public: virtual const QuickString &getSource() const { return _source; } virtual const QuickString &getFrame() const { return _frame; } virtual const QuickString &getGroup() const { return _group; } - virtual const int getNumFields() const { return _numFields; } + virtual int getNumFields() const { return _numFields; } virtual void setNumFields(int val) { _numFields = val; } + virtual const QuickString &getField(int fieldNum) const; //Note: using the assignment operator in a GffRecord can potentially be a performance hit, //if the number of fields frequently differ between this object and the one being copied. const GffRecord &operator=(const GffRecord &other); diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 6bad45dd232c6e06db3fa169986518d640538188..37b88305b1071212ab2a6066e9ec4d6fca16be65 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -192,3 +192,17 @@ ostream &operator << (ostream &out, const Record &record) out << errBuf; return out; } + +const QuickString &Record::getField(int fieldNum) const +{ +// try { +// _column_vec.push_back(hits[i].fields.at(_column)); +// } +// catch(std::out_of_range& e) { + cerr << endl << "*****" << endl + << "*****ERROR: requested column " << fieldNum +1 << + " , but record only has fields 1 - " << getNumFields() << ". Exiting." << endl + << endl << "*****" << endl; + exit(1); +// } +} diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 92df4bad97dadde7e8a9e6b1d6d4b0dacd88dad0..2c303d90c02272ea1c7a3956d29e8fac1abd0398 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -82,6 +82,9 @@ public: virtual void setScore(const string &chr) { _score = chr; } virtual void setScore(const char *chr) { _score = chr; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const = 0; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; } virtual bool coordsValid(); //test that no coords negative, end not less than start, check zeroLength (see below). diff --git a/src/utils/FileRecordTools/Records/VcfRecord.cpp b/src/utils/FileRecordTools/Records/VcfRecord.cpp index 1da6e9ad897ccb28474e6da9ea381160e37043b0..20cbb6f7cf881f8af7a3f468cb73ce96e21210dc 100644 --- a/src/utils/FileRecordTools/Records/VcfRecord.cpp +++ b/src/utils/FileRecordTools/Records/VcfRecord.cpp @@ -80,3 +80,32 @@ void VcfRecord::printOtherFields(QuickString &outBuf) const { } } + +const QuickString &VcfRecord::getField(int fieldNum) const +{ + //a request for any of the first six fields will retrieve + //chrom, start, end, name, score, and strand, in that order. + //A request for field 6+ will go to the otherIdxs. + + switch (fieldNum) { + case 1: + return _chrName; + break; + case 2: + return _startPosStr; + break; + case 3: + return _name; + case 4: + return _varAlt; + break; + case 5: + return _varRef; + break; + case 6: + return _score; + default: + return BedPlusInterval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/VcfRecord.h b/src/utils/FileRecordTools/Records/VcfRecord.h index c4325f948144d8a6077c7afbf6c7fc16d5a91776..72aafe5fef77d8e8e159085650d619a6c1ff9ce6 100644 --- a/src/utils/FileRecordTools/Records/VcfRecord.h +++ b/src/utils/FileRecordTools/Records/VcfRecord.h @@ -29,6 +29,7 @@ public: //if the number of fields frequently differ between this object and the one being copied. const BedPlusInterval &operator=(const VcfRecord &other); + virtual const QuickString &getField(int fieldNum) const; protected: QuickString _varAlt; diff --git a/src/utils/FileRecordTools/Records/recordsTar.tar.gz b/src/utils/FileRecordTools/Records/recordsTar.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..321a88e4935d73f7034cce0611f9191d27bfeaa2 Binary files /dev/null and b/src/utils/FileRecordTools/Records/recordsTar.tar.gz differ diff --git a/test/intersect/d.bed b/test/intersect/d.bed new file mode 100644 index 0000000000000000000000000000000000000000..27061a4a23c160d7106c1eb20209d2830c1141fe --- /dev/null +++ b/test/intersect/d.bed @@ -0,0 +1 @@ +chr1 5 15 diff --git a/test/intersect/new_test-intersect.sh b/test/intersect/new_test-intersect.sh index 85c6a8bbbf77c8f3f2ff7600bf8f6c4cc01efb4b..8516c7cb3124956846e8fda2322777593f12eb77 100755 --- a/test/intersect/new_test-intersect.sh +++ b/test/intersect/new_test-intersect.sh @@ -223,8 +223,8 @@ rm obs exp ########################################################### -# Test intersection of bam file containing read where one -# mate is mapped and one is not. +# Test intersection of bam file containing read where query +# is mapped and mate is not. ############################################################ echo " intersect.new.t19...\c" echo \ @@ -234,8 +234,8 @@ check obs exp rm obs exp ########################################################### -# Test intersection of bam file containing read where one -# mate is mapped and one is not, with noHit (-v) option. +# Test intersection of bam file containing read where query +# is mapped and mate is not, with noHit (-v) option. ############################################################ echo " intersect.new.t20...\c" echo \ @@ -245,6 +245,30 @@ check obs exp rm obs exp +########################################################### +# Test intersection of bam file containing read where query +# is unmapped but mate is mapped. +############################################################ +echo " intersect.new.t20.b...\c" +touch exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test intersection of bam file containing read where one +# mate is mapped and one is not, with noHit (-v) option. +############################################################ +echo " intersect.new.t20.c...\c" +echo ". -1 -1 TTTACCTTT:4FSQ5P1:286:D2GA7ACXX:6:2316:20858:89646 0 . -1 -1 -1 0,0,0 0 . ." > exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed -v > obs +check obs exp +rm obs exp + + + + + ########################################################### # Test intersection with -sorted, see that order specified # in genome file is enforced. diff --git a/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam b/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam new file mode 100644 index 0000000000000000000000000000000000000000..1ecaaae0cef02ede4312af6b87a55764112e3382 Binary files /dev/null and b/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam differ diff --git a/test/intersect/test-intersect.sh b/test/intersect/test-intersect.sh index 40e2ec487651044a4c43baa0877e63547555eaa4..da0ba967e8530b2b6feb0f366c3acc6eb4e6ec5b 100644 --- a/test/intersect/test-intersect.sh +++ b/test/intersect/test-intersect.sh @@ -129,6 +129,7 @@ $BT intersect -a a.bed -b b.bed -wo -s > obs check obs exp rm obs exp + ########################################################### # Test with -wao (write all overlap) with -s ############################################################ @@ -255,6 +256,67 @@ rm obs exp +########################################################### +# Test three blocks with -split -wo only shows 5 overlap +# bases, not ten. +############################################################ +echo " intersect.t22.a...\c" +echo "chr1 0 50 three_blocks_match 0 + 0 0 0 3 10,10,10, 0,20,40, chr1 5 15 5" > exp +$BT intersect -a three_blocks_match.bed -b d.bed -split -wo > obs +check obs exp +rm obs exp + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.b...\c" +echo "chr1 0 50 three_blocks_match 255 + 0 50 0,0,0 3 10,10,10, 0,20,40, chr1 5 15 5" > exp +$BT intersect -a three_blocks_match.bam -b d.bed -split -wo -bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test three blocks with -split -wo, and DB record also has +# blocks that somewhat intersect +############################################################ +echo " intersect.t22.c...\c" +echo "chr1 0 50 three_blocks_match 0 + 0 0 0 3 10,10,10, 0,20,40, chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35, 10" > exp +$BT intersect -a three_blocks_match.bed -b two_blocks_partial.bed -split -wo > obs +check obs exp +rm obs exp + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.d...\c" +echo "chr1 0 50 three_blocks_match 255 + 0 50 0,0,0 3 10,10,10, 0,20,40, chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35, 10" > exp +$BT intersect -a three_blocks_match.bam -b two_blocks_partial.bed -split -wo -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test three blocks with -split -wo, and DB record also has +# blocks that do not intersect +############################################################ +echo " intersect.t22.e...\c" +touch exp +$BT intersect -a three_blocks_match.bed -b three_blocks_nomatch.bed -split -wo > obs +check obs exp +rm obs exp + + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.f...\c" +touch exp +$BT intersect -a three_blocks_match.bam -b three_blocks_nomatch.bed -split -wo -bed > obs +check obs exp +rm obs exp + + + ################################################################## # Test that only the mapped read is is found as an intersection ################################################################## diff --git a/test/intersect/three_blocks_match.bam b/test/intersect/three_blocks_match.bam new file mode 100644 index 0000000000000000000000000000000000000000..f67547b5d0b324171cc5148d9d49a09b7cb322ff Binary files /dev/null and b/test/intersect/three_blocks_match.bam differ diff --git a/test/intersect/two_blocks_partial.bam b/test/intersect/two_blocks_partial.bam new file mode 100644 index 0000000000000000000000000000000000000000..3a82822b49a0adffcdafe9b4b81aa7841db1cde1 Binary files /dev/null and b/test/intersect/two_blocks_partial.bam differ diff --git a/test/intersect/two_blocks_partial.bed b/test/intersect/two_blocks_partial.bed new file mode 100644 index 0000000000000000000000000000000000000000..8a83f351da64186c42d6b8945a8cab750a021601 --- /dev/null +++ b/test/intersect/two_blocks_partial.bed @@ -0,0 +1 @@ +chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35,