From 61b7a97626bc8a535fc2a5953f758b19926a0766 Mon Sep 17 00:00:00 2001 From: arq5x <arq5x@virginia.edu> Date: Sat, 1 Feb 2014 16:35:47 -0500 Subject: [PATCH] add methods to extract column from record by its col. number --- .../FileRecordTools/Records/BamRecord.cpp | 213 ++++++++++-------- src/utils/FileRecordTools/Records/BamRecord.h | 3 + .../FileRecordTools/Records/Bed3Interval.cpp | 18 ++ .../FileRecordTools/Records/Bed3Interval.h | 3 + .../FileRecordTools/Records/Bed4Interval.cpp | 12 + .../FileRecordTools/Records/Bed4Interval.h | 4 + .../FileRecordTools/Records/Bed5Interval.cpp | 15 ++ .../FileRecordTools/Records/Bed5Interval.h | 4 + .../FileRecordTools/Records/Bed6Interval.cpp | 17 ++ .../FileRecordTools/Records/Bed6Interval.h | 4 + .../Records/BedGraphInterval.cpp | 12 + .../Records/BedGraphInterval.h | 4 + .../Records/BedPlusInterval.cpp | 29 +-- .../FileRecordTools/Records/BedPlusInterval.h | 4 +- src/utils/FileRecordTools/Records/Record.cpp | 14 ++ src/utils/FileRecordTools/Records/Record.h | 3 + 16 files changed, 236 insertions(+), 123 deletions(-) diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp index cde07329..4c5cd8dc 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.cpp +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -1,3 +1,10 @@ +/* + * BamRecord.cpp + * + * Created on: Jan 14, 2014 + * Author: nek3d + */ + #include "BamRecord.h" #include "BamFileReader.h" #include "RecordKeyList.h" @@ -15,143 +22,155 @@ BamRecord::~BamRecord() const BamRecord &BamRecord::operator=(const BamRecord &other) { - Bed6Interval::operator=(other); - _bamAlignment = other._bamAlignment; - return *this; + Bed6Interval::operator=(other); + _bamAlignment = other._bamAlignment; + return *this; } bool BamRecord::initFromFile(FileReader *fileReader) { - BamFileReader *bamFileReader = static_cast<BamFileReader*>(fileReader); - return initFromFile(bamFileReader); + BamFileReader *bamFileReader = static_cast<BamFileReader*>(fileReader); + return initFromFile(bamFileReader); } bool BamRecord::initFromFile(BamFileReader *bamFileReader) { - bamFileReader->getChrName(_chrName); - - _bamChromId = bamFileReader->getCurrChromdId(); - _startPos = bamFileReader->getStartPos(); - int2str(_startPos, _startPosStr); - _endPos = bamFileReader->getEndPos(); - int2str(_endPos, _endPosStr); - bamFileReader->getName(_name); - bamFileReader->getScore(_score); - char strandChar = bamFileReader->getStrand(); - setStrand(strandChar); - - _bamAlignment = bamFileReader->getAlignment(); - _isUnmapped = !_bamAlignment.IsMapped(); - _isMateUnmapped = !_bamAlignment.IsMateMapped(); - return true; + bamFileReader->getChrName(_chrName); + + _bamChromId = bamFileReader->getCurrChromdId(); + _startPos = bamFileReader->getStartPos(); + int2str(_startPos, _startPosStr); + _endPos = bamFileReader->getEndPos(); + int2str(_endPos, _endPosStr); + bamFileReader->getName(_name); + bamFileReader->getScore(_score); + char strandChar = bamFileReader->getStrand(); + setStrand(strandChar); + + _bamAlignment = bamFileReader->getAlignment(); + _isUnmapped = !_bamAlignment.IsMapped(); + _isMateUnmapped = !_bamAlignment.IsMateMapped(); + return true; } void BamRecord::clear() { - Bed6Interval::clear(); - _bamChromId = -1; - - - //Clear the BamAlignment object. Sadly, it does not have a clear() method, - //so we have to do each member manually. - _bamAlignment.Name.clear(); - _bamAlignment.Length = 0; - _bamAlignment.QueryBases.clear(); - _bamAlignment.AlignedBases.clear(); - _bamAlignment.Qualities.clear(); - _bamAlignment.TagData.clear(); - _bamAlignment.RefID = -1; - _bamAlignment.Position = -1; - _bamAlignment.Bin = 0; - _bamAlignment.MapQuality = 0; - _bamAlignment.AlignmentFlag = 0; - _bamAlignment.CigarData.clear(); - _bamAlignment.MateRefID = -1; - _bamAlignment.MatePosition = -1; - _bamAlignment.InsertSize = -1; - _bamAlignment.Filename.clear(); - - _bamAlignment.SupportData.AllCharData.clear(); - _bamAlignment.SupportData.BlockLength = 0; - _bamAlignment.SupportData.NumCigarOperations = 0; - _bamAlignment.SupportData.QueryNameLength = 0; - _bamAlignment.SupportData.QuerySequenceLength = 0; - _bamAlignment.SupportData.HasCoreOnly = false; - - _bamAlignment.ErrorString.clear(); + Bed6Interval::clear(); + _bamChromId = -1; + + + //Clear the BamAlignment object. Sadly, it does not have a clear() method, + //so we have to do each member manually. + _bamAlignment.Name.clear(); + _bamAlignment.Length = 0; + _bamAlignment.QueryBases.clear(); + _bamAlignment.AlignedBases.clear(); + _bamAlignment.Qualities.clear(); + _bamAlignment.TagData.clear(); + _bamAlignment.RefID = -1; + _bamAlignment.Position = -1; + _bamAlignment.Bin = 0; + _bamAlignment.MapQuality = 0; + _bamAlignment.AlignmentFlag = 0; + _bamAlignment.CigarData.clear(); + _bamAlignment.MateRefID = -1; + _bamAlignment.MatePosition = -1; + _bamAlignment.InsertSize = -1; + _bamAlignment.Filename.clear(); + + _bamAlignment.SupportData.AllCharData.clear(); + _bamAlignment.SupportData.BlockLength = 0; + _bamAlignment.SupportData.NumCigarOperations = 0; + _bamAlignment.SupportData.QueryNameLength = 0; + _bamAlignment.SupportData.QuerySequenceLength = 0; + _bamAlignment.SupportData.HasCoreOnly = false; + + _bamAlignment.ErrorString.clear(); } void BamRecord::print(QuickString &outBuf, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf); + Bed6Interval::print(outBuf); printRemainingBamFields(outBuf, keyList); } void BamRecord::print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf, start, end); + Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); } void BamRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const { - Bed6Interval::print(outBuf, start, end); + Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); } void BamRecord::printNull(QuickString &outBuf) const { - Bed6Interval::printNull(outBuf); - outBuf.append("\t.\t.\t.\t.\t.\t.", 12); + Bed6Interval::printNull(outBuf); + outBuf.append("\t.\t.\t.\t.\t.\t.", 12); } void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const { - outBuf.append('\t'); - outBuf.append(_bamAlignment.Position); - outBuf.append('\t'); - outBuf.append(_endPos); - outBuf.append("\t0,0,0", 6); - outBuf.append('\t'); - - int numBlocks = (int)keyList->size(); - - if (numBlocks > 0) { - outBuf.append(numBlocks); - - vector<int> blockLengths; - vector<int> blockStarts; - for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { - const Record *block = iter->value(); - blockLengths.push_back(block->getEndPos() - block->getStartPos()); - blockStarts.push_back(block->getStartPos() - _bamAlignment.Position); - } - - outBuf.append('\t'); - for (int i=0; i < (int)blockLengths.size(); i++) { - outBuf.append(blockLengths[i]); - outBuf.append(','); - } - outBuf.append('\t'); - for (int i=0; i < (int)blockStarts.size(); i++) { - outBuf.append( blockStarts[i]); - outBuf.append(','); - } - } - else { - outBuf.append("1\t0,\t0,"); - } + outBuf.append('\t'); + outBuf.append(_bamAlignment.Position); + outBuf.append('\t'); + outBuf.append(_endPos); + outBuf.append("\t0,0,0", 6); + outBuf.append('\t'); + + int numBlocks = (int)keyList->size(); + + if (numBlocks > 0) { + outBuf.append(numBlocks); + + vector<int> blockLengths; + vector<int> blockStarts; + for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { + const Record *block = iter->value(); + blockLengths.push_back(block->getEndPos() - block->getStartPos()); + blockStarts.push_back(block->getStartPos() - _bamAlignment.Position); + } + + outBuf.append('\t'); + for (int i=0; i < (int)blockLengths.size(); i++) { + outBuf.append(blockLengths[i]); + outBuf.append(','); + } + outBuf.append('\t'); + for (int i=0; i < (int)blockStarts.size(); i++) { + outBuf.append( blockStarts[i]); + outBuf.append(','); + } + } + else { + outBuf.append("1\t0,\t0,"); + } } void BamRecord::printUnmapped(QuickString &outBuf) const { - outBuf.append(_chrName.empty() ? "." : _chrName); - outBuf.append("\t-1\t-1\t"); - outBuf.append(_name.empty() ? "." : _name); - outBuf.append('\t'); - outBuf.append(_score.empty() ? "." : _score); - outBuf.append("\t.\t-1\t-1\t-1\t0,0,0\t0\t.\t."); // dot for strand, -1 for blockStarts and blockEnd + outBuf.append(_chrName.empty() ? "." : _chrName); + outBuf.append("\t-1\t-1\t"); + outBuf.append(_name.empty() ? "." : _name); + outBuf.append('\t'); + outBuf.append(_score.empty() ? "." : _score); + outBuf.append("\t.\t-1\t-1\t-1\t0,0,0\t0\t.\t."); // dot for strand, -1 for blockStarts and blockEnd +} + +const QuickString &BamRecord::getField(int fieldNum) const +{ + //TBD: Determine what correct behavior should be. + //I.e. if users requests field 2, do they want Flag + //for Bam Records, or startPos for all records? -NEK 1/14/14. + + return Bed6Interval::getField(fieldNum); } + + + diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h index 262ef1ab..b74dbc2c 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.h +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -38,6 +38,9 @@ public: const BamTools::BamAlignment &getAlignment() const { return _bamAlignment; } int getBamChromId() const { return _bamChromId; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 12; } + protected: BamTools::BamAlignment _bamAlignment; int _bamChromId; //different from chromId, because BAM file may be in different order diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp index 742ea7b9..3f896be5 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -61,3 +61,21 @@ void Bed3Interval::print(QuickString &outBuf, const QuickString & start, const Q void Bed3Interval::printNull(QuickString &outBuf) const { outBuf.append(".\t-1\t-1", 7); } + +const QuickString &Bed3Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 1: + return _chrName; + break; + case 2: + return _startPosStr; + break; + case 3: + return _endPosStr; + break; + default: + return Record::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.h b/src/utils/FileRecordTools/Records/Bed3Interval.h index a3015e0b..9f1ff118 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.h +++ b/src/utils/FileRecordTools/Records/Bed3Interval.h @@ -29,6 +29,9 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED3_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 3; } + protected: virtual ~Bed3Interval(); diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.cpp b/src/utils/FileRecordTools/Records/Bed4Interval.cpp index 13f5265e..c1ef81a3 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed4Interval.cpp @@ -48,3 +48,15 @@ void Bed4Interval::printNull(QuickString &outBuf) const outBuf.append("\t.", 2); } +const QuickString &Bed4Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/Bed4Interval.h b/src/utils/FileRecordTools/Records/Bed4Interval.h index 4d9fda7a..f42817c4 100644 --- a/src/utils/FileRecordTools/Records/Bed4Interval.h +++ b/src/utils/FileRecordTools/Records/Bed4Interval.h @@ -26,6 +26,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED4_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 4; } + + protected: virtual ~Bed4Interval(); }; diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.cpp b/src/utils/FileRecordTools/Records/Bed5Interval.cpp index 8e643608..7307fb66 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed5Interval.cpp @@ -55,3 +55,18 @@ void Bed5Interval::printNull(QuickString &outBuf) const outBuf.append("\t.\t-1", 5); } +const QuickString &Bed5Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + case 5: + return _score; + break; + + default: + return Bed3Interval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed5Interval.h b/src/utils/FileRecordTools/Records/Bed5Interval.h index 01d65966..bc913d1d 100644 --- a/src/utils/FileRecordTools/Records/Bed5Interval.h +++ b/src/utils/FileRecordTools/Records/Bed5Interval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED5_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 5; } + + protected: virtual ~Bed5Interval(); }; diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.cpp b/src/utils/FileRecordTools/Records/Bed6Interval.cpp index b486333a..8371553a 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed6Interval.cpp @@ -64,3 +64,20 @@ void Bed6Interval::printNull(QuickString &outBuf) const outBuf.append("\t.\t-1\t.", 7); } +const QuickString &Bed6Interval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + case 5: + return _score; + break; + case 6: + return _strand; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} diff --git a/src/utils/FileRecordTools/Records/Bed6Interval.h b/src/utils/FileRecordTools/Records/Bed6Interval.h index 91a00e09..9ad9f80b 100644 --- a/src/utils/FileRecordTools/Records/Bed6Interval.h +++ b/src/utils/FileRecordTools/Records/Bed6Interval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED6_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 6; } + + protected: virtual ~Bed6Interval(); }; diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp index e089782c..e0808573 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.cpp @@ -48,3 +48,15 @@ void BedGraphInterval::printNull(QuickString &outBuf) const outBuf.append("\t.", 2); } +const QuickString &BedGraphInterval::getField(int fieldNum) const +{ + switch (fieldNum) { + case 4: + return _name; + break; + default: + return Bed3Interval::getField(fieldNum); + break; + } +} + diff --git a/src/utils/FileRecordTools/Records/BedGraphInterval.h b/src/utils/FileRecordTools/Records/BedGraphInterval.h index f3d612e8..1bdf619a 100644 --- a/src/utils/FileRecordTools/Records/BedGraphInterval.h +++ b/src/utils/FileRecordTools/Records/BedGraphInterval.h @@ -25,6 +25,10 @@ public: virtual void printNull(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BEDGRAPH_RECORD_TYPE; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return 4; } + + protected: virtual ~BedGraphInterval(); }; diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp index d3e0c01a..fc8be368 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.cpp +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.cpp @@ -105,36 +105,15 @@ void BedPlusInterval::printNull(QuickString &outBuf) const for (int i=startOtherIdx; i < _numPrintFields; i++) { outBuf.append("\t."); } - } -QuickString BedPlusInterval::getField(int fieldNum) const +const QuickString &BedPlusInterval::getField(int fieldNum) const { //a request for any of the first six fields will retrieve //chrom, start, end, name, score, and strand, in that order. //A request for field 6+ will go to the otherIdxs. - - switch (fieldNum) { - case 0: - return _chrName; - break; //redundant after a return, but good practice anyway. - case 1: - return _startPosStr; - break; - case 2: - return _endPosStr; - break; - case 3: - return _name; - break; - case 4: - return _score; - break; - case 5: - return _strand; - break; - default: - return (*(_otherIdxs[fieldNum - startOtherIdx])); - break; + if (fieldNum > startOtherIdx && fieldNum <= startOtherIdx + (int)_otherIdxs.size()) { + return (*(_otherIdxs[fieldNum - startOtherIdx - 1])); } + return Bed6Interval::getField(fieldNum); } diff --git a/src/utils/FileRecordTools/Records/BedPlusInterval.h b/src/utils/FileRecordTools/Records/BedPlusInterval.h index 4ebd3b67..4b98b4f3 100644 --- a/src/utils/FileRecordTools/Records/BedPlusInterval.h +++ b/src/utils/FileRecordTools/Records/BedPlusInterval.h @@ -30,7 +30,9 @@ public: //if the number of fields frequently differ between this object and the one being copied. const BedPlusInterval &operator=(const BedPlusInterval &other); - virtual QuickString getField(int fieldNum) const; + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const { return startOtherIdx + _otherIdxs.size(); } + virtual void setField(int fieldNum, const QuickString &str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setField(int fieldNum, const string &str) { (*(_otherIdxs[fieldNum])) = str; } virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; } diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 6bad45dd..8e7d7911 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -192,3 +192,17 @@ ostream &operator << (ostream &out, const Record &record) out << errBuf; return out; } + +const QuickString &Record::getField(int fieldNum) const +{ +// try { +// _column_vec.push_back(hits[i].fields.at(_column)); +// } +// catch(std::out_of_range& e) { + cerr << endl << "*****" << endl + << "*****ERROR: requested column " << fieldNum << + " , but record only has fields 1 - " << getNumFields() << ". Exiting." << endl + << endl << "*****" << endl; + exit(1); +// } +} diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 92df4bad..2c303d90 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -82,6 +82,9 @@ public: virtual void setScore(const string &chr) { _score = chr; } virtual void setScore(const char *chr) { _score = chr; } + virtual const QuickString &getField(int fieldNum) const; + virtual int getNumFields() const = 0; + virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; } virtual bool coordsValid(); //test that no coords negative, end not less than start, check zeroLength (see below). -- GitLab