Commit 0c046ca8 authored by Neil Kindlon
Browse files

Fixed Bug 254: stranded merge of BedPlus records w/ strand. Added strand to...

Fixed Bug 254: stranded merge of BedPlus records w/ strand. Added strand to default output for stranded merge. Added and updated unit tests.
parent 107f803e
......@@ -2,15 +2,31 @@
#include "SingleLineDelimTextFileReader.h"
// Default constructor: zeroes the print-field count and hands the fixed-field
// count to _plusFields as its offset.
// NOTE(review): this span appears to interleave the pre-change and post-change
// lines of a diff hunk (two initializer lists, two setNumOffsetFields calls),
// so it is not compilable as shown -- confirm against the repository.
BedPlusInterval::BedPlusInterval()
: _numPrintFields(0)
: _numFixedFields(defaultNumFixedFields),
_numPrintFields(0)
{
// Offset passed via the static constant numFixedFields.
_plusFields.setNumOffsetFields(numFixedFields);
// Offset passed via the static constant defaultNumFixedFields.
_plusFields.setNumOffsetFields(defaultNumFixedFields);
}
// Sets how many leading columns this record treats as fixed fields and
// forwards the same count to _plusFields as its offset.
void BedPlusInterval::setNumFixedFields(int numFields)
{
	_numFixedFields = numFields;
	_plusFields.setNumOffsetFields(_numFixedFields);
}
// Populates this record from fileReader: the Bed3Interval part first, then
// (when the fixed-field count is not the default) name/score/strand, then the
// remaining plus fields.
// NOTE(review): this span appears to interleave the pre-change one-line return
// with the post-change body of a diff hunk -- confirm against the repository.
bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
// Single-expression form: succeeds only if both the base record and the plus
// fields initialise.
return (Bed3Interval::initFromFile(fileReader) && _plusFields.initFromFile(fileReader));
// Multi-statement form: keep the base-class result to return at the end.
bool baseRetFlag = Bed3Interval::initFromFile(fileReader);
// When the fixed-field count differs from the default, read reader fields
// 3, 4 and 5 into _name, _score and _strand.
if (_numFixedFields != defaultNumFixedFields) {
fileReader->getField(3, _name);
fileReader->getField(4, _score);
fileReader->getField(5, _strand);
adjustStrandVal();
}
// The return value of _plusFields.initFromFile is not used here; only the
// base-class result is returned.
_plusFields.initFromFile(fileReader);
return baseRetFlag;
}
......@@ -22,18 +38,21 @@ void BedPlusInterval::clear() {
// Appends this record to outBuf in three steps: the Bed3Interval columns,
// then the name/score/strand columns (emitted by printBed6PlusFields only
// when the fixed-field count is not the default), then the plus fields.
void BedPlusInterval::print(QuickString &outBuf) const
{
	Bed3Interval::print(outBuf);
	this->printBed6PlusFields(outBuf);
	_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but the caller-supplied integer start/end are
// forwarded to Bed3Interval::print for the leading columns.
void BedPlusInterval::print(QuickString &outBuf, int start, int end) const
{
	Bed3Interval::print(outBuf, start, end);
	this->printBed6PlusFields(outBuf);
	_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but the caller-supplied string start/end are
// forwarded to Bed3Interval::print for the leading columns.
void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const
{
	Bed3Interval::print(outBuf, start, end);
	this->printBed6PlusFields(outBuf);
	_plusFields.printFields(outBuf);
}
......@@ -41,14 +60,15 @@ void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, cons
// Appends a null ("." placeholder) rendering of this record to outBuf:
// the Bed3Interval null columns followed by one "\t." per remaining print field.
// NOTE(review): this span appears to interleave the pre-change and post-change
// loop headers of a diff hunk (the first `for` has no matching close) --
// confirm against the repository.
void BedPlusInterval::printNull(QuickString &outBuf) const
{
Bed3Interval::printNull(outBuf);
// Loop header counting from the static constant numFixedFields.
for (int i=numFixedFields; i < _numPrintFields; i++) {
// Emits "\t.\t.\t." when the fixed-field count is not the default.
printBed6PlusNullFields(outBuf);
// Loop header counting from the per-instance _numFixedFields.
for (int i=_numFixedFields; i < _numPrintFields; i++) {
outBuf.append("\t.");
}
}
const QuickString &BedPlusInterval::getField(int fieldNum) const
{
if (fieldNum > numFixedFields) {
if (fieldNum > _numFixedFields) {
return _plusFields.getField(fieldNum);
}
return Bed3Interval::getField(fieldNum);
......@@ -61,9 +81,27 @@ bool BedPlusInterval::isNumericField(int fieldNum) {
// fields after the 3rd are numeric, so for now we'll give the user the
// benefit of the doubt on those.
//
if (fieldNum > numFixedFields) {
if (fieldNum > defaultNumFixedFields) {
return true;
}
return Bed3Interval::isNumericField(fieldNum);
}
// Appends tab-prefixed _name, _score and _strand to outBuf, but only when the
// fixed-field count differs from defaultNumFixedFields; otherwise does nothing.
void BedPlusInterval::printBed6PlusFields(QuickString &outBuf) const
{
	if (_numFixedFields == defaultNumFixedFields) {
		return;
	}

	outBuf.append('\t');
	outBuf.append(_name);

	outBuf.append('\t');
	outBuf.append(_score);

	outBuf.append('\t');
	outBuf.append(_strand);
}
// Appends three tab-prefixed "." placeholders to outBuf, but only when the
// fixed-field count differs from defaultNumFixedFields; otherwise does nothing.
void BedPlusInterval::printBed6PlusNullFields(QuickString &outBuf) const
{
	if (_numFixedFields == defaultNumFixedFields) {
		return;
	}
	outBuf.append("\t.\t.\t.");
}
......@@ -19,6 +19,7 @@ public:
BedPlusInterval();
virtual ~BedPlusInterval() {}
void setNumFixedFields(int numFields);
virtual bool initFromFile(SingleLineDelimTextFileReader *);
virtual void clear();
virtual void print(QuickString &outBuf) const;
......@@ -28,7 +29,7 @@ public:
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; }
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return numFixedFields + _plusFields.size(); }
virtual int getNumFields() const { return _numFixedFields + _plusFields.size(); }
virtual void setNumPrintFields(int num) { _numPrintFields = num; }
virtual int getNumPrintFields() const { return _numPrintFields; }
......@@ -36,10 +37,14 @@ public:
protected:
static const int numFixedFields = 3; //first three fields have names, and are not stored in otherIdxs.
int _numFixedFields; //first fields have names, and are not stored in otherIdxs.
static const int defaultNumFixedFields = 3;
PlusFields _plusFields;
int _numPrintFields;
void printBed6PlusFields(QuickString &outBuf) const;
void printBed6PlusNullFields(QuickString &outBuf) const;
};
......
......@@ -45,6 +45,7 @@ RecordMgr::RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize)
break;
}
case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
case FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE:
{
_freeList = new FreeList<BedPlusInterval>(_freeListBlockSize);
break;
......@@ -119,6 +120,7 @@ RecordMgr::~RecordMgr()
break;
}
case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
case FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE:
{
delete (FreeList<BedPlusInterval> *)_freeList;
break;
......@@ -200,8 +202,12 @@ Record *RecordMgr::allocateRecord()
break;
}
case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
case FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE:
{
BedPlusInterval *bPi = ((FreeList<BedPlusInterval> *)_freeList)->newObj();
if (_recordType == FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE) {
bPi->setNumFixedFields(6);
}
record = bPi;
break;
}
......@@ -284,6 +290,7 @@ void RecordMgr::deleteRecord(const Record *record)
break;
}
case FileRecordTypeChecker::BED_PLUS_RECORD_TYPE:
case FileRecordTypeChecker::BED6_PLUS_RECORD_TYPE:
{
((FreeList<BedPlusInterval> *)_freeList)->deleteObj(static_cast<const BedPlusInterval *>(record));
break;
......
......@@ -82,6 +82,12 @@ bool RecordOutputMgr::printKeyAndTerminate(RecordKeyVector &keyList) {
//bed3 format, which is surprisingly difficult to do. Had to use the following:
const Bed3Interval *bed3 = static_cast<const Bed3Interval *>(keyList.getKey());
bed3->Bed3Interval::print(_outBuf);
//in addition, if we're doing stranded merges, we need to print the strand sign.
if (_context->getDesiredStrand() != FileRecordMergeMgr::ANY_STRAND) {
_outBuf.append("\t");
_outBuf.append(keyList.getKey()->getStrand());
}
return false;
}
printBamType bamCode = printBamRecord(keyList);
......
......@@ -30,6 +30,7 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_hasName[BED6_RECORD_TYPE] = true;
_hasName[BED12_RECORD_TYPE] = true;
_hasName[BED_PLUS_RECORD_TYPE] = true;
_hasName[BED6_PLUS_RECORD_TYPE] = true;
_hasName[BAM_RECORD_TYPE] = true;
_hasName[VCF_RECORD_TYPE] = true;
_hasName[GFF_RECORD_TYPE] = true;
......@@ -41,6 +42,7 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_hasScore[BED6_RECORD_TYPE] = true;
_hasScore[BED12_RECORD_TYPE] = true;
_hasScore[BED_PLUS_RECORD_TYPE] = true;
_hasScore[BED6_PLUS_RECORD_TYPE] = true;
_hasScore[BAM_RECORD_TYPE] = true;
_hasScore[VCF_RECORD_TYPE] = true;
_hasScore[GFF_RECORD_TYPE] = true;
......@@ -51,7 +53,8 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_hasStrand[BED3_RECORD_TYPE] = false;
_hasStrand[BED6_RECORD_TYPE] = true;
_hasStrand[BED12_RECORD_TYPE] = true;
_hasStrand[BED_PLUS_RECORD_TYPE] = true;
_hasStrand[BED_PLUS_RECORD_TYPE] = false;
_hasStrand[BED6_PLUS_RECORD_TYPE] = true;
_hasStrand[BAM_RECORD_TYPE] = true;
_hasStrand[VCF_RECORD_TYPE] = true;
_hasStrand[GFF_RECORD_TYPE] = true;
......@@ -196,7 +199,12 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
} else if (_numFields == 12 && passesBed12()) {
_recordType = BED12_RECORD_TYPE;
} else if (_numFields >3) {
_recordType = BED_PLUS_RECORD_TYPE;
if (_numFields >= 6 && isStrandField(5)) {
_recordType = BED6_PLUS_RECORD_TYPE;
} else {
_recordType = BED_PLUS_RECORD_TYPE;
}
}
return true;
}
......
......@@ -31,7 +31,8 @@ public:
GFF_FILE_TYPE, GZIP_FILE_TYPE, BAM_FILE_TYPE, VCF_FILE_TYPE} FILE_TYPE;
typedef enum { UNKNOWN_RECORD_TYPE, EMPTY_RECORD_TYPE, BED3_RECORD_TYPE, BED4_RECORD_TYPE, BEDGRAPH_RECORD_TYPE, BED5_RECORD_TYPE,
BED6_RECORD_TYPE, BED12_RECORD_TYPE, BED_PLUS_RECORD_TYPE, BAM_RECORD_TYPE, VCF_RECORD_TYPE, GFF_RECORD_TYPE, GFF_PLUS_RECORD_TYPE} RECORD_TYPE;
BED6_RECORD_TYPE, BED12_RECORD_TYPE, BED_PLUS_RECORD_TYPE, BED6_PLUS_RECORD_TYPE, BAM_RECORD_TYPE, VCF_RECORD_TYPE, GFF_RECORD_TYPE,
GFF_PLUS_RECORD_TYPE} RECORD_TYPE;
void setFilename(const QuickString & filename) { _filename = filename; }
bool scanBuffer(const char *buf, size_t len, bool eofHit);
......
#chr start stop name score strand
chr1 10000 20000 gene1 50 -
chr1 20100 25000 gene1 50 -
#chr start stop name score strand
chr1 10000 20000 gene1 50 - foo
chr1 20100 25000 gene1 50 - bar
#chr start stop name score strand
chr1 10000 20000 gene1 50 - foo
chr1 20100 25000 gene1 50 + bar
......@@ -127,7 +127,7 @@ chr2 10 20 + a1 5 1
chr2 30 40 + a2 6 1
chr2 42 50 + a3 7 1
chr2 45 100 - a4 8 1" > exp
$BT merge -i a.full.bed -s -c 6,4,5,6 -o distinct,collapse,sum,count > obs
$BT merge -i a.full.bed -s -c 4,5,6 -o collapse,sum,count > obs
check obs exp
rm obs exp
......@@ -204,7 +204,7 @@ echo \
chr1 20 90 -
chr2 20 60 +
chr2 25 80 -" > exp
$BT merge -i mixedStrands.bed -s -c 6 -o distinct > obs
$BT merge -i mixedStrands.bed -s > obs
check exp obs
rm obs exp
......@@ -216,7 +216,7 @@ echo " merge.t16...\c"
echo \
"chr1 10 80 +
chr2 20 60 +" > exp
$BT merge -i mixedStrands.bed -S + -c 6 -o distinct > obs
$BT merge -i mixedStrands.bed -S + > obs
check exp obs
rm obs exp
......@@ -228,7 +228,7 @@ echo " merge.t17...\c"
echo \
"chr1 20 90 -
chr2 25 80 -" > exp
$BT merge -i mixedStrands.bed -S - -c 6 -o distinct > obs
$BT merge -i mixedStrands.bed -S - > obs
check exp obs
rm obs exp
......@@ -535,7 +535,7 @@ rm obs exp
###########################################################
echo " merge.t44...\c"
echo \
"19 252805 253194
"19 252805 257416
19 260364 261044
19 265133 265691
19 265985 266386" > exp
......@@ -582,3 +582,24 @@ $BT merge -i precisionTest2.bed -c 8 -o sum -prec 5 > obs
check exp obs
rm obs exp
###########################################################
# Test stranded merge with bedPlus files that have strand
###########################################################
echo " merge.t48...\c"
echo \
"chr1 10000 25000 -" > exp
$BT merge -i bug254_d.bed -s -d 200 > obs
check exp obs
rm obs exp
###########################################################
# Test that stranded merge of bedPlus files keeps opposite strands separate
###########################################################
echo " merge.t49...\c"
echo \
"chr1 10000 20000 -
chr1 20100 25000 +" > exp
$BT merge -i bug254_e.bed -s -d 200 > obs
check exp obs
rm obs exp
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment