Commit cd012c6c authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Completed #181, GFF can have extra fields. Also, SVlen in VCF takes max of...

Completed #181, GFF can have extra fields. Also, SVlen in VCF takes max of more than two numbers (as opposed to min)
parent 69ca7e90
...@@ -154,7 +154,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() { ...@@ -154,7 +154,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
const char *currPtr = startPtr; const char *currPtr = startPtr;
const char *endPtr = _sLine.c_str() + _sLine.size(); const char *endPtr = _sLine.c_str() + _sLine.size();
int minVal = INT_MAX; int maxVal = INT_MIN;
int currVal = 0; int currVal = 0;
QuickString currValStr; QuickString currValStr;
while (1) { while (1) {
...@@ -162,7 +162,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() { ...@@ -162,7 +162,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
if (currPtr > startPtr) { if (currPtr > startPtr) {
currValStr.assign(startPtr, currPtr - startPtr); currValStr.assign(startPtr, currPtr - startPtr);
currVal = abs(str2chrPos(currValStr)); currVal = abs(str2chrPos(currValStr));
if (currVal < minVal) minVal = currVal; if (currVal > maxVal) maxVal = currVal;
startPtr = currPtr; startPtr = currPtr;
} }
...@@ -176,5 +176,5 @@ int SingleLineDelimTextFileReader::getVcfSVlen() { ...@@ -176,5 +176,5 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
} }
currPtr++; currPtr++;
}; };
return minVal; return maxVal;
} }
...@@ -4,106 +4,52 @@ ...@@ -4,106 +4,52 @@
BedPlusInterval::BedPlusInterval() BedPlusInterval::BedPlusInterval()
: _numPrintFields(0) : _numPrintFields(0)
{ {
_plusFields.setNumOffsetFields(numFixedFields);
} }
BedPlusInterval::~BedPlusInterval()
{
for (int i=0; i < (int)_otherIdxs.size(); i++) {
delete _otherIdxs[i];
}
}
const BedPlusInterval &BedPlusInterval::operator=(const BedPlusInterval &other) {
Bed3Interval::operator=(other);
int otherSize = other._otherIdxs.size();
int mySize = _otherIdxs.size();
_numPrintFields = other._numPrintFields;
int numMatchingFields = min(mySize, otherSize);
for (int i=0; i < numMatchingFields; i++) {
(*(_otherIdxs[i])) = (*(other._otherIdxs[i]));
}
if (mySize < otherSize) {
for (int i = mySize; i < otherSize; i++) {
QuickString *pqs = new QuickString(*(other._otherIdxs[i]));
_otherIdxs.push_back(pqs);
}
} else if (mySize > otherSize) {
for (int i= otherSize; i < mySize; i++) {
delete _otherIdxs[i];
}
_otherIdxs.resize(otherSize);
}
return *this;
}
bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader) bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader)
{ {
return (Bed3Interval::initFromFile(fileReader) && initOtherFieldsFromFile(fileReader)); return (Bed3Interval::initFromFile(fileReader) && _plusFields.initFromFile(fileReader));
} }
bool BedPlusInterval::initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader)
{
int numFields = fileReader->getNumFields() - startOtherIdx;
if ((int)_otherIdxs.size() != numFields) {
if ((int)_otherIdxs.size() > 0) {
return false; //file had a number of fields not matching what was expected.
}
for (int i=0; i < numFields; i++) {
_otherIdxs.push_back(new QuickString());
}
}
for (int i=0; i < numFields; i++) {
fileReader->getField(i + startOtherIdx, (*(_otherIdxs[i])));
}
return true;
}
void BedPlusInterval::clear() { void BedPlusInterval::clear() {
Bed3Interval::clear(); Bed3Interval::clear();
_numPrintFields = 0; _plusFields.clear();
for (int i=0; i < (int)_otherIdxs.size(); i++) {
_otherIdxs[i]->clear();
}
} }
void BedPlusInterval::print(QuickString &outBuf) const void BedPlusInterval::print(QuickString &outBuf) const
{ {
Bed3Interval::print(outBuf); Bed3Interval::print(outBuf);
printOtherFields(outBuf); _plusFields.printFields(outBuf);
} }
void BedPlusInterval::print(QuickString &outBuf, int start, int end) const void BedPlusInterval::print(QuickString &outBuf, int start, int end) const
{ {
Bed3Interval::print(outBuf, start, end); Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf); _plusFields.printFields(outBuf);
} }
void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const
{ {
Bed3Interval::print(outBuf, start, end); Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf); _plusFields.printFields(outBuf);
} }
void BedPlusInterval::printNull(QuickString &outBuf) const void BedPlusInterval::printNull(QuickString &outBuf) const
{ {
Bed3Interval::printNull(outBuf); Bed3Interval::printNull(outBuf);
for (int i=startOtherIdx; i < _numPrintFields; i++) { for (int i=numFixedFields; i < _numPrintFields; i++) {
outBuf.append("\t."); outBuf.append("\t.");
} }
} }
const QuickString &BedPlusInterval::getField(int fieldNum) const const QuickString &BedPlusInterval::getField(int fieldNum) const
{ {
//a request for any of the first three fields will retrieve if (fieldNum > numFixedFields) {
//chrom, start, end, in that order. return _plusFields.getField(fieldNum);
//A request for field 3+ will go to the otherIdxs.
if (fieldNum > startOtherIdx && fieldNum <= startOtherIdx + (int)_otherIdxs.size()) {
return (*(_otherIdxs[fieldNum - startOtherIdx - 1]));
} }
return Bed3Interval::getField(fieldNum); return Bed3Interval::getField(fieldNum);
} }
...@@ -112,20 +58,12 @@ bool BedPlusInterval::isNumericField(int fieldNum) { ...@@ -112,20 +58,12 @@ bool BedPlusInterval::isNumericField(int fieldNum) {
// //
// TBD: There is no currently no good way to guarantee / enforce whether // TBD: There is no currently no good way to guarantee / enforce whether
// fields after the 6th are numeric, so for now we'll give the user the // fields after the 3rd are numeric, so for now we'll give the user the
// benefit of the doubt on those. // benefit of the doubt on those.
// //
if (fieldNum > startOtherIdx) { if (fieldNum > numFixedFields) {
return true; return true;
} else {
return Bed3Interval::isNumericField(fieldNum);
} }
return Bed3Interval::isNumericField(fieldNum);
} }
void BedPlusInterval::printOtherFields(QuickString &outBuf) const {
for (int i=0; i < (int)_otherIdxs.size(); i++) {
outBuf.append('\t');
outBuf.append(*(_otherIdxs[i]));
}
}
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#define BEDPLUSINTERVAL_H_ #define BEDPLUSINTERVAL_H_
#include "Bed3Interval.h" #include "Bed3Interval.h"
#include <vector> #include "PlusFields.h"
class SingleLineDelimTextFileReader; class SingleLineDelimTextFileReader;
...@@ -18,6 +18,7 @@ public: ...@@ -18,6 +18,7 @@ public:
friend class FreeList<BedPlusInterval>; friend class FreeList<BedPlusInterval>;
BedPlusInterval(); BedPlusInterval();
virtual ~BedPlusInterval() {}
virtual bool initFromFile(SingleLineDelimTextFileReader *); virtual bool initFromFile(SingleLineDelimTextFileReader *);
virtual void clear(); virtual void clear();
virtual void print(QuickString &outBuf) const; virtual void print(QuickString &outBuf) const;
...@@ -26,29 +27,19 @@ public: ...@@ -26,29 +27,19 @@ public:
virtual void printNull(QuickString &outBuf) const; virtual void printNull(QuickString &outBuf) const;
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; } virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; }
//Note: using the assignment operator in a BedPlusInterval can potentially be a performance hit,
//if the number of fields frequently differ between this object and the one being copied.
const BedPlusInterval &operator=(const BedPlusInterval &other);
virtual const QuickString &getField(int fieldNum) const; virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return startOtherIdx + _otherIdxs.size(); } virtual int getNumFields() const { return numFixedFields + _plusFields.size(); }
virtual void setField(int fieldNum, const QuickString &str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setField(int fieldNum, const string &str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setNumPrintFields(int num) { _numPrintFields = num; } virtual void setNumPrintFields(int num) { _numPrintFields = num; }
virtual int getNumPrintFields() const { return _numPrintFields; } virtual int getNumPrintFields() const { return _numPrintFields; }
static bool isNumericField(int fieldNum); static bool isNumericField(int fieldNum);
protected: protected:
vector<QuickString *> _otherIdxs; static const int numFixedFields = 3; //first three fields have names, and are not stored in otherIdxs.
static const int startOtherIdx = 3; //first six fields have names, and are not stored in otherIdxs. PlusFields _plusFields;
int _numPrintFields; int _numPrintFields;
virtual ~BedPlusInterval();
bool initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader);
virtual void printOtherFields(QuickString &outBuf) const;
}; };
......
#include "GffPlusRecord.h"
#include "SingleLineDelimTextFileReader.h"
// Default constructor: starts with a print-field count of zero.
GffPlusRecord::GffPlusRecord() : _numPrintFields(0) {}
// Nothing to release explicitly; members clean up after themselves.
GffPlusRecord::~GffPlusRecord()
{
}
// Populates this record from the reader's current line.
// The GffRecord portion is read first; whatever field count it reports is
// then used as the offset at which the extra ("plus") fields begin.
// Returns false if the base record fails to initialize, otherwise the
// result of initializing the plus fields.
bool GffPlusRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	const bool baseOk = GffRecord::initFromFile(fileReader);
	if (baseOk) {
		_plusFields.setNumOffsetFields(GffRecord::getNumFields());
		return _plusFields.initFromFile(fileReader);
	}
	return false;
}
// Resets the record: first the GffRecord state, then the extra fields.
void GffPlusRecord::clear()
{
	GffRecord::clear();   // base-class portion
	_plusFields.clear();  // extra fields
}
// Appends the GffRecord output to outBuf, followed by the extra fields.
void GffPlusRecord::print(QuickString &outBuf) const {
	GffRecord::print(outBuf);
	_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but forwards the caller-supplied integer
// start/end values to GffRecord::print before appending the extra fields.
void GffPlusRecord::print(QuickString &outBuf, int start, int end) const {
	GffRecord::print(outBuf, start, end);
	_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but forwards the caller-supplied string
// start/end values to GffRecord::print before appending the extra fields.
void GffPlusRecord::print(QuickString &outBuf, const QuickString &start, const QuickString &end) const {
	GffRecord::print(outBuf, start, end);
	_plusFields.printFields(outBuf);
}
// Appends a "null" version of this record to outBuf: the GffRecord null
// output, then one "\t." placeholder for every print field beyond
// _numFields (nothing is added when _numPrintFields <= _numFields).
void GffPlusRecord::printNull(QuickString &outBuf) const {
	GffRecord::printNull(outBuf);
	int fieldIdx = _numFields;
	while (fieldIdx < _numPrintFields) {
		outBuf.append("\t.");
		++fieldIdx;
	}
}
// Returns the field with the given number.
// Field numbers above _numFields, up to and including the last extra
// field, are served from _plusFields. Every other number -- including one
// past the last extra field -- is delegated to GffRecord::getField, so an
// out-of-range request never indexes past the end of the extra-field
// storage (PlusFields::getField does no bounds checking of its own).
// NOTE(review): how GffRecord::getField reacts to an invalid field number
// is not visible here -- confirm it reports the error as desired.
const QuickString &GffPlusRecord::getField(int fieldNum) const
{
	const int lastPlusField = _numFields + static_cast<int>(_plusFields.size());
	if (fieldNum > _numFields && fieldNum <= lastPlusField) {
		return _plusFields.getField(fieldNum);
	}
	return GffRecord::getField(fieldNum);
}
// Reports whether the given field may be treated as numeric.
// Fields numbered 9 and above are always accepted as numeric; lower
// numbers defer to GffRecord::isNumericField.
bool GffPlusRecord::isNumericField(int fieldNum)
{
	return (fieldNum >= 9) || GffRecord::isNumericField(fieldNum);
}
/*
* GffPlusRecord.h
*
* Created on: Nov 13, 2012
* Author: nek3d
*/
#ifndef GFFPLUSRECORD_H_
#define GFFPLUSRECORD_H_
#include "GffRecord.h"
#include "PlusFields.h"
class SingleLineDelimTextFileReader;
// A GffRecord that additionally holds a PlusFields member (_plusFields);
// the total field count is the GffRecord count plus _plusFields.size().
class GffPlusRecord : public GffRecord {
public:
	friend class FreeList<GffPlusRecord>;

	GffPlusRecord();
	virtual ~GffPlusRecord();

	// Initialization / reset.
	virtual bool initFromFile(SingleLineDelimTextFileReader *);
	virtual void clear();

	// Output.
	virtual void print(QuickString &outBuf) const;
	virtual void print(QuickString &outBuf, int start, int end) const;
	virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const;
	virtual void printNull(QuickString &outBuf) const;

	// Identifies this record as the GFF-plus record type.
	virtual FileRecordTypeChecker::RECORD_TYPE getType() const
	{
		return FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE;
	}

	// Field access.
	virtual const QuickString &getField(int fieldNum) const;

	// Total number of fields: GffRecord's count plus the extra fields.
	virtual int getNumFields() const
	{
		return GffRecord::getNumFields() + _plusFields.size();
	}

	// Stored in _numPrintFields.
	virtual void setNumPrintFields(int num)
	{
		_numPrintFields = num;
	}
	virtual int getNumPrintFields() const
	{
		return _numPrintFields;
	}

	static bool isNumericField(int fieldNum);

protected:
	PlusFields _plusFields;  // the extra fields counted by getNumFields()
	int _numPrintFields;     // value set via setNumPrintFields()
};
#endif /* GFFPLUSRECORD_H_ */
...@@ -2,13 +2,11 @@ ...@@ -2,13 +2,11 @@
#include "SingleLineDelimTextFileReader.h" #include "SingleLineDelimTextFileReader.h"
#include <cstring> #include <cstring>
GffRecord::GffRecord() GffRecord::GffRecord() {
{
} }
GffRecord::~GffRecord() GffRecord::~GffRecord() {
{
} }
...@@ -43,7 +41,7 @@ bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) ...@@ -43,7 +41,7 @@ bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
adjustStrandVal(); adjustStrandVal();
fileReader->getField(7, _frame); fileReader->getField(7, _frame);
_numFields = fileReader->getNumFields(); _numFields = min(9, fileReader->getNumFields());
if (_numFields == 9) { if (_numFields == 9) {
fileReader->getField(8, _group); fileReader->getField(8, _group);
} }
......
...@@ -17,13 +17,14 @@ public: ...@@ -17,13 +17,14 @@ public:
friend class FreeList<GffRecord>; friend class FreeList<GffRecord>;
GffRecord(); GffRecord();
virtual ~GffRecord();
virtual bool initFromFile(SingleLineDelimTextFileReader *); virtual bool initFromFile(SingleLineDelimTextFileReader *);
virtual void clear(); virtual void clear();
virtual void print(QuickString &outBuf) const; virtual void print(QuickString &outBuf) const;
virtual void print(QuickString &outBuf, int start, int end) const; virtual void print(QuickString &outBuf, int start, int end) const;
virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const; virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const;
virtual void printNull(QuickString &outBuf) const; virtual void printNull(QuickString &outBuf) const;
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; } virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::GFF_RECORD_TYPE; }
virtual const QuickString &getSource() const { return _source; } virtual const QuickString &getSource() const { return _source; }
virtual const QuickString &getFrame() const { return _frame; } virtual const QuickString &getFrame() const { return _frame; }
virtual const QuickString &getGroup() const { return _group; } virtual const QuickString &getGroup() const { return _group; }
...@@ -31,13 +32,9 @@ public: ...@@ -31,13 +32,9 @@ public:
virtual void setNumFields(int val) { _numFields = val; } virtual void setNumFields(int val) { _numFields = val; }
virtual const QuickString &getField(int fieldNum) const; virtual const QuickString &getField(int fieldNum) const;
//Note: using the assignment operator in a GffRecord can potentially be a performance hit,
//if the number of fields frequently differ between this object and the one being copied.
const GffRecord &operator=(const GffRecord &other);
static bool isNumericField(int fieldNum); static bool isNumericField(int fieldNum);
protected: protected:
virtual ~GffRecord();
void printRemainingFields(QuickString &outbuf) const; void printRemainingFields(QuickString &outbuf) const;
int _numFields; int _numFields;
......
...@@ -17,14 +17,14 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ ...@@ -17,14 +17,14 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
# ---------------------------------- # ----------------------------------
# define our source and object files # define our source and object files
# ---------------------------------- # ----------------------------------
SOURCES = Record.h Record.cpp EmptyRecord.h EmptyRecord.cpp Bed3Interval.h Bed3Interval.cpp \ SOURCES = Record.cpp EmptyRecord.cpp Bed3Interval.cpp \
Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \ Bed4Interval.cpp BedGraphInterval.cpp Bed5Interval.cpp \
Bed6Interval.h Bed6Interval.cpp \ Bed6Interval.cpp PlusFields.cpp GffRecord.cpp GffPlusRecord.cpp \
BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \ BedPlusInterval.cpp Bed12Interval.cpp BamRecord.cpp VcfRecord.cpp \
GffRecord.h GffRecord.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp \ BlockMgr.cpp StrandQueue.cpp \
RecordMgr.cpp RecordMgr.h RecordList.h RecordList.cpp RecordKeyList.h RecordKeyList.cpp RecordKeyVector.h RecordKeyVector.cpp RecordMgr.cpp RecordList.cpp RecordKeyList.cpp RecordKeyVector.cpp
OBJECTS= Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ OBJECTS= Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o PlusFields.o BedPlusInterval.o Bed12Interval.o BamRecord.o \
VcfRecord.o GffRecord.o BlockMgr.o StrandQueue.o RecordMgr.o RecordList.o RecordKeyList.o RecordKeyVector.o GffRecord.o GffPlusRecord.o VcfRecord.o BlockMgr.o StrandQueue.o RecordMgr.o RecordList.o RecordKeyList.o RecordKeyVector.o
_EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o _EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
...@@ -40,7 +40,7 @@ $(BUILT_OBJECTS): $(SOURCES) ...@@ -40,7 +40,7 @@ $(BUILT_OBJECTS): $(SOURCES)
clean: clean:
@echo "Cleaning up." @echo "Cleaning up."
@rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/RecordList.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/RecordList.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \
$(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \ $(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o $(OBJ_DIR)/PlusFields.o $(OBJ_DIR)/GffPlusRecord.o \
$(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o \ $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o \
$(OBJ_DIR)/RecordKeyList.o $(OBJ_DIR)/RecordKeyVector.o $(OBJ_DIR)/RecordKeyList.o $(OBJ_DIR)/RecordKeyVector.o
......
#include "PlusFields.h"
#include "SingleLineDelimTextFileReader.h"
// Creates an empty set of extra fields.
// The offset is explicitly zeroed so that getField()/initFromFile() never
// read an indeterminate value if they are called before the owner has
// configured the offset.
PlusFields::PlusFields()
	: _numOffsetFields(0)
{
}
// Copies every field at or after the offset from the reader's current
// line into _fields, resizing the storage to match.
// If the line has fewer fields than the offset, there are no extra fields:
// the count is clamped to zero instead of letting the subtraction wrap
// around to a huge unsigned value (which would request an enormous resize).
// Always returns true.
bool PlusFields::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	const int numOffset = static_cast<int>(_numOffsetFields);
	const int numAvailable = static_cast<int>(fileReader->getNumFields()) - numOffset;
	const int numPlus = (numAvailable > 0) ? numAvailable : 0;

	if (_fields.size() != static_cast<size_t>(numPlus)) {
		_fields.resize(numPlus);
	}
	for (int i = 0; i < numPlus; i++) {
		fileReader->getField(i + numOffset, _fields[i]);
	}
	return true;
}
// Empties each stored string but keeps the strings themselves, so they
// need not be destroyed and re-created for the next record.
void PlusFields::clear()
{
	const int numStored = (int)_fields.size();
	int idx = 0;
	while (idx < numStored) {
		_fields[idx].clear();
		++idx;
	}
}
// Returns the stored field for the given field number.
// The slot used is fieldNum minus the offset minus one; no range check is
// performed, so the caller must pass a number that maps to a stored field.
const QuickString &PlusFields::getField(int fieldNum) const {
	return _fields[fieldNum - (_numOffsetFields + 1)];
}
// Appends every stored field to outBuf in order, each one preceded by a
// tab character.
void PlusFields::printFields(QuickString &outBuf) const
{
	size_t idx = 0;
	while (idx < size()) {
		outBuf.append('\t');
		outBuf.append(_fields[idx]);
		++idx;
	}
}
/*
* PlusFields.h
*
* Created on: Nov 13, 2012
* Author: nek3d
*/
#ifndef PLUSFIELDS_H_
#define PLUSFIELDS_H_
using namespace std;
#include "QuickString.h"
#include <vector>
class SingleLineDelimTextFileReader;
class PlusFields {