Commit cd012c6c authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Completed #181, GFF can have extra fields. Also, SVlen in VCF takes max of...

Completed #181, GFF can have extra fields. Also, SVlen in VCF takes max of more than two numbers (as opposed to min)
parent 69ca7e90
......@@ -154,7 +154,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
const char *currPtr = startPtr;
const char *endPtr = _sLine.c_str() + _sLine.size();
int minVal = INT_MAX;
int maxVal = INT_MIN;
int currVal = 0;
QuickString currValStr;
while (1) {
......@@ -162,7 +162,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
if (currPtr > startPtr) {
currValStr.assign(startPtr, currPtr - startPtr);
currVal = abs(str2chrPos(currValStr));
if (currVal < minVal) minVal = currVal;
if (currVal > maxVal) maxVal = currVal;
startPtr = currPtr;
}
......@@ -176,5 +176,5 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
}
currPtr++;
};
return minVal;
return maxVal;
}
......@@ -2,108 +2,54 @@
#include "SingleLineDelimTextFileReader.h"
BedPlusInterval::BedPlusInterval()
: _numPrintFields(0)
: _numPrintFields(0)
{
_plusFields.setNumOffsetFields(numFixedFields);
}
// Releases every QuickString that _otherIdxs points to; each element was
// created with new (see operator= and initOtherFieldsFromFile).
BedPlusInterval::~BedPlusInterval()
{
	for (vector<QuickString *>::iterator entry = _otherIdxs.begin(); entry != _otherIdxs.end(); ++entry) {
		delete *entry;
	}
}
// Copies the Bed3Interval part, the print-field count, and the extra fields
// of `other` into this object.
//
// Existing QuickString slots are reused where both objects have one; slots
// are allocated or freed only for the difference in length, so that repeated
// assignment between records of equal width does no allocation here.
const BedPlusInterval &BedPlusInterval::operator=(const BedPlusInterval &other) {
	Bed3Interval::operator=(other);
	_numPrintFields = other._numPrintFields;

	const int srcCount = other._otherIdxs.size();
	const int dstCount = _otherIdxs.size();
	const int sharedCount = min(dstCount, srcCount);

	// Overwrite the slots both sides already have.
	for (int idx = 0; idx < sharedCount; ++idx) {
		*_otherIdxs[idx] = *other._otherIdxs[idx];
	}

	// Source is longer: append copies of its remaining fields.
	// (No iterations when dstCount >= srcCount.)
	for (int idx = dstCount; idx < srcCount; ++idx) {
		_otherIdxs.push_back(new QuickString(*other._otherIdxs[idx]));
	}

	// Source is shorter: free our surplus strings, then drop their slots.
	if (dstCount > srcCount) {
		for (int idx = srcCount; idx < dstCount; ++idx) {
			delete _otherIdxs[idx];
		}
		_otherIdxs.resize(srcCount);
	}
	return *this;
}
bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
return (Bed3Interval::initFromFile(fileReader) && initOtherFieldsFromFile(fileReader));
return (Bed3Interval::initFromFile(fileReader) && _plusFields.initFromFile(fileReader));
}
// Reads every field from index startOtherIdx onward out of the reader's
// current line into _otherIdxs.
//
// Returns false when this object already holds a non-zero number of extra
// fields and the current line has a different number; returns true otherwise.
bool BedPlusInterval::initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader)
{
	const int extraCount = fileReader->getNumFields() - startOtherIdx;
	const int heldCount = (int)_otherIdxs.size();

	//file had a number of fields not matching what was expected.
	if (heldCount != extraCount && heldCount > 0) {
		return false;
	}

	// Only reachable with work to do when no slots exist yet: allocate one
	// empty string per extra field.
	while ((int)_otherIdxs.size() < extraCount) {
		_otherIdxs.push_back(new QuickString());
	}

	for (int col = 0; col < extraCount; ++col) {
		QuickString &slot = *_otherIdxs[col];
		fileReader->getField(col + startOtherIdx, slot);
	}
	return true;
}
void BedPlusInterval::clear() {
Bed3Interval::clear();
_numPrintFields = 0;
for (int i=0; i < (int)_otherIdxs.size(); i++) {
_otherIdxs[i]->clear();
}
_plusFields.clear();
}
void BedPlusInterval::print(QuickString &outBuf) const
{
Bed3Interval::print(outBuf);
printOtherFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::print(QuickString &outBuf, int start, int end) const
{
Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const
{
Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::printNull(QuickString &outBuf) const
{
Bed3Interval::printNull(outBuf);
for (int i=startOtherIdx; i < _numPrintFields; i++) {
for (int i=numFixedFields; i < _numPrintFields; i++) {
outBuf.append("\t.");
}
}
const QuickString &BedPlusInterval::getField(int fieldNum) const
{
//a request for any of the first three fields will retrieve
//chrom, start, end, in that order.
//A request for field 3+ will go to the otherIdxs.
if (fieldNum > startOtherIdx && fieldNum <= startOtherIdx + (int)_otherIdxs.size()) {
return (*(_otherIdxs[fieldNum - startOtherIdx - 1]));
if (fieldNum > numFixedFields) {
return _plusFields.getField(fieldNum);
}
return Bed3Interval::getField(fieldNum);
}
......@@ -112,20 +58,12 @@ bool BedPlusInterval::isNumericField(int fieldNum) {
//
// TBD: There is no currently no good way to guarantee / enforce whether
// fields after the 6th are numeric, so for now we'll give the user the
// fields after the 3rd are numeric, so for now we'll give the user the
// benefit of the doubt on those.
//
if (fieldNum > startOtherIdx) {
if (fieldNum > numFixedFields) {
return true;
} else {
return Bed3Interval::isNumericField(fieldNum);
}
return Bed3Interval::isNumericField(fieldNum);
}
// Appends each extra field to outBuf, every one preceded by a tab.
void BedPlusInterval::printOtherFields(QuickString &outBuf) const {
	for (vector<QuickString *>::const_iterator entry = _otherIdxs.begin(); entry != _otherIdxs.end(); ++entry) {
		outBuf.append('\t');
		outBuf.append(**entry);
	}
}
......@@ -9,7 +9,7 @@
#define BEDPLUSINTERVAL_H_
#include "Bed3Interval.h"
#include <vector>
#include "PlusFields.h"
class SingleLineDelimTextFileReader;
......@@ -18,6 +18,7 @@ public:
friend class FreeList<BedPlusInterval>;
BedPlusInterval();
virtual ~BedPlusInterval() {}
virtual bool initFromFile(SingleLineDelimTextFileReader *);
virtual void clear();
virtual void print(QuickString &outBuf) const;
......@@ -26,29 +27,19 @@ public:
virtual void printNull(QuickString &outBuf) const;
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; }
//Note: using the assignment operator in a BedPlusInterval can potentially be a performance hit,
//if the number of fields frequently differ between this object and the one being copied.
const BedPlusInterval &operator=(const BedPlusInterval &other);
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return startOtherIdx + _otherIdxs.size(); }
virtual int getNumFields() const { return numFixedFields + _plusFields.size(); }
virtual void setField(int fieldNum, const QuickString &str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setField(int fieldNum, const string &str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setField(int fieldNum, const char *str) { (*(_otherIdxs[fieldNum])) = str; }
virtual void setNumPrintFields(int num) { _numPrintFields = num; }
virtual int getNumPrintFields() const { return _numPrintFields; }
static bool isNumericField(int fieldNum);
protected:
vector<QuickString *> _otherIdxs;
static const int startOtherIdx = 3; //first six fields have names, and are not stored in otherIdxs.
static const int numFixedFields = 3; //first three fields have names, and are not stored in otherIdxs.
PlusFields _plusFields;
int _numPrintFields;
virtual ~BedPlusInterval();
bool initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader);
virtual void printOtherFields(QuickString &outBuf) const;
};
......
#include "GffPlusRecord.h"
#include "SingleLineDelimTextFileReader.h"
// Default constructor: starts with the print-field count at zero.
// _plusFields is default-constructed.
GffPlusRecord::GffPlusRecord()
: _numPrintFields(0)
{
}
// Destructor: intentionally empty. This class declares only a PlusFields
// member and an int, neither of which needs explicit cleanup here.
GffPlusRecord::~GffPlusRecord() {
}
// Populates this record from the reader's current line.
//
// The GffRecord part is parsed first; if that fails, false is returned and
// the extra fields are left untouched. Otherwise the extra fields are read
// starting after the number of fields GffRecord reports, and the result of
// that read is returned.
bool GffPlusRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	const bool baseLoaded = GffRecord::initFromFile(fileReader);
	if (baseLoaded) {
		// The offset is taken after the base parse so it reflects what
		// GffRecord::getNumFields() reports for this line.
		_plusFields.setNumOffsetFields(GffRecord::getNumFields());
		return _plusFields.initFromFile(fileReader);
	}
	return false;
}
// Resets the record: clears the GffRecord part, then the extra fields.
// _numPrintFields is not modified here.
void GffPlusRecord::clear() {
GffRecord::clear();
_plusFields.clear();
}
// Appends this record to outBuf: the GffRecord output first, followed by
// the extra fields written by PlusFields::printFields.
void GffPlusRecord::print(QuickString &outBuf) const
{
GffRecord::print(outBuf);
_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but forwards the integer start/end arguments to
// the matching GffRecord::print overload before appending the extra fields.
void GffPlusRecord::print(QuickString &outBuf, int start, int end) const
{
GffRecord::print(outBuf, start, end);
_plusFields.printFields(outBuf);
}
// Same as print(outBuf), but forwards the string start/end arguments to
// the matching GffRecord::print overload before appending the extra fields.
void GffPlusRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const
{
GffRecord::print(outBuf, start, end);
_plusFields.printFields(outBuf);
}
// Appends a null (placeholder) record to outBuf: the GffRecord null output,
// then one "\t." for each column from _numFields up to, but not including,
// _numPrintFields.
void GffPlusRecord::printNull(QuickString &outBuf) const
{
	GffRecord::printNull(outBuf);

	int column = _numFields;
	while (column < _numPrintFields) {
		outBuf.append("\t.");
		++column;
	}
}
// Returns the field with the given number. Numbers up to and including
// _numFields are answered by GffRecord; larger numbers are answered by the
// extra-field storage.
const QuickString &GffPlusRecord::getField(int fieldNum) const
{
	return (fieldNum <= _numFields)
		? GffRecord::getField(fieldNum)
		: _plusFields.getField(fieldNum);
}
// Reports whether a field may be treated as numeric. Field numbers below 9
// are decided by GffRecord::isNumericField; every field numbered 9 or above
// is reported as numeric without further checking.
bool GffPlusRecord::isNumericField(int fieldNum) {
	return fieldNum >= 9 || GffRecord::isNumericField(fieldNum);
}
/*
* GffPlusRecord.h
*
* Created on: Nov 13, 2012
* Author: nek3d
*/
#ifndef GFFPLUSRECORD_H_
#define GFFPLUSRECORD_H_
#include "GffRecord.h"
#include "PlusFields.h"
class SingleLineDelimTextFileReader;
// A GffRecord that additionally stores any fields found after the ones
// GffRecord itself accounts for. The extra fields live in _plusFields.
class GffPlusRecord : public GffRecord {
public:
// FreeList<GffPlusRecord> is granted access to non-public members.
friend class FreeList<GffPlusRecord>;
GffPlusRecord();
virtual ~GffPlusRecord();
// Parses the GffRecord part and then the extra fields from the reader's
// current line; returns false if the GffRecord part fails to parse.
virtual bool initFromFile(SingleLineDelimTextFileReader *);
// Clears the GffRecord part and the extra fields.
virtual void clear();
// Each print overload appends the GffRecord output followed by the extra fields.
virtual void print(QuickString &outBuf) const;
virtual void print(QuickString &outBuf, int start, int end) const;
virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const;
// Appends the GffRecord null output plus "\t." placeholders up to _numPrintFields.
virtual void printNull(QuickString &outBuf) const;
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE; }
// Field numbers above _numFields are served from _plusFields; the rest by GffRecord.
virtual const QuickString &getField(int fieldNum) const;
// Total field count: GffRecord's count plus the number of extra fields.
virtual int getNumFields() const { return GffRecord::getNumFields() + _plusFields.size(); }
virtual void setNumPrintFields(int num) { _numPrintFields = num; }
virtual int getNumPrintFields() const { return _numPrintFields; }
// Fields numbered below 9 defer to GffRecord::isNumericField; 9 and above return true.
static bool isNumericField(int fieldNum);
protected:
PlusFields _plusFields; // storage for the fields beyond the GffRecord ones
int _numPrintFields; // upper bound used by printNull when padding with "\t."
};
#endif /* GFFPLUSRECORD_H_ */
......@@ -2,13 +2,11 @@
#include "SingleLineDelimTextFileReader.h"
#include <cstring>
GffRecord::GffRecord()
{
GffRecord::GffRecord() {
}
GffRecord::~GffRecord()
{
GffRecord::~GffRecord() {
}
......@@ -43,7 +41,7 @@ bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
adjustStrandVal();
fileReader->getField(7, _frame);
_numFields = fileReader->getNumFields();
_numFields = min(9, fileReader->getNumFields());
if (_numFields == 9) {
fileReader->getField(8, _group);
}
......
......@@ -17,13 +17,14 @@ public:
friend class FreeList<GffRecord>;
GffRecord();
virtual ~GffRecord();
virtual bool initFromFile(SingleLineDelimTextFileReader *);
virtual void clear();
virtual void print(QuickString &outBuf) const;
virtual void print(QuickString &outBuf, int start, int end) const;
virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const;
virtual void printNull(QuickString &outBuf) const;
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BED_PLUS_RECORD_TYPE; }
virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::GFF_RECORD_TYPE; }
virtual const QuickString &getSource() const { return _source; }
virtual const QuickString &getFrame() const { return _frame; }
virtual const QuickString &getGroup() const { return _group; }
......@@ -31,13 +32,9 @@ public:
virtual void setNumFields(int val) { _numFields = val; }
virtual const QuickString &getField(int fieldNum) const;
//Note: using the assignment operator in a GffRecord can potentially be a performance hit,
//if the number of fields frequently differ between this object and the one being copied.
const GffRecord &operator=(const GffRecord &other);
static bool isNumericField(int fieldNum);
protected:
virtual ~GffRecord();
void printRemainingFields(QuickString &outbuf) const;
int _numFields;
......
......@@ -17,14 +17,14 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES = Record.h Record.cpp EmptyRecord.h EmptyRecord.cpp Bed3Interval.h Bed3Interval.cpp \
Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \
Bed6Interval.h Bed6Interval.cpp \
BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \
GffRecord.h GffRecord.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp \
RecordMgr.cpp RecordMgr.h RecordList.h RecordList.cpp RecordKeyList.h RecordKeyList.cpp RecordKeyVector.h RecordKeyVector.cpp
OBJECTS= Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \
VcfRecord.o GffRecord.o BlockMgr.o StrandQueue.o RecordMgr.o RecordList.o RecordKeyList.o RecordKeyVector.o
SOURCES = Record.cpp EmptyRecord.cpp Bed3Interval.cpp \
Bed4Interval.cpp BedGraphInterval.cpp Bed5Interval.cpp \
Bed6Interval.cpp PlusFields.cpp GffRecord.cpp GffPlusRecord.cpp \
BedPlusInterval.cpp Bed12Interval.cpp BamRecord.cpp VcfRecord.cpp \
BlockMgr.cpp StrandQueue.cpp \
RecordMgr.cpp RecordList.cpp RecordKeyList.cpp RecordKeyVector.cpp
OBJECTS= Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o PlusFields.o BedPlusInterval.o Bed12Interval.o BamRecord.o \
GffRecord.o GffPlusRecord.o VcfRecord.o BlockMgr.o StrandQueue.o RecordMgr.o RecordList.o RecordKeyList.o RecordKeyVector.o
_EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
......@@ -40,7 +40,7 @@ $(BUILT_OBJECTS): $(SOURCES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/RecordList.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \
$(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \
$(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o $(OBJ_DIR)/PlusFields.o $(OBJ_DIR)/GffPlusRecord.o \
$(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o \
$(OBJ_DIR)/RecordKeyList.o $(OBJ_DIR)/RecordKeyVector.o
......
#include "PlusFields.h"
#include "SingleLineDelimTextFileReader.h"
// Default constructor: starts with no stored fields and an offset of zero.
//
// _numOffsetFields is initialized explicitly so that initFromFile() and
// getField() never read an indeterminate value if they are reached before
// setNumOffsetFields() has been called.
PlusFields::PlusFields()
: _numOffsetFields(0)
{
}
// Loads every field at index _numOffsetFields and beyond from the reader's
// current line into _fields, resizing _fields to match.
//
// If the line has no more fields than the offset, _fields ends up empty.
// Always returns true.
bool PlusFields::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	// Do the subtraction in signed arithmetic and clamp at zero. Storing a
	// negative difference straight into a size_t would wrap around to an
	// enormous count and be handed to resize().
	const int numExtra = fileReader->getNumFields() - _numOffsetFields;
	const size_t numFields = (numExtra > 0) ? static_cast<size_t>(numExtra) : 0;

	if (size() != numFields) {
		_fields.resize(numFields);
	}
	for (size_t i=0; i < numFields; i++) {
		fileReader->getField(i + _numOffsetFields, _fields[i]);
	}
	return true;
}
void PlusFields::clear() {
//don't destroy the strings if we don't have to. Just clear their memory.
for (int i=0; i < (int)_fields.size(); i++) {
_fields[i].clear();
}
}
// Returns the extra field for the given record-wide field number.
// Field number (_numOffsetFields + 1) maps to the first stored field.
//
// Throws std::out_of_range if fieldNum does not correspond to a stored
// field. vector::at() is used instead of operator[] so that a bad field
// number (including one at or below the offset, whose negative index wraps
// to a very large unsigned value) is reported rather than reading outside
// the vector.
const QuickString &PlusFields::getField(int fieldNum) const
{
	return _fields.at(fieldNum - _numOffsetFields - 1);
}
// Appends every stored field to outBuf, each one preceded by a tab.
void PlusFields::printFields(QuickString &outBuf) const {
	size_t pos = 0;
	while (pos < size()) {
		outBuf.append('\t');
		outBuf.append(_fields[pos]);
		++pos;
	}
}
/*
* PlusFields.h
*
* Created on: Nov 13, 2012
* Author: nek3d
*/
#ifndef PLUSFIELDS_H_
#define PLUSFIELDS_H_
using namespace std;
#include "QuickString.h"
#include <vector>
class SingleLineDelimTextFileReader;
// Stores the fields of a delimited-text record that come after a
// caller-specified number of leading ("offset") fields, and knows how to
// load, clear, print and look them up.
class PlusFields {
public:
PlusFields();
virtual ~PlusFields() {}
// Sets how many leading fields of each line are skipped by this object.
void setNumOffsetFields(int numOffsetFields) { _numOffsetFields = numOffsetFields; }
// Reads the fields at index _numOffsetFields and beyond from the reader's current line.
virtual bool initFromFile(SingleLineDelimTextFileReader *);
// Empties each stored string without removing it from the vector.
virtual void clear();
// Appends each stored field to outBuf, preceded by a tab.
virtual void printFields(QuickString &outBuf) const;
// fieldNum is a record-wide number: (_numOffsetFields + 1) is the first stored field.
virtual const QuickString &getField(int fieldNum) const;
// Number of extra fields currently stored.
virtual size_t size() const { return _fields.size(); }
protected:
vector<QuickString> _fields;
int _numOffsetFields; //3 for BedPlus, 6 for VCF; GffPlus uses GffRecord's field count (8 or 9)
};
#endif /* PLUSFIELDS_H_ */
......@@ -69,6 +69,11 @@ RecordMgr::RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize)
_freeList = new FreeList<GffRecord>(_freeListBlockSize);
break;
}
case FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE:
{
_freeList = new FreeList<GffPlusRecord>(_freeListBlockSize);
break;
}
default:
......@@ -133,12 +138,16 @@ RecordMgr::~RecordMgr()
delete (FreeList<VcfRecord> *)_freeList;
break;
}
case FileRecordTypeChecker::GFF_RECORD_TYPE:
{
delete (FreeList<GffRecord> *)_freeList;
break;
}
case FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE:
{
delete (FreeList<GffPlusRecord> *)_freeList;
break;
}
default:
......@@ -222,6 +231,12 @@ Record *RecordMgr::allocateRecord()
record = gfr;
break;
}
case FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE:
{
GffPlusRecord *gfpr = ((FreeList<GffPlusRecord> *)_freeList)->newObj();
record = gfpr;
break;
}
default:
......@@ -294,6 +309,11 @@ void RecordMgr::deleteRecord(const Record *record)
((FreeList<GffRecord> *)_freeList)->deleteObj(static_cast<const GffRecord *>(record));
break;
}
case FileRecordTypeChecker::GFF_PLUS_RECORD_TYPE:
{
((FreeList<GffPlusRecord> *)_freeList)->deleteObj(static_cast<const GffPlusRecord *>(record));
break;
}
default:
break;
......
......@@ -21,6 +21,7 @@
#include "BamRecord.h"
#include "VcfRecord.h"
#include "GffRecord.h"
#include "GffPlusRecord.h"
#include "FileRecordTypeChecker.h"
......
......@@ -37,7 +37,8 @@ bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
fileReader->getField(2, _name);
fileReader->getField(5, _score);
return initOtherFieldsFromFile(fileReader);
_plusFields.setNumOffsetFields(6);
return _plusFields.initFromFile(fileReader);
}
void VcfRecord::clear()
......@@ -71,7 +72,7 @@ void VcfRecord::print(QuickString &outBuf, const QuickString & start, const Quic
void VcfRecord::printNull(QuickString &outBuf) const {
outBuf.append(".\t-1\t.");
for (int i= startOtherIdx; i < _numPrintFields; i++) {
for (int i= 2; i < _numPrintFields; i++) {
outBuf.append("\t.");
}
}
......@@ -85,18 +86,13 @@ void VcfRecord::printOtherFields(QuickString &outBuf) const {
outBuf.append(_varAlt);
outBuf.append('\t');
outBuf.append(_score);
for (int i= constPrintStartIdx; i < (int)_otherIdxs.size(); i++) {
outBuf.append('\t');
outBuf.append(*(_otherIdxs[i]));
}
_plusFields.printFields(outBuf)