Commit e1993b57 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #247 from nkindlon/master

Changed vector copies to references in sortBed.cpp, added unit tests
parents b3504cee 16d4db0d
......@@ -96,7 +96,7 @@ int main(int argc, char *argv[])
if (btDriver.supports(subCmd)) {
if (btDriver.subMain(argc, argv)) {
return 0;
} else {
} else if (!btDriver.hadError()) {
showHelp(subCmd);
return 1;
}
......
......@@ -47,6 +47,10 @@ void CoverageFile::processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits
doPerBase(outputMgr, hits);
break;
case ContextCoverage::MEAN:
doMean(outputMgr, hits);
break;
case ContextCoverage::HIST:
doHist(outputMgr, hits);
break;
......@@ -55,6 +59,7 @@ void CoverageFile::processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits
default:
doDefault(outputMgr, hits);
break;
}
}
......@@ -139,7 +144,7 @@ void CoverageFile::doCounts(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
void CoverageFile::doPerBase(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
//loop through all bases in query, printing full record and metrcis for each
//loop through all bases in query, printing full record and metrics for each
const Record * queryRec = hits.getKey();
for (size_t i= 0; i < _queryLen; i++) {
_finalOutput = i +1;
......@@ -150,6 +155,17 @@ void CoverageFile::doPerBase(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
}
}
void CoverageFile::doMean(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
size_t sum =0;
for (size_t i= 0; i < _queryLen; i++) {
sum += _depthArray[i];
}
format((float)sum / (float)_queryLen);
outputMgr->printRecord(hits.getKey(), _finalOutput);
}
void CoverageFile::doHist(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
//make a map of depths to num bases with that depth
......@@ -200,4 +216,3 @@ void CoverageFile::format(float val)
sprintf(_floatValBuf, "%0.7f", val);
_finalOutput.append(_floatValBuf);
}
......@@ -43,6 +43,7 @@ protected:
void doCounts(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doPerBase(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doMean(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doHist(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doDefault(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
......
......@@ -34,6 +34,8 @@ void coverage_help(void) {
cerr << "\t-counts\t" << "Only report the count of overlaps, don't compute fraction, etc." << endl << endl;
cerr << "\t-mean\t" << "Report the mean depth of all positions in each A feature." << endl << endl;
IntersectCommonHelp();
sortedHelp();
......
......@@ -42,7 +42,7 @@ void groupby_help(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns,
const vector<int> &opColumns, const vector<string> &ops,
const bool printOriginalLine, const bool printHeaderLine,
const bool InputHaveHeaderLine, const bool ignoreCase, int precision);
const bool InputHaveHeaderLine, const bool ignoreCase, int precision, const string &delim);
void PrintHeaderLine(const vector<string> &InputFields,
const vector<int> &groupColumns,
......@@ -54,7 +54,7 @@ void PrintHeaderLine(const vector<string> &InputFields,
void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops,
int precision);
int precision, const string &delim);
void addValue (const vector<string> &fromList,
vector<string> &toList,
......@@ -83,8 +83,8 @@ int groupby_main(int argc, char* argv[]) {
bool InputHaveHeaderLine = false;
bool ignoreCase = false;
int precision = 21;
// check to see if we should print out some help
string delim(",");
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
......@@ -156,6 +156,20 @@ int groupby_main(int argc, char* argv[]) {
i++;
}
}
else if(PARAMETER_CHECK("-delim", 6, parameterLength))
{
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl
<< "*****ERROR: -delim parameter requires a value."
<< endl << endl;
groupby_help();
break;
}
else {
delim = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-full", 5, parameterLength)) {
printOriginalLine = true;
}
......@@ -186,6 +200,7 @@ int groupby_main(int argc, char* argv[]) {
i++;
}
}
else {
cerr << endl
<< "*****ERROR: Unrecognized parameter: "
......@@ -216,9 +231,9 @@ int groupby_main(int argc, char* argv[]) {
(ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") &&
(ops[i] != "count_distinct") && (ops[i] != "collapse") &&
(ops[i] != "distinct") && (ops[i] != "concat") &&
(ops[i] != "distinct") && (ops[i] != "distinct_sort_num") && (ops[i] != "distinct_sort_num_desc")&& (ops[i] != "concat") &&
(ops[i] != "freqdesc") && (ops[i] != "freqasc") &&
(ops[i] != "first") && (ops[i] != "last") )
(ops[i] != "first") && (ops[i] != "last") && (ops[i] != "delim"))
{
cerr << endl
<< "*****"
......@@ -283,7 +298,7 @@ int groupby_main(int argc, char* argv[]) {
}
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops,
printOriginalLine, printHeaderLine, InputHaveHeaderLine,
ignoreCase, precision);
ignoreCase, precision, delim);
}
else {
groupby_help();
......@@ -318,6 +333,8 @@ void groupby_help(void) {
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t\t distinct_sort_num (as distinct, but sorted numerically, ascending), " << endl;
cerr << "\t\t\t distinct_sort_num_desc (as distinct, but sorted numerically, descending), " << endl;
cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
......@@ -342,6 +359,9 @@ void groupby_help(void) {
cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
cerr << "\t\t- Example: -delim \"|\"" << endl;
cerr << "\t\t- Default: \",\"." << endl << endl;
cerr << "Examples: " << endl;
cerr << "\t$ cat ex1.out" << endl;
......@@ -377,7 +397,7 @@ void GroupBy (const string &inFile,
const bool printHeaderLine,
const bool InputHaveHeaderLine,
const bool ignoreCase,
int precision) {
int precision, const string &delim) {
// current line number
int lineNum = 0;
......@@ -440,7 +460,7 @@ void GroupBy (const string &inFile,
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup,
values, ops, precision);
values, ops, precision, delim);
// reset and add the first value for the next group.
values.clear();
for( size_t i = 0; i < opColumns.size(); i++ ) {
......@@ -465,7 +485,7 @@ void GroupBy (const string &inFile,
}
// report the last group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup,
values, ops, precision);
values, ops, precision, delim);
_tab->Close();
}
......@@ -473,7 +493,7 @@ void GroupBy (const string &inFile,
void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops,
int precision)
int precision, const string &delim)
{
vector<string> result;
......@@ -486,6 +506,7 @@ void ReportSummary(const vector<string> &group,
string op = ops[i];
std::stringstream buffer;
VectorOps vo(data[i]);
vo.setDelim(delim);
if (op == "sum") {
buffer << setprecision (precision) << vo.GetSum();
......@@ -497,6 +518,12 @@ void ReportSummary(const vector<string> &group,
else if (op == "distinct") {
result.push_back(vo.GetDistinct());
}
else if (op == "distinct_sort_num") {
result.push_back(vo.GetDistinctSortNum());
}
else if (op == "distinct_sort_num_desc") {
result.push_back(vo.GetDistinctSortNum(false));
}
else if (op == "concat") {
result.push_back(vo.GetConcat());
}
......
......@@ -40,7 +40,7 @@ void BedSort::SortBed() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
const vector<BED> &bedList = m->second;
for (unsigned int i = 0; i < bedList.size(); ++i) {
_bed->reportBedNewLine(bedList[i]);
......@@ -105,7 +105,7 @@ void BedSort::SortBedOnFaidx()
if( m == _bed->bedMapNoBin.end() ) continue; //this chromosome is not present in BED
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
const vector<BED> &bedList = m->second;
for (unsigned int i = 0; i < bedList.size(); ++i) {
_bed->reportBedNewLine(bedList[i]);
......@@ -124,11 +124,11 @@ void BedSort::SortBedBySizeAsc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
const vector<BED> &bedList = m->second;
// add the entries from this chromosome to the current list
for (unsigned int i = 0; i < m->second.size(); ++i) {
masterList.push_back(m->second[i]);
for (unsigned int i = 0; i < bedList.size(); ++i) {
masterList.push_back(bedList[i]);
}
}
......@@ -151,11 +151,11 @@ void BedSort::SortBedBySizeDesc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
const vector<BED> &bedList = m->second;
// add the entries from this chromosome to the current list
for (unsigned int i = 0; i < m->second.size(); ++i) {
masterList.push_back(m->second[i]);
for (unsigned int i = 0; i < bedList.size(); ++i) {
masterList.push_back(bedList[i]);
}
}
......@@ -174,7 +174,7 @@ void BedSort::SortBedByChromThenSizeAsc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
vector<BED> &bedList = m->second;
sort(bedList.begin(), bedList.end(), sortBySizeAsc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
......@@ -190,7 +190,7 @@ void BedSort::SortBedByChromThenSizeDesc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
vector<BED> &bedList = m->second;
sort(bedList.begin(), bedList.end(), sortBySizeDesc);
......@@ -208,7 +208,7 @@ void BedSort::SortBedByChromThenScoreAsc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
vector<BED> &bedList = m->second;
sort(bedList.begin(), bedList.end(), sortByScoreAsc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
......@@ -230,7 +230,7 @@ void BedSort::SortBedByChromThenScoreDesc() {
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
// bedList is already sorted by start position.
vector<BED> bedList = m->second;
vector<BED> &bedList = m->second;
sort(bedList.begin(), bedList.end(), sortByScoreDesc);
for (unsigned int i = 0; i < bedList.size(); ++i) {
......
......@@ -70,6 +70,7 @@ public:
ContextFileType getOutputFileType() const { return _outputFileType; }
virtual bool testCmdArgs(int argc, char **argv);
virtual bool errorEncountered() const { return !_errorMsg.empty(); }
//isValidState checks that parameters to context are in an acceptable state.
// If not, the error msg string will be set with a reason why it failed.
......
......@@ -11,9 +11,12 @@ ContextCoverage::ContextCoverage()
: _count(false),
_perBase(false),
_showHist(false),
_coverageType(DEFAULT)
_coverageType(DEFAULT),
_usingColOps(false)
{
setExplicitBedOutput(true); //do not allow BAM output
setColumnOpsMethods(true);
}
......@@ -36,6 +39,10 @@ bool ContextCoverage::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
if (strcmp(_argv[_i], "-hist") == 0) {
if (!handle_hist()) return false;
}
if (strcmp(_argv[_i], "-mean") == 0) {
if (!handle_mean()) return false;
}
}
return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
}
......@@ -46,8 +53,8 @@ bool ContextCoverage::isValidState()
return false;
}
//Can only use one output option. Were two or more set?
if (((int)_count + (int)_perBase + (int)_showHist) > 1) {
_errorMsg = "\n***** ERROR: -counts, -d, and -hist are mutually exclusive options. *****";
if (((int)_count + (int)_perBase + (int)_showHist) + (int)_mean > 1) {
_errorMsg = "\n***** ERROR: -counts, -d, -mean, and -hist are all mutually exclusive options. *****";
return false;
}
......@@ -76,3 +83,11 @@ bool ContextCoverage::handle_hist()
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextCoverage::handle_mean()
{
_mean = true;
_coverageType = MEAN;
markUsed(_i - _skipFirstArgs);
return true;
}
......@@ -19,18 +19,22 @@ public:
virtual bool hasIntersectMethods() const { return true; }
virtual bool isValidState();
typedef enum { DEFAULT, COUNT, PER_BASE, HIST } coverageType;
typedef enum { DEFAULT, COUNT, PER_BASE, HIST, MEAN } coverageType;
coverageType getCoverageType() const { return _coverageType; }
virtual bool usingColOps() const { return _usingColOps; }
private:
bool _count;
bool _perBase;
bool _showHist;
bool _mean;
coverageType _coverageType;
bool _usingColOps;
bool handle_c();
bool handle_d();
bool handle_hist();
bool handle_mean();
};
......
......@@ -154,7 +154,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
const char *currPtr = startPtr;
const char *endPtr = _sLine.c_str() + _sLine.size();
int minVal = INT_MAX;
int maxVal = INT_MIN;
int currVal = 0;
QuickString currValStr;
while (1) {
......@@ -162,7 +162,7 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
if (currPtr > startPtr) {
currValStr.assign(startPtr, currPtr - startPtr);
currVal = abs(str2chrPos(currValStr));
if (currVal < minVal) minVal = currVal;
if (currVal > maxVal) maxVal = currVal;
startPtr = currPtr;
}
......@@ -176,5 +176,5 @@ int SingleLineDelimTextFileReader::getVcfSVlen() {
}
currPtr++;
};
return minVal;
return maxVal;
}
......@@ -2,108 +2,74 @@
#include "SingleLineDelimTextFileReader.h"
BedPlusInterval::BedPlusInterval()
: _numPrintFields(0)
: _numFixedFields(defaultNumFixedFields),
_numPrintFields(0)
{
_plusFields.setNumOffsetFields(defaultNumFixedFields);
}
BedPlusInterval::~BedPlusInterval()
{
for (int i=0; i < (int)_otherIdxs.size(); i++) {
delete _otherIdxs[i];
}
void BedPlusInterval::setNumFixedFields(int numFields) {
_numFixedFields = numFields;
_plusFields.setNumOffsetFields(numFields);
}
const BedPlusInterval &BedPlusInterval::operator=(const BedPlusInterval &other) {
Bed3Interval::operator=(other);
int otherSize = other._otherIdxs.size();
int mySize = _otherIdxs.size();
_numPrintFields = other._numPrintFields;
int numMatchingFields = min(mySize, otherSize);
for (int i=0; i < numMatchingFields; i++) {
(*(_otherIdxs[i])) = (*(other._otherIdxs[i]));
}
if (mySize < otherSize) {
for (int i = mySize; i < otherSize; i++) {
QuickString *pqs = new QuickString(*(other._otherIdxs[i]));
_otherIdxs.push_back(pqs);
}
} else if (mySize > otherSize) {
for (int i= otherSize; i < mySize; i++) {
delete _otherIdxs[i];
}
_otherIdxs.resize(otherSize);
}
return *this;
}
bool BedPlusInterval::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
return (Bed3Interval::initFromFile(fileReader) && initOtherFieldsFromFile(fileReader));
}
bool baseRetFlag = Bed3Interval::initFromFile(fileReader);
bool BedPlusInterval::initOtherFieldsFromFile(SingleLineDelimTextFileReader *fileReader)
{
int numFields = fileReader->getNumFields() - startOtherIdx;
if ((int)_otherIdxs.size() != numFields) {
if ((int)_otherIdxs.size() > 0) {
return false; //file had a number of fields not matching what was expected.
}
for (int i=0; i < numFields; i++) {
_otherIdxs.push_back(new QuickString());
}
if (_numFixedFields != defaultNumFixedFields) {
fileReader->getField(3, _name);
fileReader->getField(4, _score);
fileReader->getField(5, _strand);
adjustStrandVal();
}
_plusFields.initFromFile(fileReader);
return baseRetFlag;
for (int i=0; i < numFields; i++) {
fileReader->getField(i + startOtherIdx, (*(_otherIdxs[i])));
}
return true;
}
void BedPlusInterval::clear() {
Bed3Interval::clear();
_numPrintFields = 0;
for (int i=0; i < (int)_otherIdxs.size(); i++) {
_otherIdxs[i]->clear();
}
_plusFields.clear();
}
void BedPlusInterval::print(QuickString &outBuf) const
{
Bed3Interval::print(outBuf);
printOtherFields(outBuf);
printBed6PlusFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::print(QuickString &outBuf, int start, int end) const
{
Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf);
printBed6PlusFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::print(QuickString &outBuf, const QuickString & start, const QuickString & end) const
{
Bed3Interval::print(outBuf, start, end);
printOtherFields(outBuf);
printBed6PlusFields(outBuf);
_plusFields.printFields(outBuf);
}
void BedPlusInterval::printNull(QuickString &outBuf) const
{
Bed3Interval::printNull(outBuf);
for (int i=startOtherIdx; i < _numPrintFields; i++) {
printBed6PlusNullFields(outBuf);
for (int i=_numFixedFields; i < _numPrintFields; i++) {
outBuf.append("\t.");
}
}
const QuickString &BedPlusInterval::getField(int fieldNum) const
{
//a request for any of the first three fields will retrieve
//chrom, start, end, in that order.
//A request for field 3+ will go to the otherIdxs.
if (fieldNum > startOtherIdx && fieldNum <= startOtherIdx + (int)_otherIdxs.size()) {
return (*(_otherIdxs[fieldNum - startOtherIdx - 1]));
if (fieldNum > _numFixedFields) {
return _plusFields.getField(fieldNum);
}
return Bed3Interval::getField(fieldNum);
}
......@@ -112,20 +78,30 @@ bool BedPlusInterval::isNumericField(int fieldNum) {
//
// TBD: There is no currently no good way to guarantee / enforce whether
// fields after the 6th are numeric, so for now we'll give the user the
// fields after the 3rd are numeric, so for now we'll give the user the
// benefit of the doubt on those.
//
if (fieldNum > startOtherIdx) {
if (fieldNum > defaultNumFixedFields) {
return true;
} else {
return Bed3Interval::isNumericField(fieldNum);
}
return Bed3Interval::isNumericField(fieldNum);
}
void BedPlusInterval::printOtherFields(QuickString &outBuf) const {
for (int i=0; i < (int)_otherIdxs.size(); i++) {
void BedPlusInterval::printBed6PlusFields(QuickString &outBuf) const {
if (_numFixedFields != defaultNumFixedFields) {
outBuf.append('\t');
outBuf.append(_name);
outBuf.append('\t');
outBuf.append(_score);
outBuf.append('\t');
outBuf.append(*(_otherIdxs[i]));
outBuf.append(_strand);
}
}
void BedPlusInterval::printBed6PlusNullFields(QuickString &outBuf) const {
if (_numFixedFields != defaultNumFixedFields) {
outBuf.append("\t.\t.\t.");
}
}
......@@ -9,7 +9,7 @@
#define BEDPLUSINTERVAL_H_
#include "Bed3Interval.h"
#include <vector>
#include "PlusFields.h"
class SingleLineDelimTextFileReader;
......@@ -18,6 +18,8 @@ public:
friend class FreeList<BedPlusInterval>;