Commit 2aad3460 authored by Aaron's avatar Aaron
Browse files

ENH: groupBy now uses VectorOps

parent 93d2520c
......@@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/VectorOps/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
......@@ -17,7 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \
# ----------------------------------
SOURCES= groupBy.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=tabFile.o gzstream.o fileType.o
_EXT_OBJECTS=tabFile.o gzstream.o fileType.o VectorOps.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
......
......@@ -27,6 +27,7 @@ Licenced under the MIT license.
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
#include "VectorOps.h"
using namespace std;
......@@ -39,33 +40,12 @@ const int PRECISION = 21;
#define PARAMETER_CHECK(param, paramLen, actualLen) ((strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen))
#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-')
// Comparator for (count, value) pairs: true when the left count is strictly
// larger than the right one. The string half of the pair is never consulted,
// so pairs with equal counts compare as equivalent.
struct ValueGreaterThan
{
    bool operator()(const pair<int, string> &left,
                    const pair<int, string> &right) const
    {
        return right.first < left.first;
    }
};
// Comparator for (count, value) pairs: true when the left count is strictly
// smaller than the right one. The string half of the pair is never consulted,
// so pairs with equal counts compare as equivalent.
struct ValueLessThan
{
    bool operator()(const pair<int, string> &left,
                    const pair<int, string> &right) const
    {
        return right.first > left.first;
    }
};
// function declarations
void groupby_help(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool printOriginalLine, const bool printHeaderLine, const bool InputHaveHeaderLine, const bool ignoreCase);
void PrintHeaderLine(const vector<string> &InputFields, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine);
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops);
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum, const bool ignoreCase);
float ToFloat (string element);
double ToDouble(const string &element);
void TabPrintPost (string element);
void TabPrintPre (string element);
void CommaPrint (string element);
......@@ -407,170 +387,59 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d
string op = ops[i];
std::stringstream buffer;
vector<double> dataF;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
(op == "median") || (op == "stdev") || (op == "sstdev"))
{
transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble);
}
VectorOps vo(data[i]);
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
buffer << setprecision (PRECISION) << total;
buffer << setprecision (PRECISION) << vo.GetSum();
result.push_back(buffer.str());
}
else if (op == "collapse") {
string collapse;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
if (j>0)
collapse.append(",");
collapse.append(data[i][j]);
}
result.push_back(collapse);
result.push_back(vo.GetCollapse());
}
else if (op == "distinct") {
string distinct;
// get the current column's data
vector<string> col_data = data[i];
// remove duplicate entries from the vector
// http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
sort( col_data.begin(), col_data.end() );
col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() );
for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter
if (j>0)
distinct.append(",");
distinct.append(col_data[j]);
}
result.push_back(distinct);
result.push_back(vo.GetDistinct());
}
else if (op == "concat") {
string concat;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
concat.append(data[i][j]);
}
result.push_back(concat);
result.push_back(vo.GetConcat());
}
else if (op == "min") {
buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() );
buffer << setprecision (PRECISION) << vo.GetMin();
result.push_back(buffer.str());
}
else if (op == "max") {
buffer << setprecision (PRECISION) << *max_element( dataF.begin(), dataF.end() );
buffer << setprecision (PRECISION) << vo.GetMax();
result.push_back(buffer.str());
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
buffer << setprecision (PRECISION) << mean;
buffer << setprecision (PRECISION) << vo.GetMean();
result.push_back(buffer.str());
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
buffer << setprecision (PRECISION) << median;
buffer << setprecision (PRECISION) << vo.GetMedian();
result.push_back(buffer.str());
}
else if (op == "count") {
buffer << setprecision (PRECISION) << data[i].size();
result.push_back(buffer.str());
}
else if ((op == "mode") || (op == "antimode") ||
(op == "freqdesc") || (op == "freqasc")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data[i].begin();
vector<string>::const_iterator dEnd = data[i].end();
for (; dIt != dEnd; ++dIt) {
freqs[*dIt]++;
}
// grab the mode and the anti mode
string mode, antiMode;
int count = 0;
int minCount = INT_MAX;
for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
if (iter->second > count) {
mode = iter->first;
count = iter->second;
}
if (iter->second < minCount) {
antiMode = iter->first;
minCount = iter->second;
}
}
// report
if (op == "mode") {
buffer << setprecision (PRECISION) << mode;
result.push_back(buffer.str());
}
else if (op == "antimode") {
buffer << setprecision (PRECISION) << antiMode;
result.push_back(buffer.str());
}
else if (op == "freqdesc" || op == "freqasc") {
// pair for the num times a values was
// observed (1) and the value itself (2)
pair<int, string> freqPair;
vector< pair<int, string> > freqList;
// create a list of pairs of all the observed values (second)
// and their occurences (first)
map<string,int>::const_iterator mapIter = freqs.begin();
map<string,int>::const_iterator mapEnd = freqs.end();
for(; mapIter != mapEnd; ++mapIter)
freqList.push_back( make_pair(mapIter->second, mapIter->first) );
// sort the list of pairs in the requested order by the frequency
// this will make the value that was observed least/most bubble to the top
if (op == "freqdesc")
sort(freqList.begin(), freqList.end(), ValueGreaterThan());
else if (op == "freqasc")
sort(freqList.begin(), freqList.end(), ValueLessThan());
// record all of the values and their frequencies.
vector< pair<int, string> >::const_iterator iter = freqList.begin();
vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
for (; iter != iterEnd; ++iter)
buffer << iter->second << ":" << iter->first << ",";
result.push_back(buffer.str());
}
else if (op == "mode") {
result.push_back(vo.GetMode());
}
else if (op == "stdev" || op == "sstdev") {
// get the mean
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
// get the variance
double totalVariance = 0.0;
vector<double>::const_iterator dIt = dataF.begin();
vector<double>::const_iterator dEnd = dataF.end();
for (; dIt != dEnd; ++dIt) {
totalVariance += pow((*dIt - mean),2);
}
double variance = 0.0;
if (op == "stdev") {
variance = totalVariance / dataF.size();
}
else if (op == "sstdev" && dataF.size() > 1) {
variance = totalVariance / (dataF.size() - 1);
}
double stddev = sqrt(variance);
// report
buffer << setprecision (PRECISION) << stddev;
else if (op == "antimode") {
result.push_back(vo.GetAntiMode());
}
else if (op == "freqdesc") {
result.push_back(vo.GetFreqDesc());
}
else if (op == "freqasc") {
result.push_back(vo.GetFreqAsc());
}
else if (op == "stdev") {
buffer << setprecision (PRECISION) << vo.GetStddev();
result.push_back(buffer.str());
}
else if (op == "sstdev") {
buffer << setprecision (PRECISION) << vo.GetSstddev();
result.push_back(buffer.str());
}
}
......@@ -596,10 +465,6 @@ void addValue (const vector<string> &fromList, vector<string> &toList, int index
}
// Convert a string to float via atof(): the text is parsed as a double and
// then narrowed to float. atof() reports no errors, so text that does not
// start with a number comes back as 0.
float ToFloat (string element) {
    const double parsed = atof(element.c_str());
    return static_cast<float>(parsed);
}
void TabPrintPost (string element) {
cout << element << "\t";
}
......@@ -612,16 +477,6 @@ void CommaPrint (string element) {
cout << element << ",";
}
// Convert a string to double using stream extraction.
// If the extraction fails, an error naming the offending text is written to
// stderr and the whole program terminates with exit status 1.
double ToDouble(const string &element) {
    double value;
    std::istringstream parser(element);
    parser >> value;
    if (parser.fail()) {
        const string message =
            "Error: Could not properly convert string to numeric (\"" + element + "\")";
        cerr << message << endl;
        exit(1);
    }
    return value;
}
inline string ColumnHeaderName(const vector<string> &inFields, const size_t FieldIndex,
bool InputHaveHeaderLine)
{
......
......@@ -21,7 +21,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
# ----------------------------------
SOURCES= mapMain.cpp mapBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o chromsweep.o
_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o VectorOps.o chromsweep.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= mapBed
......@@ -41,6 +41,7 @@ $(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/VectorOps/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/
clean:
......
......@@ -25,6 +25,24 @@ double MakeDouble(const string &element) {
return x;
}
// Orders (frequency, value) pairs from highest to lowest frequency.
// Only the integer frequency takes part in the comparison; the value string
// is ignored, so entries sharing a frequency are treated as equivalent.
struct ValueGreaterThan
{
    bool operator()(const pair<int, string> &a,
                    const pair<int, string> &b) const
    {
        const int countA = a.first;
        const int countB = b.first;
        return countA > countB;
    }
};
// Orders (frequency, value) pairs from lowest to highest frequency.
// Only the integer frequency takes part in the comparison; the value string
// is ignored, so entries sharing a frequency are treated as equivalent.
struct ValueLessThan
{
    bool operator()(const pair<int, string> &a,
                    const pair<int, string> &b) const
    {
        const int countA = a.first;
        const int countB = b.first;
        return countA < countB;
    }
};
// Constructor
VectorOps::VectorOps(const vector<string> &vec)
: _vecs(vec)
......@@ -52,6 +70,34 @@ double VectorOps::GetMean(void)
return accumulate(_vecd.begin(), _vecd.end(), 0.0) / _size;
}
// Population standard deviation of the numeric values held in _vecd:
// the square root of the mean squared deviation from GetMean(), dividing
// by the number of elements.
// NOTE(review): GetMean() is called before _vecd is read; presumably it is
// what fills _vecd from the string input -- confirm against GetMean().
double VectorOps::GetStddev(void)
{
    const double avg = GetMean();

    // accumulate the squared deviations from the mean
    double squaredDevSum = 0.0;
    for (size_t idx = 0; idx < _vecd.size(); ++idx) {
        squaredDevSum += pow((_vecd[idx] - avg), 2);
    }

    return sqrt(squaredDevSum / _vecd.size());
}
// Sample standard deviation of the numeric values held in _vecd:
// the square root of the summed squared deviations from GetMean() divided
// by (n - 1).
//
// Returns 0 when fewer than two values are present. Without that guard the
// (n - 1) divisor is zero for a single value (yielding NaN) and wraps around
// to a huge unsigned number for an empty vector.
// NOTE(review): GetMean() is called before _vecd is inspected; presumably it
// is what fills _vecd from the string input -- confirm against GetMean().
double VectorOps::GetSstddev(void)
{
    const double mean = GetMean();

    // a sample deviation is undefined for fewer than two observations
    const size_t count = _vecd.size();
    if (count < 2) {
        return 0.0;
    }

    // get the variance
    double totalVariance = 0.0;
    vector<double>::const_iterator dIt  = _vecd.begin();
    vector<double>::const_iterator dEnd = _vecd.end();
    for (; dIt != dEnd; ++dIt) {
        totalVariance += pow((*dIt - mean), 2);
    }
    const double variance = totalVariance / (count - 1);
    return sqrt(variance);
}
double VectorOps::GetMedian(void)
{
// convert the vec of strings to a vec of doubles
......@@ -169,6 +215,14 @@ string VectorOps::GetCollapse(void)
return collapse.str();
}
// Join every element of _vecs, in order, into one string with no separator
// between the elements. An empty _vecs yields an empty string.
string VectorOps::GetConcat(void)
{
    string joined;
    vector<string>::const_iterator it = _vecs.begin();
    const vector<string>::const_iterator last = _vecs.end();
    while (it != last) {
        joined += *it;
        ++it;
    }
    return joined;
}
string VectorOps::GetDistinct(void)
{
ostringstream distinct;
......@@ -183,4 +237,75 @@ string VectorOps::GetDistinct(void)
distinct << _vecs[i];
}
return distinct.str();
}
// Build a histogram of the values in _vecs, most frequent first.
//
// Returns a string of "value:count," entries, one per distinct value. Every
// entry, including the last, is followed by a comma; that trailing comma is
// kept on purpose so the output format does not change.
// Values that occur equally often are emitted in the key order of the
// std::map used for counting (ascending string order), because a stable sort
// is used and the comparator looks only at the counts.
string VectorOps::GetFreqDesc(void)
{
    // compute the frequency of each unique value
    map<string, int> freqs;
    vector<string>::const_iterator dIt  = _vecs.begin();
    vector<string>::const_iterator dEnd = _vecs.end();
    for (; dIt != dEnd; ++dIt) {
        freqs[*dIt]++;
    }

    // create a list of pairs holding the number of occurrences (first)
    // and the observed value itself (second)
    vector< pair<int, string> > freqList;
    freqList.reserve(freqs.size());
    map<string,int>::const_iterator mapIter = freqs.begin();
    map<string,int>::const_iterator mapEnd  = freqs.end();
    for (; mapIter != mapEnd; ++mapIter)
        freqList.push_back( make_pair(mapIter->second, mapIter->first) );

    // order by descending frequency; stable_sort keeps ties deterministic
    stable_sort(freqList.begin(), freqList.end(), ValueGreaterThan());

    // record all of the values and their frequencies.
    ostringstream buffer;
    vector< pair<int, string> >::const_iterator iter    = freqList.begin();
    vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
    for (; iter != iterEnd; ++iter)
        buffer << iter->second << ":" << iter->first << ",";
    return buffer.str();
}
// Build a histogram of the values in _vecs, least frequent first.
//
// Returns a string of "value:count," entries, one per distinct value. Every
// entry, including the last, is followed by a comma; that trailing comma is
// kept on purpose so the output format does not change.
// Values that occur equally often are emitted in the key order of the
// std::map used for counting (ascending string order), because a stable sort
// is used and the comparator looks only at the counts.
string VectorOps::GetFreqAsc(void)
{
    // compute the frequency of each unique value
    map<string, int> freqs;
    vector<string>::const_iterator dIt  = _vecs.begin();
    vector<string>::const_iterator dEnd = _vecs.end();
    for (; dIt != dEnd; ++dIt) {
        freqs[*dIt]++;
    }

    // create a list of pairs holding the number of occurrences (first)
    // and the observed value itself (second)
    vector< pair<int, string> > freqList;
    freqList.reserve(freqs.size());
    map<string,int>::const_iterator mapIter = freqs.begin();
    map<string,int>::const_iterator mapEnd  = freqs.end();
    for (; mapIter != mapEnd; ++mapIter)
        freqList.push_back( make_pair(mapIter->second, mapIter->first) );

    // order by ascending frequency; stable_sort keeps ties deterministic
    stable_sort(freqList.begin(), freqList.end(), ValueLessThan());

    // record all of the values and their frequencies.
    ostringstream buffer;
    vector< pair<int, string> >::const_iterator iter    = freqList.begin();
    vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
    for (; iter != iterEnd; ++iter)
        buffer << iter->second << ":" << iter->first << ",";
    return buffer.str();
}
\ No newline at end of file
......@@ -13,6 +13,8 @@
#define VECTOROPS_H
#include <vector>
#include <map>
#include <math.h>
#include <string>
#include <sstream>
#include <algorithm>
......@@ -38,16 +40,20 @@ public:
// user-interface
double GetSum(void); // return the total of the values in the vector
double GetMean(void); // return the average value in the vector
double GetStddev(void); // return the standard deviation
double GetSstddev(void); // return the sample standard deviation
double GetMedian(void); // return the median value in the vector
string GetMode(void); // return the most common value in the vector
string GetAntiMode(void); // return the least common value in the vector
double GetMin(void); // return the minimum element of the vector
double GetMax(void); // return the maximum element of the vector
uint32_t GetCount(void); // return the count of element in the vector
uint32_t GetCountDistinct(void); // return the count of _unique_ elements in the list
uint32_t GetCountDistinct(void); // return the count of _unique_ elements in the vector
string GetCollapse(void); // return a comma-separated list of elements
string GetConcat(void); // return a concatenation of all elements in the vector
string GetDistinct(void); // return a comma-separated list of the _unique_ elements
string GetFreqDesc(void); // return a histogram of values and their frequencies in desc. order of frequency
string GetFreqAsc(void); // return a histogram of values and their frequencies in asc. order of frequency
private:
vector<string> _vecs;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment