Commit 8b8a9cd5 authored by Neil Kindlon
Browse files

Added distinct_sort_num option to VectorOps for groupBy.

parent ec745749
...@@ -83,8 +83,7 @@ int groupby_main(int argc, char* argv[]) { ...@@ -83,8 +83,7 @@ int groupby_main(int argc, char* argv[]) {
bool InputHaveHeaderLine = false; bool InputHaveHeaderLine = false;
bool ignoreCase = false; bool ignoreCase = false;
int precision = 21; int precision = 21;
// check to see if we should print out some help
// check to see if we should print out some help
if(argc <= 1) showHelp = true; if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) { for(int i = 1; i < argc; i++) {
...@@ -186,6 +185,7 @@ int groupby_main(int argc, char* argv[]) { ...@@ -186,6 +185,7 @@ int groupby_main(int argc, char* argv[]) {
i++; i++;
} }
} }
else { else {
cerr << endl cerr << endl
<< "*****ERROR: Unrecognized parameter: " << "*****ERROR: Unrecognized parameter: "
...@@ -216,7 +216,7 @@ int groupby_main(int argc, char* argv[]) { ...@@ -216,7 +216,7 @@ int groupby_main(int argc, char* argv[]) {
(ops[i] != "antimode") && (ops[i] != "stdev") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "sstdev") && (ops[i] != "count") &&
(ops[i] != "count_distinct") && (ops[i] != "collapse") && (ops[i] != "count_distinct") && (ops[i] != "collapse") &&
(ops[i] != "distinct") && (ops[i] != "concat") && (ops[i] != "distinct") && (ops[i] != "distinct_sort_num") && (ops[i] != "concat") &&
(ops[i] != "freqdesc") && (ops[i] != "freqasc") && (ops[i] != "freqdesc") && (ops[i] != "freqasc") &&
(ops[i] != "first") && (ops[i] != "last") ) (ops[i] != "first") && (ops[i] != "last") )
{ {
...@@ -318,6 +318,7 @@ void groupby_help(void) { ...@@ -318,6 +318,7 @@ void groupby_help(void) {
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl; cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl; cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl; cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t\t distinct_sort_num (as distinct, but sorted numerically), " << endl;
cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl; cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl; cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl; cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
...@@ -473,7 +474,7 @@ void GroupBy (const string &inFile, ...@@ -473,7 +474,7 @@ void GroupBy (const string &inFile,
void ReportSummary(const vector<string> &group, void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data, const vector<vector<string> > &data,
const vector<string> &ops, const vector<string> &ops,
int precision) int precision)
{ {
vector<string> result; vector<string> result;
...@@ -497,6 +498,10 @@ void ReportSummary(const vector<string> &group, ...@@ -497,6 +498,10 @@ void ReportSummary(const vector<string> &group,
else if (op == "distinct") { else if (op == "distinct") {
result.push_back(vo.GetDistinct()); result.push_back(vo.GetDistinct());
} }
else if (op == "distinct_sort_num") {
result.push_back(vo.GetDistinctSortNum());
}
else if (op == "concat") { else if (op == "concat") {
result.push_back(vo.GetConcat()); result.push_back(vo.GetConcat());
} }
......
...@@ -267,6 +267,25 @@ string VectorOps::GetDistinct(void) ...@@ -267,6 +267,25 @@ string VectorOps::GetDistinct(void)
return distinct.str(); return distinct.str();
} }
// Return a comma-separated list of the distinct values held by this object,
// interpreted as numbers and sorted in ascending numeric order.
//
// Values are printed with up to 15 significant digits. The stream default of
// 6 would render e.g. 1000001 and 1000002 both as "1e+06", which loses
// precision and makes distinct values look like duplicates.
string VectorOps::GetDistinctSortNum(void)
{
    // Decimal digits an IEEE-754 double can represent without exposing binary
    // round-off (so a value parsed from "0.1" is still printed as "0.1").
    const int kOutputDigits = 15;

    ostringstream distinct;
    distinct.precision(kOutputDigits);

    // convert the vec of strings to a vec of doubles; the converted values
    // are appended to _vecd (any repeats are collapsed by unique() below)
    transform(_vecs.begin(), _vecs.end(), back_inserter(_vecd), MakeDouble);

    // remove duplicate entries from the vector: sort, then erase adjacent repeats
    // http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
    sort(_vecd.begin(), _vecd.end());
    _vecd.erase(unique(_vecd.begin(), _vecd.end()), _vecd.end());

    for (size_t i = 0; i < _vecd.size(); i++) {
        if (i > 0)
            distinct << ",";
        distinct << _vecd[i];
    }
    return distinct.str();
}
string VectorOps::GetFreqDesc(void) string VectorOps::GetFreqDesc(void)
{ {
// compute the frequency of each unique value // compute the frequency of each unique value
...@@ -349,5 +368,3 @@ string VectorOps::GetLast(void) ...@@ -349,5 +368,3 @@ string VectorOps::GetLast(void)
{ {
return _vecs[_vecs.size() - 1]; return _vecs[_vecs.size() - 1];
} }
...@@ -72,7 +72,9 @@ public: ...@@ -72,7 +72,9 @@ public:
// return a concatenation of all elements in the vector // return a concatenation of all elements in the vector
string GetConcat(void); string GetConcat(void);
// return a comma-separated list of the _unique_ elements // return a comma-separated list of the _unique_ elements
string GetDistinct(void); string GetDistinct(void);
// return a comma-separated list of the _unique_ elements, sorted numerically
string GetDistinctSortNum(void);
// return a histogram of values and their freqs. in desc. order of frequency // return a histogram of values and their freqs. in desc. order of frequency
string GetFreqDesc(void); string GetFreqDesc(void);
// return a histogram of values and their freqs. in asc. order of frequency // return a histogram of values and their freqs. in asc. order of frequency
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment