Commit 1549933f authored by Aaron
Browse files

add "first" and "last" as new groupby options

parent 35bf035e
...@@ -42,10 +42,28 @@ const int PRECISION = 21; ...@@ -42,10 +42,28 @@ const int PRECISION = 21;
// function declarations // function declarations
void groupby_help(void); void groupby_help(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool printOriginalLine, const bool printHeaderLine, const bool InputHaveHeaderLine, const bool ignoreCase); void GroupBy(const string &inFile, const vector<int> &groupColumns,
void PrintHeaderLine(const vector<string> &InputFields, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine); const vector<int> &opColumns, const vector<string> &ops,
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops); const bool printOriginalLine, const bool printHeaderLine,
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum, const bool ignoreCase); const bool InputHaveHeaderLine, const bool ignoreCase);
void PrintHeaderLine(const vector<string> &InputFields,
const vector<int> &groupColumns,
const vector<int> &opColumns,
const vector<string> &ops,
const bool PrintFullInputLine,
const bool InputHaveHeaderLine);
void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops);
void addValue (const vector<string> &fromList,
vector<string> &toList,
int index,
int lineNum,
const bool ignoreCase);
void TabPrintPost (string element); void TabPrintPost (string element);
void TabPrintPre (string element); void TabPrintPre (string element);
void CommaPrint (string element); void CommaPrint (string element);
...@@ -92,9 +110,13 @@ int groupby_main(int argc, char* argv[]) { ...@@ -92,9 +110,13 @@ int groupby_main(int argc, char* argv[]) {
i++; i++;
} }
} }
else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) { else if (PARAMETER_CHECK("-grp", 4, parameterLength) ||
PARAMETER_CHECK("-g", 2, parameterLength))
{
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl; cerr << endl
<< "*****ERROR: -grp parameter requires a value."
<< endl << endl;
groupby_help(); groupby_help();
break; break;
} }
...@@ -103,9 +125,13 @@ int groupby_main(int argc, char* argv[]) { ...@@ -103,9 +125,13 @@ int groupby_main(int argc, char* argv[]) {
i++; i++;
} }
} }
else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) { else if(PARAMETER_CHECK("-opCols", 7, parameterLength) ||
PARAMETER_CHECK("-c", 2, parameterLength))
{
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl; cerr << endl
<< "*****ERROR: -opCols parameter requires a value."
<< endl << endl;
groupby_help(); groupby_help();
break; break;
} }
...@@ -115,9 +141,13 @@ int groupby_main(int argc, char* argv[]) { ...@@ -115,9 +141,13 @@ int groupby_main(int argc, char* argv[]) {
i++; i++;
} }
} }
else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) { else if(PARAMETER_CHECK("-ops", 4, parameterLength) ||
PARAMETER_CHECK("-o", 2, parameterLength))
{
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl; cerr << endl
<< "*****ERROR: -ops parameter requires a value."
<< endl << endl;
groupby_help(); groupby_help();
break; break;
} }
...@@ -144,13 +174,22 @@ int groupby_main(int argc, char* argv[]) { ...@@ -144,13 +174,22 @@ int groupby_main(int argc, char* argv[]) {
ignoreCase = true; ignoreCase = true;
} }
else { else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; cerr << endl
<< "*****ERROR: Unrecognized parameter: "
<< argv[i]
<< " *****" << endl << endl;
showHelp = true; showHelp = true;
} }
} }
if (!haveOpColumns) { if (!haveOpColumns) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl; cerr << endl
<< "*****"
<< endl
<< "*****ERROR: Need -opCols."
<< endl
<< "*****"
<< endl;
showHelp = true; showHelp = true;
} }
// split the opsString into discrete operations and make sure they are all valid. // split the opsString into discrete operations and make sure they are all valid.
...@@ -158,12 +197,24 @@ int groupby_main(int argc, char* argv[]) { ...@@ -158,12 +197,24 @@ int groupby_main(int argc, char* argv[]) {
opsString.erase(remove_if(opsString.begin(),opsString.end(),::isspace),opsString.end()); opsString.erase(remove_if(opsString.begin(),opsString.end(),::isspace),opsString.end());
Tokenize(opsString, ops, ','); Tokenize(opsString, ops, ',');
for( size_t i = 0; i < ops.size(); i++ ) { for( size_t i = 0; i < ops.size(); i++ ) {
if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") && if ((ops[i] != "sum") && (ops[i] != "max") &&
(ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") && (ops[i] != "min") && (ops[i] != "mean") &&
(ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "count_distinct") && (ops[i] != "collapse") && (ops[i] != "distinct") && (ops[i] != "mode") && (ops[i] != "median") &&
(ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc")) (ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") &&
(ops[i] != "count_distinct") && (ops[i] != "collapse") &&
(ops[i] != "distinct") && (ops[i] != "concat") &&
(ops[i] != "freqdesc") && (ops[i] != "freqasc") &&
(ops[i] != "first") && (ops[i] != "last") )
{ {
cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\" *****" << endl; cerr << endl
<< "*****"
<< endl
<< "*****ERROR: Invalid operation selection \""
<< ops[i]
<< endl
<< "\" *****"
<< endl;
showHelp = true; showHelp = true;
} }
} }
...@@ -181,7 +232,12 @@ int groupby_main(int argc, char* argv[]) { ...@@ -181,7 +232,12 @@ int groupby_main(int argc, char* argv[]) {
for(size_t i = 0; i < groupColumnsInt.size(); ++i) { for(size_t i = 0; i < groupColumnsInt.size(); ++i) {
int groupColumnInt = groupColumnsInt[i]; int groupColumnInt = groupColumnsInt[i];
if (groupColumnInt < 1) { if (groupColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; cerr << endl
<< "*****"
<< endl
<< "*****ERROR: group columns must be >=1. "
<< endl
<< "*****" << endl;
groupby_help(); groupby_help();
} }
} }
...@@ -190,14 +246,26 @@ int groupby_main(int argc, char* argv[]) { ...@@ -190,14 +246,26 @@ int groupby_main(int argc, char* argv[]) {
for(size_t i = 0; i < opColumnsInt.size(); ++i) { for(size_t i = 0; i < opColumnsInt.size(); ++i) {
int opColumnInt = opColumnsInt[i]; int opColumnInt = opColumnsInt[i];
if (opColumnInt < 1) { if (opColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; cerr << endl
<< "*****"
<< endl
<< "*****ERROR: op columns must be >=1. "
<< endl
<< "*****"
<< endl;
groupby_help(); groupby_help();
} }
} }
// sanity check that there are equal number of opColumns and ops // sanity check that there are equal number of opColumns and ops
if (ops.size() != opColumnsInt.size()) { if (ops.size() != opColumnsInt.size()) {
cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl; cerr << endl
<< "*****"
<< endl
<< "*****ERROR: There must be equal num. of ops and opCols. "
<< endl
<< "*****"
<< endl;
groupby_help(); groupby_help();
} }
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops, GroupBy(inFile, groupColumnsInt, opColumnsInt, ops,
...@@ -227,7 +295,7 @@ void groupby_help(void) { ...@@ -227,7 +295,7 @@ void groupby_help(void) {
cerr << "\t\t\tThe columns must be comma separated." << endl; cerr << "\t\t\tThe columns must be comma separated." << endl;
cerr << "\t\t\t- Default: 1,2,3" << endl << endl; cerr << "\t\t\t- Default: 1,2,3" << endl << endl;
cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl; cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl;
cerr << "\t\t\t- Required." << endl << endl; cerr << "\t\t\t- Required." << endl << endl;
cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl; cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl;
...@@ -240,6 +308,9 @@ void groupby_help(void) { ...@@ -240,6 +308,9 @@ void groupby_help(void) {
cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl; cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl; cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl; cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
cerr << "\t\t\t first (i.e., print first value)" << endl;
cerr << "\t\t\t last (i.e., print last value)" << endl;
cerr << "\t\t\t- Default: sum" << endl << endl; cerr << "\t\t\t- Default: sum" << endl << endl;
cerr << "\t-full\t\t" << "Print all columns from input file. The first line in the group is used." << endl; cerr << "\t-full\t\t" << "Print all columns from input file. The first line in the group is used." << endl;
...@@ -322,7 +393,9 @@ void GroupBy (const string &inFile, ...@@ -322,7 +393,9 @@ void GroupBy (const string &inFile,
// fields change // fields change
TabFile *_tab = new TabFile(inFile); TabFile *_tab = new TabFile(inFile);
_tab->Open(); _tab->Open();
while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) !=
TAB_INVALID)
{
if ((tabLineStatus == TAB_VALID) || (tabLineStatus == TAB_HEADER)) { if ((tabLineStatus == TAB_VALID) || (tabLineStatus == TAB_HEADER)) {
if (first_line) { if (first_line) {
...@@ -350,19 +423,24 @@ void GroupBy (const string &inFile, ...@@ -350,19 +423,24 @@ void GroupBy (const string &inFile,
// there has been a group change // there has been a group change
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) { if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group // Summarize this group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup, values, ops); ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup,
values, ops);
// reset and add the first value for the next group. // reset and add the first value for the next group.
values.clear(); values.clear();
for( size_t i = 0; i < opColumns.size(); i++ ) { for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() ); values.push_back( vector<string>() );
addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); addValue(inFields, values[i],
opColumns[i]-1, lineNum,
ignoreCase);
} }
inFieldsFirstLineInGroup = inFields; inFieldsFirstLineInGroup = inFields;
} }
// we're still dealing with the same group // we're still dealing with the same group
else { else {
for( size_t i = 0; i < opColumns.size(); i++ ) for( size_t i = 0; i < opColumns.size(); i++ )
addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); addValue(inFields, values[i],
opColumns[i]-1, lineNum,
ignoreCase);
} }
// reset for the next line // reset for the next line
prevGroup = currGroup; prevGroup = currGroup;
...@@ -370,15 +448,20 @@ void GroupBy (const string &inFile, ...@@ -370,15 +448,20 @@ void GroupBy (const string &inFile,
inFields.clear(); inFields.clear();
} }
// report the last group // report the last group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup, values, ops); ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup,
values, ops);
_tab->Close(); _tab->Close();
} }
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) { void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops)
{
vector<string> result; vector<string> result;
for( size_t i = 0; i < data.size(); i++ ) { for( size_t i = 0; i < data.size(); i++ )
{
if (data[i].empty()) if (data[i].empty())
continue; continue;
...@@ -444,6 +527,12 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d ...@@ -444,6 +527,12 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d
buffer << setprecision (PRECISION) << vo.GetSstddev(); buffer << setprecision (PRECISION) << vo.GetSstddev();
result.push_back(buffer.str()); result.push_back(buffer.str());
} }
else if (op == "first") {
result.push_back(vo.GetFirst());
}
else if (op == "last") {
result.push_back(vo.GetLast());
}
} }
if (!result.empty()) { if (!result.empty()) {
for_each(group.begin(), group.end(), TabPrintPost); for_each(group.begin(), group.end(), TabPrintPost);
...@@ -462,8 +551,15 @@ void addValue (const vector<string> &fromList, vector<string> &toList, int index ...@@ -462,8 +551,15 @@ void addValue (const vector<string> &fromList, vector<string> &toList, int index
toList.push_back(s); toList.push_back(s);
} }
catch(std::out_of_range& e) { catch(std::out_of_range& e) {
cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " cerr << endl
<< lineNum << ". Exiting." << endl << "*****" << endl; << "*****"
<< endl
<< "*****ERROR: requested column exceeds the number of columns in file at line "
<< lineNum
<< ". Exiting."
<< endl
<< "*****"
<< endl;
exit(1); exit(1);
} }
} }
...@@ -505,17 +601,20 @@ void PrintHeaderLine(const vector<string> &inFields, ...@@ -505,17 +601,20 @@ void PrintHeaderLine(const vector<string> &inFields,
if (PrintFullInputLine) { if (PrintFullInputLine) {
//All input columns //All input columns
for (size_t i=0;i<inFields.size();++i) for (size_t i=0;i<inFields.size();++i)
header.push_back( ColumnHeaderName(inFields, i+1, InputHaveHeaderLine) ); header.push_back( ColumnHeaderName(inFields, i+1,
InputHaveHeaderLine) );
} else { } else {
//Only the columns that are actually used in the grouped operations //Only the columns that are actually used in the grouped operations
for (size_t i=0;i<groupColumns.size();++i) for (size_t i=0;i<groupColumns.size();++i)
header.push_back( ColumnHeaderName(inFields, groupColumns[i], InputHaveHeaderLine) ); header.push_back( ColumnHeaderName(inFields, groupColumns[i],
InputHaveHeaderLine) );
} }
//Header fields of output columns, by operation //Header fields of output columns, by operation
for (size_t i=0; i<opColumns.size();++i) { for (size_t i=0; i<opColumns.size();++i) {
stringstream s; stringstream s;
s << ops[i] << "(" << ColumnHeaderName(inFields, opColumns[i], InputHaveHeaderLine) << ")"; s << ops[i] << "(" << ColumnHeaderName(inFields, opColumns[i],
InputHaveHeaderLine) << ")";
header.push_back(s.str()); header.push_back(s.str());
} }
......
...@@ -313,3 +313,17 @@ string VectorOps::GetFreqAsc(void) ...@@ -313,3 +313,17 @@ string VectorOps::GetFreqAsc(void)
return buffer.str(); return buffer.str();
} }
// Return the first value held in the string vector (_vecs).
//
// Returns an empty string when _vecs holds no elements; indexing an
// empty vector would otherwise be undefined behavior.
string VectorOps::GetFirst(void)
{
    if (_vecs.empty())
        return string();
    return _vecs.front();
}
// Return the last value held in the string vector (_vecs).
//
// Returns an empty string when _vecs holds no elements; computing
// size() - 1 on an empty vector would wrap around to a huge unsigned
// index and read out of bounds.
string VectorOps::GetLast(void)
{
    if (_vecs.empty())
        return string();
    return _vecs.back();
}
...@@ -40,23 +40,44 @@ public: ...@@ -40,23 +40,44 @@ public:
~VectorOps(void); ~VectorOps(void);
// user-interface // user-interface
double GetSum(void); // return the total of the values in the vector
double GetMean(void); // return the average value in the vector // return the total of the values in the vector
double GetStddev(void); // return the standard deviation double GetSum(void);
double GetSstddev(void); // return the sample standard deviation // return the average value in the vector
double GetMedian(void); // return the median value in the vector double GetMean(void);
string GetMode(void); // return the most common value in the vector // return the standard deviation
string GetAntiMode(void); // return the least common value in the vector double GetStddev(void);
double GetMin(void); // return the minimum element of the vector // return the sample standard deviation
double GetMax(void); // return the maximum element of the vector double GetSstddev(void);
uint32_t GetCount(void); // return the count of element in the vector // return the median value in the vector
uint32_t GetCountDistinct(void); // return a the count of _unique_ elements in the vector double GetMedian(void);
string GetCollapse(void); // return a comma-separated list of elements // return the most common value in the vector
string GetConcat(void); // return a concatenation of all elements in the vector string GetMode(void);
string GetDistinct(void); // return a comma-separated list of the _unique_ elements // return the least common value in the vector
string GetFreqDesc(void); // return a histogram of values and their frequencies in desc. order of frequency string GetAntiMode(void);
string GetFreqAsc(void); // return a histogram of values and their frequencies in asc. order of frequency // return the minimum element of the vector
double GetMin(void);
// return the maximum element of the vector
double GetMax(void);
// return the count of element in the vector
uint32_t GetCount(void);
// return a the count of _unique_ elements in the vector
uint32_t GetCountDistinct(void);
// return a comma-separated list of elements
string GetCollapse(void);
// return a concatenation of all elements in the vector
string GetConcat(void);
// return a comma-separated list of the _unique_ elements
string GetDistinct(void);
// return a histogram of values and their freqs. in desc. order of frequency
string GetFreqDesc(void);
// return a histogram of values and their freqs. in asc. order of frequency
string GetFreqAsc(void);
// return the first value in the list
string GetFirst(void);
// return the last value in the list
string GetLast(void);
private: private:
vector<string> _vecs; vector<string> _vecs;
vector<double> _vecd; vector<double> _vecd;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment