Commit bb139717 authored by Neil Kindlon
Browse files

Added distinct_sort_num and distinct_sort_num desc to VectorOps (and groupBy)...

Added distinct_sort_num and distinct_sort_num_desc to VectorOps (and groupBy) and KeyListOps. Added -delim to groupBy.
parent 8b8a9cd5
......@@ -42,7 +42,7 @@ void groupby_help(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns,
const vector<int> &opColumns, const vector<string> &ops,
const bool printOriginalLine, const bool printHeaderLine,
const bool InputHaveHeaderLine, const bool ignoreCase, int precision);
const bool InputHaveHeaderLine, const bool ignoreCase, int precision, const string &delim);
void PrintHeaderLine(const vector<string> &InputFields,
const vector<int> &groupColumns,
......@@ -54,7 +54,7 @@ void PrintHeaderLine(const vector<string> &InputFields,
void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops,
int precision);
int precision, const string &delim);
void addValue (const vector<string> &fromList,
vector<string> &toList,
......@@ -83,6 +83,7 @@ int groupby_main(int argc, char* argv[]) {
bool InputHaveHeaderLine = false;
bool ignoreCase = false;
int precision = 21;
string delim(",");
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
......@@ -155,6 +156,20 @@ int groupby_main(int argc, char* argv[]) {
i++;
}
}
else if(PARAMETER_CHECK("-delim", 6, parameterLength))
{
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl
<< "*****ERROR: -delim parameter requires a value."
<< endl << endl;
groupby_help();
break;
}
else {
delim = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-full", 5, parameterLength)) {
printOriginalLine = true;
}
......@@ -216,9 +231,9 @@ int groupby_main(int argc, char* argv[]) {
(ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") &&
(ops[i] != "count_distinct") && (ops[i] != "collapse") &&
(ops[i] != "distinct") && (ops[i] != "distinct_sort_num") && (ops[i] != "concat") &&
(ops[i] != "distinct") && (ops[i] != "distinct_sort_num") && (ops[i] != "distinct_sort_num_desc")&& (ops[i] != "concat") &&
(ops[i] != "freqdesc") && (ops[i] != "freqasc") &&
(ops[i] != "first") && (ops[i] != "last") )
(ops[i] != "first") && (ops[i] != "last") && (ops[i] != "delim"))
{
cerr << endl
<< "*****"
......@@ -283,7 +298,7 @@ int groupby_main(int argc, char* argv[]) {
}
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops,
printOriginalLine, printHeaderLine, InputHaveHeaderLine,
ignoreCase, precision);
ignoreCase, precision, delim);
}
else {
groupby_help();
......@@ -318,7 +333,8 @@ void groupby_help(void) {
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t\t distinct_sort_num (as distinct, but sorted numerically), " << endl;
cerr << "\t\t\t distinct_sort_num (as distinct, but sorted numerically, ascending), " << endl;
cerr << "\t\t\t distinct_sort_num_desc (as distinct, but sorted numerically, descending), " << endl;
cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
......@@ -343,6 +359,9 @@ void groupby_help(void) {
cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl;
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl;
cerr << "\t\t- Example: -delim \"|\"" << endl;
cerr << "\t\t- Default: \",\"." << endl << endl;
cerr << "Examples: " << endl;
cerr << "\t$ cat ex1.out" << endl;
......@@ -378,7 +397,7 @@ void GroupBy (const string &inFile,
const bool printHeaderLine,
const bool InputHaveHeaderLine,
const bool ignoreCase,
int precision) {
int precision, const string &delim) {
// current line number
int lineNum = 0;
......@@ -441,7 +460,7 @@ void GroupBy (const string &inFile,
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup,
values, ops, precision);
values, ops, precision, delim);
// reset and add the first value for the next group.
values.clear();
for( size_t i = 0; i < opColumns.size(); i++ ) {
......@@ -466,7 +485,7 @@ void GroupBy (const string &inFile,
}
// report the last group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup,
values, ops, precision);
values, ops, precision, delim);
_tab->Close();
}
......@@ -474,7 +493,7 @@ void GroupBy (const string &inFile,
void ReportSummary(const vector<string> &group,
const vector<vector<string> > &data,
const vector<string> &ops,
int precision)
int precision, const string &delim)
{
vector<string> result;
......@@ -487,6 +506,7 @@ void ReportSummary(const vector<string> &group,
string op = ops[i];
std::stringstream buffer;
VectorOps vo(data[i]);
vo.setDelim(delim);
if (op == "sum") {
buffer << setprecision (precision) << vo.GetSum();
......@@ -501,7 +521,9 @@ void ReportSummary(const vector<string> &group,
else if (op == "distinct_sort_num") {
result.push_back(vo.GetDistinctSortNum());
}
else if (op == "distinct_sort_num_desc") {
result.push_back(vo.GetDistinctSortNum(false));
}
else if (op == "concat") {
result.push_back(vo.GetConcat());
}
......
......@@ -28,6 +28,9 @@ _dbFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE)
_opCodes["distinct"] = DISTINCT;
_opCodes["count_distinct"] = COUNT_DISTINCT;
_opCodes["distinct_only"] = DISTINCT_ONLY;
_opCodes["distinct_sort_num"] = DISTINCT_SORT_NUM;
_opCodes["distinct_sort_num_desc"] = DISTINCT_SORT_NUM_DESC;
_opCodes["collapse"] = COLLAPSE;
_opCodes["concat"] = CONCAT;
_opCodes["freq_asc"] = FREQ_ASC;
......@@ -267,6 +270,14 @@ const QuickString & KeyListOps::getOpVals(RecordKeyVector &hits)
_outVals.append(_methods.getDistinct());
break;
case DISTINCT_SORT_NUM:
_outVals.append(_methods.getDistinctSortNum());
break;
case DISTINCT_SORT_NUM_DESC:
_outVals.append(_methods.getDistinctSortNum(false));
break;
case COUNT_DISTINCT:
_outVals.append(_methods.getCountDistinct());
break;
......@@ -340,6 +351,9 @@ void KeyListOpsHelp() {
cerr << "\t\t mean, median," << endl;
cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl;
cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl;
cerr << "\t\t distinct_sort_num (as distinct, sorted numerically, ascending)," << endl;
cerr << "\t\t distinct_sort_num_desc (as distinct, sorted numerically, desscending)," << endl;
cerr << "\t\t distinct_only (delimited list of only unique values)," << endl;
cerr << "\t\t count" << endl;
cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl;
cerr << "\t\t first (i.e., just the first value in the column), " << endl;
......
......@@ -43,7 +43,7 @@ public:
// Hand the given key list on to the _methods helper object.
void setKeyList(RecordKeyVector *keyList)
{
	_methods.setKeyList(keyList);
}
typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT,
DISTINCT_ONLY, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES;
DISTINCT_ONLY, DISTINCT_SORT_NUM, DISTINCT_SORT_NUM_DESC, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES;
// Store the given file type in _dbFileType.
void setDBfileType(FileRecordTypeChecker::FILE_TYPE type)
{
	_dbFileType = type;
}
bool isValidColumnOps(FileRecordMgr *dbFile);
......
......@@ -201,18 +201,31 @@ const QuickString &KeyListOpsMethods::getDistinct() {
// Return a delimiter-separated list of only those values that occur exactly
// once in the key list; a value that repeats is left out entirely.
// Returns _nullVal when the key list is empty.
const QuickString &KeyListOpsMethods::getDistinctOnly() {
	if (empty()) return _nullVal;

	// NOTE(review): the loop starts from _freqIter without resetting it, so
	// makeFreqMap() presumably leaves _freqIter at _freqMap.begin() -- confirm.
	makeFreqMap();
	_retStr.clear();

	// Track whether anything has been written yet. Comparing the iterator to
	// _freqMap.begin() is not enough: if the first map entry is a repeated
	// value it is skipped, and the first value actually reported would then
	// be preceded by a stray delimiter.
	bool wroteValue = false;
	for (; _freqIter != _freqMap.end(); _freqIter++) {
		if (_freqIter->second != 1) continue;
		if (wroteValue) _retStr += _delimStr;
		_retStr.append(_freqIter->first);
		wroteValue = true;
	}
	return _retStr;
}
// Return a delimiter-separated list of the distinct numeric values.
// asc selects the order passed to toArray(): ASC when true, DESC when false.
// Returns _nullVal when the key list is empty, as the sibling list
// operations in this class do.
const QuickString &KeyListOpsMethods::getDistinctSortNum(bool asc) {
	if (empty()) return _nullVal;

	// NOTE(review): presumably toArray() fills _numArray sorted in the
	// requested order -- confirm. std::unique only collapses adjacent equal
	// elements, so the de-duplication below relies on that ordering.
	toArray(true, asc ? ASC : DESC);
	vector<double>::iterator endIter = std::unique(_numArray.begin(), _numArray.end());

	_retStr.clear();
	for (vector<double>::iterator iter = _numArray.begin(); iter != endIter; ++iter) {
		if (iter != _numArray.begin()) _retStr += _delimStr;
		_retStr.append(*iter);
	}
	return _retStr;
}
// return a the count of _unique_ elements in the vector
uint32_t KeyListOpsMethods::getCountDistinct() {
if (empty()) return 0;
......
......@@ -58,7 +58,12 @@ public:
uint32_t getCountDistinct();
// return only those elements that occur once
const QuickString &getDistinctOnly();
// return a delimiter-separated list of elements
// as distinct, but sorted numerically.
const QuickString &getDistinctSortNum(bool ascending = true);
// as distinct, but sorted numerically, descending
// return a delimiter-separated list of elements
const QuickString & getCollapse(const QuickString & delimiter = ",");
// return a concatenation of all elements in the vector
const QuickString & getConcat();
......
......@@ -59,6 +59,7 @@ VectorOps::VectorOps(const vector<string> &vec)
: _vecs(vec)
, _vecd()
, _size(vec.size())
, _delimStr(",")
{
_vecd.reserve(vec.size());
}
......@@ -261,13 +262,13 @@ string VectorOps::GetDistinct(void)
for( size_t i = 0; i < _vecs.size(); i++ ) {
if (i>0)
distinct << ",";
distinct << _delimStr;
distinct << _vecs[i];
}
return distinct.str();
}
string VectorOps::GetDistinctSortNum(void)
string VectorOps::GetDistinctSortNum(bool ascending)
{
ostringstream distinct;
// remove duplicate entries from the vector
......@@ -275,12 +276,17 @@ string VectorOps::GetDistinctSortNum(void)
// convert the vec of strings to a vec of doubles
transform(_vecs.begin(), _vecs.end(), back_inserter(_vecd), MakeDouble);
sort( _vecd.begin(), _vecd.end() );
if (ascending) {
sort( _vecd.begin(), _vecd.end(), less<double>());
} else {
sort( _vecd.begin(), _vecd.end(), greater<double>());
}
_vecd.erase( unique( _vecd.begin(), _vecd.end() ), _vecd.end() );
for( size_t i = 0; i < _vecd.size(); i++ ) {
if (i>0)
distinct << ",";
distinct << _delimStr;
distinct << _vecd[i];
}
return distinct.str();
......@@ -316,7 +322,7 @@ string VectorOps::GetFreqDesc(void)
vector< pair<int, string> >::const_iterator iter = freqList.begin();
vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
for (; iter != iterEnd; ++iter)
buffer << iter->second << ":" << iter->first << ",";
buffer << iter->second << ":" << iter->first << _delimStr;
return buffer.str();
}
......@@ -352,7 +358,7 @@ string VectorOps::GetFreqAsc(void)
vector< pair<int, string> >::const_iterator iter = freqList.begin();
vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
for (; iter != iterEnd; ++iter)
buffer << iter->second << ":" << iter->first << ",";
buffer << iter->second << ":" << iter->first << _delimStr;
return buffer.str();
}
......
......@@ -40,6 +40,7 @@ public:
~VectorOps(void);
// user-interface
// Set the delimiter string written between elements of list-style results.
void setDelim(const string &str)
{
	_delimStr = str;
}
// return the total of the values in the vector
double GetSum(void);
......@@ -74,7 +75,7 @@ public:
// return a comma-separated list of the _unique_ elements
string GetDistinct(void);
// return a comma-separated list of the _unique_ elements, sorted numerically
string GetDistinctSortNum(void);
string GetDistinctSortNum(bool asc = true);
// return a histogram of values and their freqs. in desc. order of frequency
string GetFreqDesc(void);
// return a histogram of values and their freqs. in asc. order of frequency
......@@ -88,6 +89,7 @@ private:
vector<string> _vecs;
vector<double> _vecd;
uint32_t _size;
string _delimStr;
};
#endif /* VECTOROPS_H */
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment