From bcffe1009905e1211be0003581603d99bcf635ac Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 30 Dec 2011 13:44:17 -0500 Subject: [PATCH] Added groupby back into the bedtools package. --- Makefile | 3 +- scripts/makeBashScripts.py | 5 +- src/bedtools.cpp | 4 +- src/groupBy/Makefile | 43 +++ src/groupBy/groupBy.cpp | 666 +++++++++++++++++++++++++++++++++++++ 5 files changed, 717 insertions(+), 4 deletions(-) create mode 100644 src/groupBy/Makefile create mode 100644 src/groupBy/groupBy.cpp diff --git a/Makefile b/Makefile index c1cdc30a..bb5c26a5 100644 --- a/Makefile +++ b/Makefile @@ -29,6 +29,8 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/fastaFromBed \ $(SRC_DIR)/flankBed \ $(SRC_DIR)/genomeCoverageBed \ + $(SRC_DIR)/getOverlap \ + $(SRC_DIR)/groupBy \ $(SRC_DIR)/intersectBed \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ @@ -36,7 +38,6 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/multiIntersectBed \ $(SRC_DIR)/nucBed \ - $(SRC_DIR)/getOverlap \ $(SRC_DIR)/pairToBed \ $(SRC_DIR)/pairToPair \ $(SRC_DIR)/shuffleBed \ diff --git a/scripts/makeBashScripts.py b/scripts/makeBashScripts.py index 3e9c4128..0eb86757 100644 --- a/scripts/makeBashScripts.py +++ b/scripts/makeBashScripts.py @@ -15,12 +15,13 @@ def main(): 'bedpetobam': 'bedpeToBam', 'bedtobam': 'bedToBam', 'closest': 'closestBed', - 'cluster': 'clusterBed', + 'cluster': 'clusterBed', 'complement': 'complementBed', 'coverage': 'coverageBed', 'flank': 'flankBed', 'genomecov': 'genomeCoverageBed', - 'getfasta': 'fastaFromBed', + 'getfasta': 'fastaFromBed', + 'groupby': 'groupBy', 'igv': 'bedToIgv', 'intersect': 'intersectBed', 'links': 'linksBed', diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 5e294a9c..714491ba 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -48,6 +48,7 @@ int fastafrombed_main(int argc, char* argv[]);// int flank_main(int argc, char* argv[]); // int genomecoverage_main(int argc, char* argv[]);// int getoverlap_main(int argc, char* argv[]);// +int groupby_main(int argc, char* argv[]);// int intersect_main(int argc, char* argv[]); // int links_main(int argc, char* argv[]);// int maskfastafrombed_main(int argc, char* argv[]);// @@ -120,7 +121,7 @@ int main(int argc, char *argv[]) else if (sub_cmd == "igv") return bedtoigv_main(argc-1, argv+1); else if (sub_cmd == "links") return links_main(argc-1, argv+1); else if (sub_cmd == "makewindows") return windowmaker_main(argc-1, argv+1); - + else if (sub_cmd == "groupby") return groupby_main(argc-1, argv+1); // help else if (sub_cmd == "-h" || sub_cmd == "--help" || sub_cmd == "-help") @@ -216,6 +217,7 @@ int bedtools_help(void) cout << " igv " << "Create an IGV snapshot batch script.\n"; cout << " links " << "Create a HTML page of links to UCSC locations.\n"; cout << " makewindows " << "Make interval \"windows\" across a genome.\n"; + cout << " groupby " << "Group by common cols. & summarize oth. cols. (~ SQL \"groupBy\")\n"; cout << endl; cout << " -General help:\n"; diff --git a/src/groupBy/Makefile b/src/groupBy/Makefile new file mode 100644 index 00000000..be90da80 --- /dev/null +++ b/src/groupBy/Makefile @@ -0,0 +1,43 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= groupBy.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=tabFile.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/tabFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp new file mode 100644 index 00000000..38c463dd --- /dev/null +++ b/src/groupBy/groupBy.cpp @@ -0,0 +1,666 @@ +/***************************************************************************** +groupBy.cpp + +(c) 2009, 2010, 2011 - Aaron Quinlan +Center for Public Health Genomics +University of Virginia +aaronquinlan@gmail.com + +Licenced under the MIT license. +******************************************************************************/ +#include <vector> +#include <map> +#include <numeric> +#include <algorithm> +#include <iterator> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <sstream> +#include <stdlib.h> +#include <math.h> +#include <limits.h> +#include <string.h> +#include <exception> +#include <stdexcept> // out_of_range exception + +#include "version.h" +#include "lineFileUtilities.h" +#include "tabFile.h" +using namespace std; + + +const int PRECISION = 21; + + +// define our program name +#define PROGRAM_NAME "bedtools groupby" +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) ((strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)) +#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-') + + +struct ValueGreaterThan +{ + bool operator()( const vector< pair<int, string> >::value_type& lhs, + const vector< pair<int, string> >::value_type& rhs ) const + { + return lhs.first > rhs.first; + } +}; + +struct ValueLessThan +{ + bool operator()( const vector< pair<int, string> >::value_type& lhs, + const vector< pair<int, string> >::value_type& rhs ) const + { + return lhs.first < rhs.first; + } +}; + +// function declarations +void groupby_help(void); +void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool printOriginalLine, const bool printHeaderLine, const bool InputHaveHeaderLine, const bool ignoreCase); +void PrintHeaderLine(const vector<string> &InputFields, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine); +void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops); +void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum, const bool ignoreCase); +float ToFloat (string element); +double ToDouble(const string &element); +void TabPrintPost (string element); +void TabPrintPre (string element); +void CommaPrint (string element); + +int groupby_main(int argc, char* argv[]) { + + // input files + string inFile = "stdin"; + string groupColumnsString = "1,2,3"; + string opsColumnString; + string opsString; + + // our configuration variables + bool showHelp = false; + bool haveOpColumns = false; + bool haveOps = true; + bool printOriginalLine = false; + bool printHeaderLine = false; + bool InputHaveHeaderLine = false; + bool ignoreCase = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) groupby_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -i parameter requires a value." << endl << endl; + groupby_help(); + break; + } + else { + inFile = argv[i + 1]; + i++; + } + } + else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl; + groupby_help(); + break; + } + else { + groupColumnsString = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) { + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl; + groupby_help(); + break; + } + else { + haveOpColumns = true; + opsColumnString = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) { + if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) { + cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl; + groupby_help(); + break; + } + else { + haveOps = true; + opsString = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-full", 5, parameterLength)) { + printOriginalLine = true; + } + else if(PARAMETER_CHECK("-outheader", 10, parameterLength)) { + printHeaderLine = true; + } + else if(PARAMETER_CHECK("-inheader", 9, parameterLength)) { + InputHaveHeaderLine = true; + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + InputHaveHeaderLine = true; + printHeaderLine = true; + } + else if(PARAMETER_CHECK("-ignorecase", 11, parameterLength)) { + ignoreCase = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + if (!haveOpColumns) { + cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl; + showHelp = true; + } + // split the opsString into discrete operations and make sure they are all valid. + vector<string> ops; + opsString.erase(remove_if(opsString.begin(),opsString.end(),::isspace),opsString.end()); + Tokenize(opsString, ops, ","); + for( size_t i = 0; i < ops.size(); i++ ) { + if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") && + (ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") && + (ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "distinct") && + (ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc")) + { + cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\" *****" << endl; + showHelp = true; + } + } + if (!showHelp) { + + // Split the column string sent by the user into discrete column numbers + // A comma separated string is expected. + vector<int> groupColumnsInt; + Tokenize(groupColumnsString, groupColumnsInt, ","); + + vector<int> opColumnsInt; + Tokenize(opsColumnString, opColumnsInt, ","); + + // sanity check the group columns + for(size_t i = 0; i < groupColumnsInt.size(); ++i) { + int groupColumnInt = groupColumnsInt[i]; + if (groupColumnInt < 1) { + cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl; + groupby_help(); + } + } + + // sanity check the op columns + for(size_t i = 0; i < opColumnsInt.size(); ++i) { + int opColumnInt = opColumnsInt[i]; + if (opColumnInt < 1) { + cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl; + groupby_help(); + } + } + + // sanity check that there are equal number of opColumns and ops + if (ops.size() != opColumnsInt.size()) { + cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl; + groupby_help(); + } + GroupBy(inFile, groupColumnsInt, opColumnsInt, ops, + printOriginalLine, printHeaderLine, InputHaveHeaderLine, + ignoreCase); + } + else { + groupby_help(); + } + return 0; +} + +void groupby_help(void) { + + cerr << "\nTool: bedtools intersect (aka intersectBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Summarizes a dataset column based upon" << endl; + cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl; + + cerr << "Usage:\t " << PROGRAM_NAME << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl; + cerr << "\t " << "cat [FILE] | " << PROGRAM_NAME << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-i\t\t" << "Input file. Assumes \"stdin\" if omitted." << endl << endl; + + cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl; + cerr << "\t\t\tThe columns must be comma separated." << endl; + cerr << "\t\t\t- Default: 1,2,3" << endl << endl; + + cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl; + cerr << "\t\t\t- Required." << endl << endl; + + cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl; + cerr << "\t\t\tValid operations:" << endl; + cerr << "\t\t\t sum, count, min, max," << endl; + cerr << "\t\t\t mean, median, mode, antimode," << endl; + cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl; + cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl; + cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl; + cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl; + cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl; + cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl; + cerr << "\t\t\t- Default: sum" << endl << endl; + + cerr << "\t-full\t\t" << "Print all columns from input file." << endl; + cerr << "\t\t\tDefault: print only grouped columns." << endl << endl; + + cerr << "\t-inheader\t" << "Input file has a header line - the first line will be ignored." << endl << endl ; + + cerr << "\t-outheader\t" << "Print header line in the output, detailing the column names. " << endl; + cerr << "\t\t\tIf the input file has headers (-inheader), the output file" << endl; + cerr << "\t\t\twill use the input's column names." << endl; + cerr << "\t\t\tIf the input file has no headers, the output file" << endl; + cerr << "\t\t\twill use \"col_1\", \"col_2\", etc. as the column names." << endl << endl; + + cerr << "\t-header\t\t" << "same as '-inheader -outheader'" << endl << endl; + + cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl; + + cerr << "Examples: " << endl; + cerr << "\t$ cat ex1.out" << endl; + cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000 ATAT" << endl; + cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000 CGCG" << endl << endl; + cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl; + cerr << "\tchr1 10 20 A 11000" << endl << endl; + cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl; + cerr << "\tchr1 10 20 A 11000 10000" << endl << endl; + cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl; + cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl; + cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl; + cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl; + cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl; + cerr << "\tchr1 10 20 A ATATCGCG" << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl; + cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." << endl << endl; + + + // end the program here + exit(1); + +} + + +void GroupBy (const string &inFile, + const vector<int> &groupColumns, + const vector<int> &opColumns, + const vector<string> &ops, + const bool printOriginalLine, + const bool printHeaderLine, + const bool InputHaveHeaderLine, + const bool ignoreCase) { + + // current line number + int lineNum = 0; + // string representing current line + string inLine; + + // vector of strings holding the tokenized current line + vector<string> inFields; + vector<string> inFieldsFirstLineInGroup; + inFields.reserve(20); + + // keys for the current and previous group + vector<string> prevGroup(0); + vector<string> currGroup(0); + + // vector (one per column) of vector (one per value/column) of the opColumn values for the current group + vector< vector<string> > values; + for( size_t i = 0; i < opColumns.size(); i++ ) { + values.push_back( vector<string>() ); + } + + bool first_line = true; + + // check the status of the current line + TabLineStatus tabLineStatus; + + // open a new tab file, loop through it line by line + // and summarize the data for a given group when the group + // fields change + TabFile *_tab = new TabFile(inFile); + _tab->Open(); + while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) { + if (tabLineStatus == TAB_VALID) { + + if (first_line) { + first_line = false; + if (printHeaderLine) + PrintHeaderLine(inFields, groupColumns, opColumns, ops, + printOriginalLine, InputHaveHeaderLine); + + if (InputHaveHeaderLine) { + inFields.clear(); + continue; // no need to process this line - it's the header + } + } + + if (inFieldsFirstLineInGroup.empty()) //first line in file? - save it + inFieldsFirstLineInGroup = inFields; + + // build the group vector for the current line + currGroup.clear(); + vector<int>::const_iterator gIt = groupColumns.begin(); + vector<int>::const_iterator gEnd = groupColumns.end(); + for (; gIt != gEnd; ++gIt) + addValue(inFields, currGroup, (*gIt-1), lineNum, ignoreCase); + + // there has been a group change + if ((currGroup != prevGroup) && (prevGroup.size() > 0)) { + // Summarize this group + ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup, values, ops); + // reset and add the first value for the next group. + values.clear(); + for( size_t i = 0; i < opColumns.size(); i++ ) { + values.push_back( vector<string>() ); + addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); + } + inFieldsFirstLineInGroup = inFields; + } + // we're still dealing with the same group + else { + for( size_t i = 0; i < opColumns.size(); i++ ) + addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase); + } + // reset for the next line + prevGroup = currGroup; + } + inFields.clear(); + } + // report the last group + ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup, values, ops); + _tab->Close(); +} + + +void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) { + + vector<string> result; + for( size_t i = 0; i < data.size(); i++ ) { + + string op = ops[i]; + std::stringstream buffer; + vector<double> dataF; + // are we doing a numeric conversion? if so, convert the strings to doubles. + if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") || + (op == "median") || (op == "stdev") || (op == "sstdev")) + { + transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble); + } + + if (op == "sum") { + // sum them up + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + buffer << setprecision (PRECISION) << total; + result.push_back(buffer.str()); + } + else if (op == "collapse") { + string collapse; + for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter + if (j>0) + collapse.append(","); + collapse.append(data[i][j]); + } + result.push_back(collapse); + } + else if (op == "distinct") { + string distinct; + // get the current column's data + vector<string> col_data = data[i]; + // remove duplicate entries from the vector + // http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector + sort( col_data.begin(), col_data.end() ); + col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() ); + + for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter + if (j>0) + distinct.append(","); + distinct.append(col_data[j]); + } + result.push_back(distinct); + } + else if (op == "concat") { + string concat; + for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter + concat.append(data[i][j]); + } + result.push_back(concat); + } + else if (op == "min") { + buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() ); + result.push_back(buffer.str()); + } + else if (op == "max") { + buffer << setprecision (PRECISION) << *max_element( dataF.begin(), dataF.end() ); + result.push_back(buffer.str()); + } + else if (op == "mean") { + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + double mean = total / dataF.size(); + buffer << setprecision (PRECISION) << mean; + result.push_back(buffer.str()); + } + else if (op == "median") { + double median = 0.0; + sort(dataF.begin(), dataF.end()); + int totalLines = dataF.size(); + if ((totalLines % 2) > 0) { + long mid; + mid = totalLines / 2; + median = dataF[mid]; + } + else { + long midLow, midHigh; + midLow = (totalLines / 2) - 1; + midHigh = (totalLines / 2); + median = (dataF[midLow] + dataF[midHigh]) / 2.0; + } + buffer << setprecision (PRECISION) << median; + result.push_back(buffer.str()); + } + else if (op == "count") { + buffer << setprecision (PRECISION) << data[i].size(); + result.push_back(buffer.str()); + } + else if ((op == "mode") || (op == "antimode") || + (op == "freqdesc") || (op == "freqasc")) { + // compute the frequency of each unique value + map<string, int> freqs; + vector<string>::const_iterator dIt = data[i].begin(); + vector<string>::const_iterator dEnd = data[i].end(); + for (; dIt != dEnd; ++dIt) { + freqs[*dIt]++; + } + + // grab the mode and the anti mode + string mode, antiMode; + int count = 0; + int minCount = INT_MAX; + for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { + if (iter->second > count) { + mode = iter->first; + count = iter->second; + } + if (iter->second < minCount) { + antiMode = iter->first; + minCount = iter->second; + } + } + // report + if (op == "mode") { + buffer << setprecision (PRECISION) << mode; + result.push_back(buffer.str()); + } + else if (op == "antimode") { + buffer << setprecision (PRECISION) << antiMode; + result.push_back(buffer.str()); + } + else if (op == "freqdesc" || op == "freqasc") { + // pair for the num times a values was + // observed (1) and the value itself (2) + pair<int, string> freqPair; + vector< pair<int, string> > freqList; + + // create a list of pairs of all the observed values (second) + // and their occurences (first) + map<string,int>::const_iterator mapIter = freqs.begin(); + map<string,int>::const_iterator mapEnd = freqs.end(); + for(; mapIter != mapEnd; ++mapIter) + freqList.push_back( make_pair(mapIter->second, mapIter->first) ); + + // sort the list of pairs in the requested order by the frequency + // this will make the value that was observed least/most bubble to the top + if (op == "freqdesc") + sort(freqList.begin(), freqList.end(), ValueGreaterThan()); + else if (op == "freqasc") + sort(freqList.begin(), freqList.end(), ValueLessThan()); + + // record all of the values and their frequencies. + vector< pair<int, string> >::const_iterator iter = freqList.begin(); + vector< pair<int, string> >::const_iterator iterEnd = freqList.end(); + for (; iter != iterEnd; ++iter) + buffer << iter->second << ":" << iter->first << ","; + result.push_back(buffer.str()); + } + } + else if (op == "stdev" || op == "sstdev") { + // get the mean + double total = accumulate(dataF.begin(), dataF.end(), 0.0); + double mean = total / dataF.size(); + // get the variance + double totalVariance = 0.0; + vector<double>::const_iterator dIt = dataF.begin(); + vector<double>::const_iterator dEnd = dataF.end(); + for (; dIt != dEnd; ++dIt) { + totalVariance += pow((*dIt - mean),2); + } + double variance = 0.0; + if (op == "stdev") { + variance = totalVariance / dataF.size(); + } + else if (op == "sstdev" && dataF.size() > 1) { + variance = totalVariance / (dataF.size() - 1); + } + double stddev = sqrt(variance); + // report + buffer << setprecision (PRECISION) << stddev; + result.push_back(buffer.str()); + } + } + for_each(group.begin(), group.end(), TabPrintPost); + cout << *result.begin(); + for_each(++result.begin(), result.end(), TabPrintPre); + cout << endl; //Gets rid of extraneous tab +} + + +void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum, const bool ignoreCase) { + try { + string s(fromList.at(index)); + if(ignoreCase) + transform(s.begin(),s.end(),s.begin(),::tolower); + toList.push_back(s); + } + catch(std::out_of_range& e) { + cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line " + << lineNum << ". Exiting." << endl << "*****" << endl; + exit(1); + } +} + + +float ToFloat (string element) { + return atof(element.c_str()); +} + +void TabPrintPost (string element) { + cout << element << "\t"; +} + +void TabPrintPre (string element) { + cout << "\t" << element; +} + +void CommaPrint (string element) { + cout << element << ","; +} + +double ToDouble(const string &element) { + std::istringstream i(element); + double x; + if (!(i >> x)) { + cerr << "Error: Could not properly convert string to numeric (\"" + element + "\")" << endl; + exit(1); + } + return x; +} + +inline string ColumnHeaderName(const vector<string> &inFields, const size_t FieldIndex, + bool InputHaveHeaderLine) +{ + stringstream s; + if (InputHaveHeaderLine) + s << inFields[FieldIndex-1]; + else + s << "col_" << (FieldIndex); + return s.str(); +} + +void PrintHeaderLine(const vector<string> &inFields, + const vector<int> &groupColumns, + const vector<int> &opColumns, + const vector<string> &ops, + const bool PrintFullInputLine, + const bool InputHaveHeaderLine) +{ + vector<string> header; + + //Header fields of input file + if (PrintFullInputLine) { + //All input columns + for (size_t i=0;i<inFields.size();++i) + header.push_back( ColumnHeaderName(inFields, i+1, InputHaveHeaderLine) ); + } else { + //Only the columns that are actually used in the grouped operations + for (size_t i=0;i<groupColumns.size();++i) + header.push_back( ColumnHeaderName(inFields, groupColumns[i], InputHaveHeaderLine) ); + } + + //Header fields of output columns, by operation + for (size_t i=0; i<opColumns.size();++i) { + stringstream s; + s << ops[i] << "(" << ColumnHeaderName(inFields, opColumns[i], InputHaveHeaderLine) << ")"; + header.push_back(s.str()); + } + + //print Header Line + for (size_t i=0; i<header.size();++i) + cout << header[i] << ((i<header.size()-1)?"\t":"\n"); +} -- GitLab