Commit bcffe100 authored by Aaron's avatar Aaron
Browse files

Added groupby back into the bedtools package.

parent 761d4a58
......@@ -29,6 +29,8 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/fastaFromBed \
$(SRC_DIR)/flankBed \
$(SRC_DIR)/genomeCoverageBed \
$(SRC_DIR)/getOverlap \
$(SRC_DIR)/groupBy \
$(SRC_DIR)/intersectBed \
$(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \
......@@ -36,7 +38,6 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/multiBamCov \
$(SRC_DIR)/multiIntersectBed \
$(SRC_DIR)/nucBed \
$(SRC_DIR)/getOverlap \
$(SRC_DIR)/pairToBed \
$(SRC_DIR)/pairToPair \
$(SRC_DIR)/shuffleBed \
......
......@@ -15,12 +15,13 @@ def main():
'bedpetobam': 'bedpeToBam',
'bedtobam': 'bedToBam',
'closest': 'closestBed',
'cluster': 'clusterBed',
'cluster': 'clusterBed',
'complement': 'complementBed',
'coverage': 'coverageBed',
'flank': 'flankBed',
'genomecov': 'genomeCoverageBed',
'getfasta': 'fastaFromBed',
'getfasta': 'fastaFromBed',
'groupby': 'groupBy',
'igv': 'bedToIgv',
'intersect': 'intersectBed',
'links': 'linksBed',
......
......@@ -48,6 +48,7 @@ int fastafrombed_main(int argc, char* argv[]);//
int flank_main(int argc, char* argv[]); //
int genomecoverage_main(int argc, char* argv[]);//
int getoverlap_main(int argc, char* argv[]);//
int groupby_main(int argc, char* argv[]);//
int intersect_main(int argc, char* argv[]); //
int links_main(int argc, char* argv[]);//
int maskfastafrombed_main(int argc, char* argv[]);//
......@@ -120,7 +121,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "igv") return bedtoigv_main(argc-1, argv+1);
else if (sub_cmd == "links") return links_main(argc-1, argv+1);
else if (sub_cmd == "makewindows") return windowmaker_main(argc-1, argv+1);
else if (sub_cmd == "groupby") return groupby_main(argc-1, argv+1);
// help
else if (sub_cmd == "-h" || sub_cmd == "--help" ||
sub_cmd == "-help")
......@@ -216,6 +217,7 @@ int bedtools_help(void)
cout << " igv " << "Create an IGV snapshot batch script.\n";
cout << " links " << "Create a HTML page of links to UCSC locations.\n";
cout << " makewindows " << "Make interval \"windows\" across a genome.\n";
cout << " groupby " << "Group by common cols. & summarize oth. cols. (~ SQL \"groupBy\")\n";
cout << endl;
cout << " -General help:\n";
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= groupBy.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=tabFile.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/tabFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
/*****************************************************************************
groupBy.cpp
(c) 2009, 2010, 2011 - Aaron Quinlan
Center for Public Health Genomics
University of Virginia
aaronquinlan@gmail.com
Licenced under the MIT license.
******************************************************************************/
#include <vector>
#include <map>
#include <numeric>
#include <algorithm>
#include <iterator>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include <exception>
#include <stdexcept> // out_of_range exception
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
using namespace std;
const int PRECISION = 21;
// define our program name
#define PROGRAM_NAME "bedtools groupby"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) ((strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen))
#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-')
struct ValueGreaterThan
{
bool operator()( const vector< pair<int, string> >::value_type& lhs,
const vector< pair<int, string> >::value_type& rhs ) const
{
return lhs.first > rhs.first;
}
};
struct ValueLessThan
{
bool operator()( const vector< pair<int, string> >::value_type& lhs,
const vector< pair<int, string> >::value_type& rhs ) const
{
return lhs.first < rhs.first;
}
};
// function declarations
void groupby_help(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool printOriginalLine, const bool printHeaderLine, const bool InputHaveHeaderLine, const bool ignoreCase);
void PrintHeaderLine(const vector<string> &InputFields, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops, const bool PrintFullInputLine, const bool InputHaveHeaderLine);
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops);
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum, const bool ignoreCase);
float ToFloat (string element);
double ToDouble(const string &element);
void TabPrintPost (string element);
void TabPrintPre (string element);
void CommaPrint (string element);
int groupby_main(int argc, char* argv[]) {
// input files
string inFile = "stdin";
string groupColumnsString = "1,2,3";
string opsColumnString;
string opsString;
// our configuration variables
bool showHelp = false;
bool haveOpColumns = false;
bool haveOps = true;
bool printOriginalLine = false;
bool printHeaderLine = false;
bool InputHaveHeaderLine = false;
bool ignoreCase = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) groupby_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -i parameter requires a value." << endl << endl;
groupby_help();
break;
}
else {
inFile = argv[i + 1];
i++;
}
}
else if (PARAMETER_CHECK("-grp", 4, parameterLength) || PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -grp parameter requires a value." << endl << endl;
groupby_help();
break;
}
else {
groupColumnsString = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-opCols", 7, parameterLength) || PARAMETER_CHECK("-c", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -opCols parameter requires a value." << endl << endl;
groupby_help();
break;
}
else {
haveOpColumns = true;
opsColumnString = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-ops", 4, parameterLength) || PARAMETER_CHECK("-o", 2, parameterLength)) {
if ((i+1) >= argc || LOOKS_LIKE_A_PARAM(argv[i+1])) {
cerr << endl << "*****ERROR: -ops parameter requires a value." << endl << endl;
groupby_help();
break;
}
else {
haveOps = true;
opsString = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-full", 5, parameterLength)) {
printOriginalLine = true;
}
else if(PARAMETER_CHECK("-outheader", 10, parameterLength)) {
printHeaderLine = true;
}
else if(PARAMETER_CHECK("-inheader", 9, parameterLength)) {
InputHaveHeaderLine = true;
}
else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
InputHaveHeaderLine = true;
printHeaderLine = true;
}
else if(PARAMETER_CHECK("-ignorecase", 11, parameterLength)) {
ignoreCase = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
if (!haveOpColumns) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCols." << endl << "*****" << endl;
showHelp = true;
}
// split the opsString into discrete operations and make sure they are all valid.
vector<string> ops;
opsString.erase(remove_if(opsString.begin(),opsString.end(),::isspace),opsString.end());
Tokenize(opsString, ops, ",");
for( size_t i = 0; i < ops.size(); i++ ) {
if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") &&
(ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse") && (ops[i] != "distinct") &&
(ops[i] != "concat") && (ops[i] != "freqdesc") && (ops[i] != "freqasc"))
{
cerr << endl << "*****" << endl << "*****ERROR: Invalid operation selection \"" << ops[i] << endl << "\" *****" << endl;
showHelp = true;
}
}
if (!showHelp) {
// Split the column string sent by the user into discrete column numbers
// A comma separated string is expected.
vector<int> groupColumnsInt;
Tokenize(groupColumnsString, groupColumnsInt, ",");
vector<int> opColumnsInt;
Tokenize(opsColumnString, opColumnsInt, ",");
// sanity check the group columns
for(size_t i = 0; i < groupColumnsInt.size(); ++i) {
int groupColumnInt = groupColumnsInt[i];
if (groupColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;
groupby_help();
}
}
// sanity check the op columns
for(size_t i = 0; i < opColumnsInt.size(); ++i) {
int opColumnInt = opColumnsInt[i];
if (opColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;
groupby_help();
}
}
// sanity check that there are equal number of opColumns and ops
if (ops.size() != opColumnsInt.size()) {
cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl;
groupby_help();
}
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops,
printOriginalLine, printHeaderLine, InputHaveHeaderLine,
ignoreCase);
}
else {
groupby_help();
}
return 0;
}
void groupby_help(void) {
cerr << "\nTool: bedtools intersect (aka intersectBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Summarizes a dataset column based upon" << endl;
cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl;
cerr << "Usage:\t " << PROGRAM_NAME << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl;
cerr << "\t " << "cat [FILE] | " << PROGRAM_NAME << " -g [group_column(s)] -c [op_column(s)] -o [ops] " << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-i\t\t" << "Input file. Assumes \"stdin\" if omitted." << endl << endl;
cerr << "\t-g -grp\t\t" << "Specify the columns (1-based) for the grouping." << endl;
cerr << "\t\t\tThe columns must be comma separated." << endl;
cerr << "\t\t\t- Default: 1,2,3" << endl << endl;
cerr << "\t-c -opCols\t" << "Specify the column (1-based) that should be summarized." << endl;
cerr << "\t\t\t- Required." << endl << endl;
cerr << "\t-o -ops\t\t" << "Specify the operation that should be applied to opCol." << endl;
cerr << "\t\t\tValid operations:" << endl;
cerr << "\t\t\t sum, count, min, max," << endl;
cerr << "\t\t\t mean, median, mode, antimode," << endl;
cerr << "\t\t\t stdev, sstdev (sample standard dev.)," << endl;
cerr << "\t\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl;
cerr << "\t\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl;
cerr << "\t\t\t concat (i.e., merge values into a single, non-delimited string), " << endl;
cerr << "\t\t\t freqdesc (i.e., print desc. list of values:freq)" << endl;
cerr << "\t\t\t freqasc (i.e., print asc. list of values:freq)" << endl;
cerr << "\t\t\t- Default: sum" << endl << endl;
cerr << "\t-full\t\t" << "Print all columns from input file." << endl;
cerr << "\t\t\tDefault: print only grouped columns." << endl << endl;
cerr << "\t-inheader\t" << "Input file has a header line - the first line will be ignored." << endl << endl ;
cerr << "\t-outheader\t" << "Print header line in the output, detailing the column names. " << endl;
cerr << "\t\t\tIf the input file has headers (-inheader), the output file" << endl;
cerr << "\t\t\twill use the input's column names." << endl;
cerr << "\t\t\tIf the input file has no headers, the output file" << endl;
cerr << "\t\t\twill use \"col_1\", \"col_2\", etc. as the column names." << endl << endl;
cerr << "\t-header\t\t" << "same as '-inheader -outheader'" << endl << endl;
cerr << "\t-ignorecase\t" << "Group values regardless of upper/lower case." << endl << endl;
cerr << "Examples: " << endl;
cerr << "\t$ cat ex1.out" << endl;
cerr << "\tchr1 10 20 A chr1 15 25 B.1 1000 ATAT" << endl;
cerr << "\tchr1 10 20 A chr1 25 35 B.2 10000 CGCG" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum" << endl;
cerr << "\tchr1 10 20 A 11000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max" << endl;
cerr << "\tchr1 10 20 A 11000 10000" << endl << endl;
cerr << "\t$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean" << endl;
cerr << "\tchr1 10 20 A B.1,B.2, 5500" << endl << endl;
cerr << "\t$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat" << endl;
cerr << "\tchr1 10 20 A ATATCGCG" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The input file/stream should be sorted/grouped by the -grp. columns" << endl;
cerr << "\t(2) If -i is unspecified, input is assumed to come from stdin." << endl << endl;
// end the program here
exit(1);
}
void GroupBy (const string &inFile,
const vector<int> &groupColumns,
const vector<int> &opColumns,
const vector<string> &ops,
const bool printOriginalLine,
const bool printHeaderLine,
const bool InputHaveHeaderLine,
const bool ignoreCase) {
// current line number
int lineNum = 0;
// string representing current line
string inLine;
// vector of strings holding the tokenized current line
vector<string> inFields;
vector<string> inFieldsFirstLineInGroup;
inFields.reserve(20);
// keys for the current and previous group
vector<string> prevGroup(0);
vector<string> currGroup(0);
// vector (one per column) of vector (one per value/column) of the opColumn values for the current group
vector< vector<string> > values;
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
}
bool first_line = true;
// check the status of the current line
TabLineStatus tabLineStatus;
// open a new tab file, loop through it line by line
// and summarize the data for a given group when the group
// fields change
TabFile *_tab = new TabFile(inFile);
_tab->Open();
while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
if (tabLineStatus == TAB_VALID) {
if (first_line) {
first_line = false;
if (printHeaderLine)
PrintHeaderLine(inFields, groupColumns, opColumns, ops,
printOriginalLine, InputHaveHeaderLine);
if (InputHaveHeaderLine) {
inFields.clear();
continue; // no need to process this line - it's the header
}
}
if (inFieldsFirstLineInGroup.empty()) //first line in file? - save it
inFieldsFirstLineInGroup = inFields;
// build the group vector for the current line
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt)
addValue(inFields, currGroup, (*gIt-1), lineNum, ignoreCase);
// there has been a group change
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:prevGroup, values, ops);
// reset and add the first value for the next group.
values.clear();
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase);
}
inFieldsFirstLineInGroup = inFields;
}
// we're still dealing with the same group
else {
for( size_t i = 0; i < opColumns.size(); i++ )
addValue(inFields, values[i], opColumns[i]-1, lineNum, ignoreCase);
}
// reset for the next line
prevGroup = currGroup;
}
inFields.clear();
}
// report the last group
ReportSummary(printOriginalLine?inFieldsFirstLineInGroup:currGroup, values, ops);
_tab->Close();
}
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) {
vector<string> result;
for( size_t i = 0; i < data.size(); i++ ) {
string op = ops[i];
std::stringstream buffer;
vector<double> dataF;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
(op == "median") || (op == "stdev") || (op == "sstdev"))
{
transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble);
}
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
buffer << setprecision (PRECISION) << total;
result.push_back(buffer.str());
}
else if (op == "collapse") {
string collapse;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
if (j>0)
collapse.append(",");
collapse.append(data[i][j]);
}
result.push_back(collapse);
}
else if (op == "distinct") {
string distinct;
// get the current column's data
vector<string> col_data = data[i];
// remove duplicate entries from the vector
// http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
sort( col_data.begin(), col_data.end() );
col_data.erase( unique( col_data.begin(), col_data.end() ), col_data.end() );
for( size_t j = 0; j < col_data.size(); j++ ) {//Ugly, but cannot use back_inserter
if (j>0)
distinct.append(",");
distinct.append(col_data[j]);
}
result.push_back(distinct);
}
else if (op == "concat") {
string concat;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
concat.append(data[i][j]);
}
result.push_back(concat);
}
else if (op == "min") {
buffer << setprecision (PRECISION) << *min_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "max") {
buffer << setprecision (PRECISION) << *max_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
buffer << setprecision (PRECISION) << mean;
result.push_back(buffer.str());
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
buffer << setprecision (PRECISION) << median;
result.push_back(buffer.str());
}
else if (op == "count") {
buffer << setprecision (PRECISION) << data[i].size();
result.push_back(buffer.str());
}
else if ((op == "mode") || (op == "antimode") ||
(op == "freqdesc") || (op == "freqasc")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data[i].begin();
vector<string>::const_iterator dEnd = data[i].end();
for (; dIt != dEnd; ++dIt) {