Commit 5d62d8f6 authored by Aaron's avatar Aaron
Browse files

First commit for unionBedGraphs and multi-column groupBy.

parent 8925af09
......@@ -11,9 +11,39 @@ export CXX = g++
export CXXFLAGS = -Wall -O2
export LIBS = -lz
SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/groupBy $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed
SUBDIRS = $(SRC_DIR)/bamToBed \
$(SRC_DIR)/bedToBam \
$(SRC_DIR)/bedToIgv \
$(SRC_DIR)/bed12ToBed6 \
$(SRC_DIR)/closestBed \
$(SRC_DIR)/complementBed \
$(SRC_DIR)/coverageBed \
$(SRC_DIR)/fastaFromBed \
$(SRC_DIR)/genomeCoverageBed \
$(SRC_DIR)/groupBy \
$(SRC_DIR)/intersectBed \
$(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \
$(SRC_DIR)/mergeBed \
$(SRC_DIR)/overlap \
$(SRC_DIR)/pairToBed \
$(SRC_DIR)/pairToPair \
$(SRC_DIR)/shuffleBed \
$(SRC_DIR)/slopBed \
$(SRC_DIR)/sortBed \
$(SRC_DIR)/subtractBed \
$(SRC_DIR)/unionBedGraphs \
$(SRC_DIR)/windowBed
UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/tabFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/fileType $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools
UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \
$(SRC_DIR)/utils/bedFile \
$(SRC_DIR)/utils/tabFile \
$(SRC_DIR)/utils/genomeFile \
$(SRC_DIR)/utils/gzstream \
$(SRC_DIR)/utils/fileType \
$(SRC_DIR)/utils/bedFilePE \
$(SRC_DIR)/utils/sequenceUtilities \
$(SRC_DIR)/utils/BamTools
all:
......@@ -33,7 +63,6 @@ all:
done
.PHONY: all
clean:
......
......@@ -159,7 +159,7 @@ void ShowHelp(void) {
cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
cerr << "Authors: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
cerr << " Gordon Assaf, CSHL" << endl << endl;
cerr << " Assaf Gordon, CSHL" << endl << endl;
cerr << "Summary: Compute the coverage of a feature file among a genome." << endl << endl;
......
/*****************************************************************************
overlap.cpp
groupBy.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
......@@ -21,6 +21,8 @@
#include <math.h>
#include <limits.h>
#include <string.h>
#include <exception>
#include <stdexcept> // out_of_range exception
#include "version.h"
#include "lineFileUtilities.h"
......@@ -36,11 +38,13 @@ using namespace std;
// function declarations
void ShowHelp(void);
void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op);
void ReportSummary(const vector<string> &group, const vector<string> &data, string op);
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops);
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops);
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum);
float ToFloat (string element);
double ToDouble(const string &element);
void TabPrint (string element);
void TabPrintPost (string element);
void TabPrintPre (string element);
void CommaPrint (string element);
int main(int argc, char* argv[]) {
......@@ -48,15 +52,15 @@ int main(int argc, char* argv[]) {
// input files
string inFile;
string groupColumnsString = "1,2,3";
string opColumnString;
string op = "sum";
string opsColumnString;
string opsString;
// our configuration variables
bool showHelp = false;
bool haveInFile = false;
bool haveGroupColumns = false;
bool haveOpColumn = false;
bool haveOp = true;
bool showHelp = false;
bool haveInFile = false;
bool haveGroupColumns = false;
bool haveOpColumns = false;
bool haveOps = true;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
......@@ -89,14 +93,14 @@ int main(int argc, char* argv[]) {
groupColumnsString = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-opCol", 6, parameterLength)) {
haveOpColumn = true;
opColumnString = argv[i + 1];
else if(PARAMETER_CHECK("-opCols", 7, parameterLength)) {
haveOpColumns = true;
opsColumnString = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-op", 3, parameterLength)) {
haveOp = true;
op = argv[i + 1];
else if(PARAMETER_CHECK("-ops", 4, parameterLength)) {
haveOps = true;
opsString = argv[i + 1];
i++;
}
else {
......@@ -110,16 +114,21 @@ int main(int argc, char* argv[]) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl;
showHelp = true;
}
if (!haveOpColumn) {
if (!haveOpColumns) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCol." << endl << "*****" << endl;
showHelp = true;
}
if ((op != "sum") && (op != "max") && (op != "min") && (op != "mean") &&
(op != "mode") && (op != "median") && (op != "antimode") && (op != "stdev") &&
(op != "count") && (op != "collapse")) {
cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection. " << endl << "*****" << endl;
showHelp = true;
}
// split the opsString into discrete operations and make sure they are all valid.
vector<string> ops;
Tokenize(opsString, ops, ",");
for( size_t i = 0; i < ops.size(); i++ ) {
if ((ops[i] != "sum") && (ops[i] != "max") && (ops[i] != "min") && (ops[i] != "mean") &&
(ops[i] != "mode") && (ops[i] != "median") && (ops[i] != "antimode") && (ops[i] != "stdev") &&
(ops[i] != "sstdev") && (ops[i] != "count") && (ops[i] != "collapse")) {
cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection \"" << ops[i] << endl << "*****" << endl;
showHelp = true;
}
}
if (!showHelp) {
// Split the column string sent by the user into discrete column numbers
......@@ -127,25 +136,33 @@ int main(int argc, char* argv[]) {
vector<int> groupColumnsInt;
Tokenize(groupColumnsString, groupColumnsInt, ",");
// sanity check the group columns
vector<int>::const_iterator gIt = groupColumnsInt.begin();
vector<int>::const_iterator gEnd = groupColumnsInt.end();
for (; gIt != gEnd; ++gIt) {
if (*gIt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
vector<int> opColumnsInt;
Tokenize(opsColumnString, opColumnsInt, ",");
// sanity check the group columns
for(size_t i = 0; i < groupColumnsInt.size(); ++i) {
int groupColumnInt = groupColumnsInt[i];
if (groupColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: group columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
}
}
// sanity check the op columns
int opColumnInt = atoi(opColumnString.c_str());
if (opColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
for(size_t i = 0; i < opColumnsInt.size(); ++i) {
int opColumnInt = opColumnsInt[i];
if (opColumnInt < 1) {
cerr << endl << "*****" << endl << "*****ERROR: op columns must be >=1. " << endl << "*****" << endl;
ShowHelp();
}
}
//DetermineInput(inFile, groupColumnsInt, opColumnInt, op);
GroupBy(inFile, groupColumnsInt, opColumnInt, op);
// sanity check that there are equal number of opColumns and ops
if (ops.size() != opColumnsInt.size()) {
cerr << endl << "*****" << endl << "*****ERROR: There must be equal number of ops and opCols. " << endl << "*****" << endl;
ShowHelp();
}
GroupBy(inFile, groupColumnsInt, opColumnsInt, ops);
}
else {
ShowHelp();
......@@ -201,7 +218,7 @@ void ShowHelp(void) {
}
void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op) {
void GroupBy(const string &inFile, const vector<int> &groupColumns, const vector<int> &opColumns, const vector<string> &ops) {
// current line number
int lineNum = 0;
......@@ -216,9 +233,11 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn
vector<string> prevGroup(0);
vector<string> currGroup(0);
// vector of the opColumn values for the current group
vector<string> values;
values.reserve(100000);
// vector (one per column) of vector (one per value/column) of the opColumn values for the current group
vector< vector<string> > values;
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
}
// check the status of the current line
TabLineStatus tabLineStatus;
......@@ -230,154 +249,177 @@ void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn
_tab->Open();
while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
if (tabLineStatus == TAB_VALID) {
// grab the number of columns in the line.
int lineWidth = inFields.size();
// sanity check the op column
if (opColumn > lineWidth) {
cerr << endl << "*****" << endl << "*****ERROR: op column exceeds the number of columns in file at line "
<< lineNum << ". Exiting." << endl << "*****" << endl;
exit(1);
}
// build the group vector for the current line
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt) {
if (*gIt > lineWidth) {
cerr << endl << "*****" << endl << "*****ERROR: group column exceeds the number of columns in file at line "
<< lineNum << ". Exiting." << endl << "*****" << endl;
exit(1);
}
currGroup.push_back(inFields[*gIt-1]);
}
for (; gIt != gEnd; ++gIt)
addValue(inFields, currGroup, (*gIt-1), lineNum);
// there has been a group change
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(prevGroup, values, op);
ReportSummary(prevGroup, values, ops);
// reset and add the first value for the next group.
values.clear();
values.push_back(inFields[opColumn-1].c_str());
for( size_t i = 0; i < opColumns.size(); i++ ) {
values.push_back( vector<string>() );
addValue(inFields, values[i], opColumns[i]-1, lineNum);
}
}
// we're still dealing with the same group
else
values.push_back(inFields[opColumn-1].c_str());
else {
for( size_t i = 0; i < opColumns.size(); i++ )
addValue(inFields, values[i], opColumns[i]-1, lineNum);
}
// reset for the next line
prevGroup = currGroup;
inFields.clear();
}
inFields.clear();
}
// report the last group
ReportSummary(currGroup, values, op);
ReportSummary(currGroup, values, ops);
_tab->Close();
}
void ReportSummary(const vector<string> &group, const vector<string> &data, string op) {
void ReportSummary(const vector<string> &group, const vector<vector<string> > &data, const vector<string> &ops) {
vector<double> dataF;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
(op == "median") || (op == "stdev")) {
transform(data.begin(), data.end(), back_inserter(dataF), ToDouble);
}
vector<string> result;
for( size_t i = 0; i < data.size(); i++ ) {
string op = ops[i];
std::stringstream buffer;
vector<double> dataF;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if ((op == "sum") || (op == "max") || (op == "min") || (op == "mean") ||
(op == "median") || (op == "stdev") || (op == "sstdev"))
{
transform(data[i].begin(), data[i].end(), back_inserter(dataF), ToDouble);
}
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << total << endl;
}
if (op == "collapse") {
for_each(group.begin(), group.end(), TabPrint);
for_each(data.begin(), data.end(), CommaPrint);
cout << endl;
}
else if (op == "min") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << *min_element( dataF.begin(), dataF.end() ) << endl;
}
else if (op == "max") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << *max_element( dataF.begin(), dataF.end() ) << endl;
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << mean << endl;
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << median << endl;
}
else if (op == "count") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << data.size() << endl;
}
else if ((op == "mode") || (op == "antimode")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data.begin();
vector<string>::const_iterator dEnd = data.end();
for (; dIt != dEnd; ++dIt) {
freqs[*dIt]++;
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
buffer << setprecision (7) << total;
result.push_back(buffer.str());
}
// grab the mode and the anti mode
string mode, antiMode;
int count = 0;
int minCount = INT_MAX;
for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
if (iter->second > count) {
mode = iter->first;
count = iter->second;
}
if (iter->second < minCount) {
antiMode = iter->first;
minCount = iter->second;
}
}
// report
for_each(group.begin(), group.end(), TabPrint);
if (op == "mode")
cout << setprecision (7) << mode << endl;
else if (op == "antimode")
cout << setprecision (7) << antiMode << endl;
}
else if (op == "stdev") {
// get the mean
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
// get the variance
double totalVariance = 0.0;
vector<double>::const_iterator dIt = dataF.begin();
vector<double>::const_iterator dEnd = dataF.end();
for (; dIt != dEnd; ++dIt) {
totalVariance += pow((*dIt - mean),2);
if (op == "collapse") {
string collapse;
for( size_t j = 0; j < data[i].size(); j++ ) {//Ugly, but cannot use back_inserter
collapse.append(data[i][j]);
collapse.append(",");
}
result.push_back(collapse);
}
else if (op == "min") {
buffer << setprecision (7) << *min_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "max") {
buffer << setprecision (7) << *max_element( dataF.begin(), dataF.end() );
result.push_back(buffer.str());
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
buffer << setprecision (7) << mean;
result.push_back(buffer.str());
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
buffer << setprecision (7) << median;
result.push_back(buffer.str());
}
else if (op == "count") {
buffer << setprecision (7) << data[i].size();
result.push_back(buffer.str());
}
else if ((op == "mode") || (op == "antimode")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data[i].begin();
vector<string>::const_iterator dEnd = data[i].end();
for (; dIt != dEnd; ++dIt) {
freqs[*dIt]++;
}
// grab the mode and the anti mode
string mode, antiMode;
int count = 0;
int minCount = INT_MAX;
for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
if (iter->second > count) {
mode = iter->first;
count = iter->second;
}
if (iter->second < minCount) {
antiMode = iter->first;
minCount = iter->second;
}
}
// report
if (op == "mode") {
buffer << setprecision (7) << mode;
result.push_back(buffer.str());
}
else if (op == "antimode") {
buffer << setprecision (7) << antiMode;
result.push_back(buffer.str());
}
}
else if (op == "stdev" || op == "sstdev") {
// get the mean
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
// get the variance
double totalVariance = 0.0;
vector<double>::const_iterator dIt = dataF.begin();
vector<double>::const_iterator dEnd = dataF.end();
for (; dIt != dEnd; ++dIt) {
totalVariance += pow((*dIt - mean),2);
}
double variance = 0.0;
if (op == "stdev") {
variance = totalVariance / dataF.size();
}
else if (op == "sstdev" && dataF.size() > 1) {
variance = totalVariance / (dataF.size() - 1);
}
double stddev = sqrt(variance);
// report
buffer << setprecision (7) << stddev;
result.push_back(buffer.str());
}
double variance = totalVariance / dataF.size();
double stddev = sqrt(variance);
// report
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << stddev << endl;
}
for_each(group.begin(), group.end(), TabPrintPost);
cout << *result.begin();
for_each(++result.begin(), result.end(), TabPrintPre);
cout << endl; //Gets rid of extraneous tab
}
void addValue (const vector<string> &fromList, vector<string> &toList, int index, int lineNum) {
try {
toList.push_back(fromList.at(index));
}
catch(std::out_of_range& e) {
cerr << endl << "*****" << endl << "*****ERROR: requested column exceeds the number of columns in file at line "
<< lineNum << ". Exiting." << endl << "*****" << endl;
exit(1);
}
}
......@@ -386,10 +428,14 @@ float ToFloat (string element) {
return atof(element.c_str());
}
void TabPrint (string element) {
void TabPrintPost (string element) {
cout << element << "\t";
}
void TabPrintPre (string element) {
cout << "\t" << element;
}
void CommaPrint (string element) {
cout << element << ",";
}
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedGraphFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/version/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= unionBedGraphs.cpp unionBedGraphsMain.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedGraphFile.o genomeFile.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= unionBedGraphs
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedGraphFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*