Commit 3cba5c6c authored by Aaron's avatar Aaron
Browse files

Large commit.

	A. Fixed massive bug in parseBedLine that disallowed BED features with start = 0.  Bad news.
	1. Moved ToString to lineFileUtilities
	2. Added tabFile class.  Works with gzip as well.
	3. Changed groupBy to use the new TabFile class.
	4. Added new collapse feature to groupBy.
parent e200d064
......@@ -14,7 +14,7 @@ export LIBS = -lz
# define our source subdirectories
SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/groupBy $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed
UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools
UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/tabFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools
all:
......
......@@ -5,7 +5,7 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/lineFileUtilities/
# ----------------------------------
# define our source and object files
......
......@@ -218,9 +218,6 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1
bedEntry = nullBed;
}
}
// close up
bed->Close();
writer->Close();
......
......@@ -6,14 +6,14 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/
INCLUDES = -I$(UTILITIES_DIR)/tabFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= groupBy.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o
_EXT_OBJECTS=tabFile.o lineFileUtilities.o gzstream.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= groupBy
......@@ -33,7 +33,7 @@ $(BUILT_OBJECTS): $(SOURCES)
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/tabFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
......
......@@ -21,6 +21,7 @@
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
using namespace std;
......@@ -32,12 +33,11 @@ using namespace std;
// function declarations
void ShowHelp(void);
void DetermineInput(string &inFile, const vector<int> &groupColumns, int opColumn, string op);
void GroupBy(istream &input, const vector<int> &groupColumns, int opColumn, string op);
void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op);
void ReportSummary(const vector<string> &group, const vector<string> &data, string op);
float ToFloat (string element);
void TabPrint (string element);
void CommaPrint (string element);
int main(int argc, char* argv[]) {
......@@ -110,7 +110,12 @@ int main(int argc, char* argv[]) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCol." << endl << "*****" << endl;
showHelp = true;
}
if ((op != "sum") && (op != "max") && (op != "min") && (op != "mean") &&
(op != "mode") && (op != "median") && (op != "antimode") && (op != "stdev") &&
(op != "count") && (op != "collapse")) {
cerr << endl << "*****" << endl << "*****ERROR: Invalid op selection. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
// Split the column string sent by the user into discrete column numbers
......@@ -120,7 +125,8 @@ int main(int argc, char* argv[]) {
int opColumnInt = atoi(opColumnString.c_str());
DetermineInput(inFile, groupColumnsInt, opColumnInt, op);
//DetermineInput(inFile, groupColumnsInt, opColumnInt, op);
GroupBy(inFile, groupColumnsInt, opColumnInt, op);
}
else {
ShowHelp();
......@@ -150,7 +156,7 @@ void ShowHelp(void) {
cerr << "\t-op\t" << "Specify the operation that should be applied to opCol." << endl;
cerr << "\t\tValid operations: sum, count, min, max, mean, median," << endl;
cerr << "\t\tmode, antimode, stdev" << endl;
cerr << "\t\tmode, antimode, stdev, collapse (i.e., print a comma separated list)" << endl;
cerr << "\t\tDefault: sum" << endl;
cerr << "Examples: " << endl;
......@@ -174,58 +180,63 @@ void ShowHelp(void) {
}
void DetermineInput(string &inFile, const vector<int> &groupColumns, int opColumn, string op) {
if (inFile != "stdin") { // process a file
ifstream in(inFile.c_str(), ios::in);
if ( !in ) {
cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
GroupBy(in, groupColumns, opColumn, op);
}
else GroupBy(cin, groupColumns, opColumn, op);
}
void GroupBy(istream &input, const vector<int> &groupColumns, int opColumn, string op) {
void GroupBy(const string &inFile, const vector<int> &groupColumns, int opColumn, string op) {
// current line number
int lineNum = 0;
// string representing current line
string inLine;
// vector of strings holding the tokenized current line
vector<string> inFields;
inFields.reserve(20);
// keys for the current and previous group
vector<string> prevGroup(0);
vector<string> currGroup(0);
// vector of the opColumn values for the current group
vector<string> values;
values.reserve(100000);
while (getline(input, inLine)) {
lineNum++;
Tokenize(inLine, inFields);
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt)
currGroup.push_back(inFields[*gIt-1]);
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(prevGroup, values, op);
values.clear();
values.push_back(inFields[opColumn-1].c_str());
}
else
values.push_back(inFields[opColumn-1].c_str());
prevGroup = currGroup;
inFields.clear();
}
// check the status of the current line
TabLineStatus tabLineStatus;
// open a new tab file, loop through it line by line
// and summarize the data for a given group when the group
// fields change
TabFile *_tab = new TabFile(inFile);
_tab->Open();
while ((tabLineStatus = _tab->GetNextTabLine(inFields, lineNum)) != TAB_INVALID) {
if (tabLineStatus == TAB_VALID) {
// build the group vector for the current line
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt) {currGroup.push_back(inFields[*gIt-1]);}
// group change
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(prevGroup, values, op);
values.clear();
values.push_back(inFields[opColumn-1].c_str());
}
// same group
else
values.push_back(inFields[opColumn-1].c_str());
// reset for the next line
prevGroup = currGroup;
inFields.clear();
}
}
// report the last group
values.clear();
values.push_back(inFields[opColumn-1].c_str());
ReportSummary(currGroup, values, op);
_tab->Close();
}
......@@ -240,6 +251,11 @@ void ReportSummary(const vector<string> &group, const vector<string> &data, stri
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << total << endl;
}
if (op == "collapse") {
for_each(group.begin(), group.end(), TabPrint);
for_each(data.begin(), data.end(), CommaPrint);
cout << endl;
}
else if (op == "min") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << *min_element( dataF.begin(), dataF.end() ) << endl;
......@@ -333,3 +349,7 @@ float ToFloat (string element) {
void TabPrint (string element) {
cout << element << "\t";
}
void CommaPrint (string element) {
cout << element << ",";
}
......@@ -10,6 +10,7 @@
Licensed under the GNU General Public License 2.0+ license.
******************************************************************************/
#include "bedFile.h"
#include "lineFileUtilities.h"
#include "BamAux.h"
namespace BamTools {
......
......@@ -53,6 +53,7 @@ const BINLEVEL _binLevels = 7;
// Bins 585-4680 span 128Kbp # Level 5
// Bins 4681-37449 span 16Kbp # Level 6
const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */
const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */
......@@ -79,7 +80,7 @@ struct BED {
CHRPOS end;
string name;
string score;
string strand;
string strand;
// Add'l fields for BED12 and/or custom BED annotations
vector<string> otherFields;
......@@ -221,16 +222,6 @@ int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
}
// templated function to convert objects to strings
template <typename T>
std::string ToString(const T & value)
{
std::stringstream ss;
ss << value;
return ss.str();
}
// Ancillary functions
void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks);
......@@ -337,20 +328,21 @@ private:
char *p2End, *p3End, *p4End, *p5End;
long l2, l3, l4, l5;
// bail out if we have a blank line
if (lineVector.size() == 0)
if (lineVector.size() == 0) {
return BED_BLANK;
}
if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
// we need at least 3 columns
if (lineVector.size() >= 3) {
// test if columns 2 and 3 are integers. If so, assume BED.
l2 = strtol(lineVector[1].c_str(), &p2End, 10);
l3 = strtol(lineVector[2].c_str(), &p3End, 10);
// strtol will set p2End or p3End to the start of the string if non-integral, base 10
if (p2End != lineVector[1].c_str() && p3End != lineVector[2].c_str()) {
setGff(false);
......@@ -390,7 +382,6 @@ private:
lineNum--;
return BED_HEADER;
}
// default
return BED_INVALID;
}
......@@ -401,7 +392,6 @@ private:
*/
template <typename T>
inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum) {
if ( (lineNum > 1) && (lineVector.size() == this->bedType)) {
bed.chrom = lineVector[0];
......@@ -411,7 +401,7 @@ private:
if (this->bedType == 4) {
bed.name = lineVector[3];
}
else if (this->bedType ==5) {
else if (this->bedType == 5) {
bed.name = lineVector[3];
bed.score = lineVector[4];
}
......@@ -433,7 +423,7 @@ private:
exit(1);
}
if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) {
if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end > 0)) {
return true;
}
else if (bed.start > bed.end) {
......@@ -476,7 +466,7 @@ private:
cerr << "Error: unexpected number of fields at line: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl;
exit(1);
}
if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) {
if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end > 0)) {
return true;
}
else if (bed.start > bed.end) {
......@@ -650,7 +640,6 @@ private:
return true;
}
else {
cout << this->bedType << " " << _isGff << endl;
cerr << "Error: unexpected number of fields at line: " << lineNum <<
". Verify that your files are TAB-delimited and that your GFF file has 9 fields. Exiting..." << endl;
exit(1);
......
......@@ -14,46 +14,37 @@
// lineFileUtilities:
// Common Functions
//***********************************************
void Tokenize(const string &str, vector<string> &tokens, const string &delimiter) {
/*
//method to tokenize on any whitespace
string buf; // Have a buffer string
stringstream ss(str); // Insert the string into a stream
while (ss >> buf)
//cout << buf << endl;
tokens.push_back(buf);
*/
// Skip delimiters at beginning.
string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// Find first "non-delimiter".
string::size_type pos = str.find_first_of(delimiter, lastPos);
while (string::npos != pos || string::npos != lastPos) {
// Found a token, add it to the vector.
tokens.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiter, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiter, lastPos);
}
}
void Tokenize(const string &str, vector<int> &tokens, const string &delimiter) {
// Skip delimiters at beginning.
string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// Find first "non-delimiter".
string::size_type pos = str.find_first_of(delimiter, lastPos);
while (string::npos != pos || string::npos != lastPos) {
// Found a token, add it to the vector.
tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str()));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiter, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiter, lastPos);
}
}
// void Tokenize(const string &str, vector<string> &tokens, const string &delimiter) {
// // Skip delimiters at beginning.
// string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// // Find first "non-delimiter".
// string::size_type pos = str.find_first_of(delimiter, lastPos);
//
// while (string::npos != pos || string::npos != lastPos) {
// // Found a token, add it to the vector.
// tokens.push_back(str.substr(lastPos, pos - lastPos));
// // Skip delimiters. Note the "not_of"
// lastPos = str.find_first_not_of(delimiter, pos);
// // Find next "non-delimiter"
// pos = str.find_first_of(delimiter, lastPos);
// }
// }
//
// void Tokenize(const string &str, vector<int> &tokens, const string &delimiter) {
// // Skip delimiters at beginning.
// string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// // Find first "non-delimiter".
// string::size_type pos = str.find_first_of(delimiter, lastPos);
//
// while (string::npos != pos || string::npos != lastPos) {
// // Found a token, add it to the vector.
// tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str()));
// // Skip delimiters. Note the "not_of"
// lastPos = str.find_first_not_of(delimiter, pos);
// // Find next "non-delimiter"
// pos = str.find_first_of(delimiter, lastPos);
// }
// }
......@@ -4,13 +4,56 @@
#include <vector>
#include <string>
#include <algorithm>
#include <sstream>
using namespace std;
// split a line from a file into a vector of strings. token = "\t"
void Tokenize(const string &str, vector<string>& tokens, const string &delimiter = "\t");
void Tokenize(const string &str, vector<int>& tokens, const string &delimiter = "\t");
//void Tokenize(const string &str, vector<string>& tokens, const string &delimiter = "\t");
//void Tokenize(const string &str, vector<int>& tokens, const string &delimiter = "\t");
// templated function to convert objects to strings
template <typename T>
inline
std::string ToString(const T & value) {
std::stringstream ss;
ss << value;
return ss.str();
}
inline
void Tokenize(const string &str, vector<string> &tokens, const string &delimiter = "\t") {
// Skip delimiters at beginning.
string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// Find first "non-delimiter".
string::size_type pos = str.find_first_of(delimiter, lastPos);
while (string::npos != pos || string::npos != lastPos) {
// Found a token, add it to the vector.
tokens.push_back(str.substr(lastPos, pos - lastPos));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiter, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiter, lastPos);
}
}
inline
void Tokenize(const string &str, vector<int> &tokens, const string &delimiter = "\t") {
// Skip delimiters at beginning.
string::size_type lastPos = str.find_first_not_of(delimiter, 0);
// Find first "non-delimiter".
string::size_type pos = str.find_first_of(delimiter, lastPos);
while (string::npos != pos || string::npos != lastPos) {
// Found a token, add it to the vector.
tokens.push_back(atoi(str.substr(lastPos, pos - lastPos).c_str()));
// Skip delimiters. Note the "not_of"
lastPos = str.find_first_not_of(delimiter, pos);
// Find next "non-delimiter"
pos = str.find_first_of(delimiter, lastPos);
}
}
#endif /* LINEFILEUTILITIES_H */
/*****************************************************************************
_tabFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0+ license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "tabFile.h"
/*******************************************
Class methods
*******************************************/
// Constructor
TabFile::TabFile(const string &tabFile)
: _tabFile(tabFile)
{}
// Destructor
TabFile::~TabFile(void) {
}
void TabFile::Open(void) {
if (_tabFile == "stdin") {
_tabStream = &cin;
}
else {
size_t foundPos;
foundPos = _tabFile.find_last_of(".gz");
// is this a GZIPPED TAB file?
if (foundPos == _tabFile.size() - 1) {
igzstream tabs(_tabFile.c_str(), ios::in);
if ( !tabs ) {
cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
else {
// if so, close it (this was just a test)
tabs.close();
// now set a pointer to the stream so that we
// can read the file later on.
_tabStream = new igzstream(_tabFile.c_str(), ios::in);
}
}
// not GZIPPED.
else {
ifstream tabs(_tabFile.c_str(), ios::in);
// can we open the file?
if ( !tabs ) {
cerr << "Error: The requested file (" << _tabFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
else {
// if so, close it (this was just a test)
tabs.close();
// now set a pointer to the stream so that we
// can read the file later on.
_tabStream = new ifstream(_tabFile.c_str(), ios::in);
}
}
}
}
// Close the TAB file
void TabFile::Close(void) {
if (_tabFile != "stdin") delete _tabStream;
}
TabLineStatus TabFile::GetNextTabLine(TAB_FIELDS &tabFields, int &lineNum) {
// make sure there are still lines to process.
// if so, tokenize, return the TAB_FIELDS.
if (_tabStream->good() == true) {
string tabLine;
tabFields.reserve(20);
// parse the tabStream pointer
getline(*_tabStream, tabLine);
lineNum++;
// split into a string vector.
Tokenize(tabLine,tabFields);
// parse the line and validate it.
return parseTabLine(tabFields, lineNum);
}
// default if file is closed or EOF
return TAB_INVALID;
}
/*****************************************************************************
tabFile.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licensed under the GNU General Public License 2.0+ license.
******************************************************************************/
#ifndef BEDFILE_H
#define BEDFILE_H
#include "gzstream.h"
#include <vector>
#include <map>
#include <set>
#include <string>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstring>