Commit 484419a9 authored by Aaron's avatar Aaron
Browse files

Minor stylistic tweaks.

parent 9cb00210
...@@ -12,7 +12,7 @@ export CXXFLAGS = -Wall -O2 ...@@ -12,7 +12,7 @@ export CXXFLAGS = -Wall -O2
export LIBS = -lz export LIBS = -lz
# define our source subdirectories # define our source subdirectories
SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed SUBDIRS = $(SRC_DIR)/bamToBed $(SRC_DIR)/bedToBam $(SRC_DIR)/bedToIgv $(SRC_DIR)/bed12ToBed6 $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/groupBy $(SRC_DIR)/intersectBed $(SRC_DIR)/linksBed $(SRC_DIR)/maskFastaFromBed $(SRC_DIR)/mergeBed $(SRC_DIR)/overlap $(SRC_DIR)/pairToBed $(SRC_DIR)/pairToPair $(SRC_DIR)/shuffleBed $(SRC_DIR)/slopBed $(SRC_DIR)/sortBed $(SRC_DIR)/subtractBed $(SRC_DIR)/windowBed
UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/genomeFile $(SRC_DIR)/utils/gzstream $(SRC_DIR)/utils/bedFilePE $(SRC_DIR)/utils/sequenceUtilities $(SRC_DIR)/utils/BamTools
......
...@@ -226,29 +226,28 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s ...@@ -226,29 +226,28 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s
if (session != "none") if (session != "none")
cout << "load " << session << endl; cout << "load " << session << endl;
// process each BED entry and convert to an IGV request
BED bedEntry, nullBed; BED bedEntry, nullBed;
int lineNum = 0; int lineNum = 0;
BedLineStatus bedStatus; BedLineStatus bedStatus;
bed->Open(); bed->Open();
// process each BED entry and convert to an IGV request
while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
if (bedStatus == BED_VALID) { if (bedStatus == BED_VALID) {
string locus, filename; string locus, filename;
if (useNames == false) { locus = bedEntry.chrom + ":" + ToString(bedEntry.start - slop) + "-" + ToString(bedEntry.end + slop);
locus = bedEntry.chrom + ":" + ToString(bedEntry.start - slop) + "-" + ToString(bedEntry.end + slop);
if (useNames == false)
filename = locus; filename = locus;
}
else { else {
locus = bedEntry.chrom + ":" + ToString(bedEntry.start - slop) + "-" + ToString(bedEntry.end + slop);
if (bedEntry.name.empty() == false) if (bedEntry.name.empty() == false)
filename = bedEntry.name; filename = bedEntry.name;
else { else {
cerr << "Error: You requested that filenames be based upon the name field. However, it appears to be empty. Exiting!" << endl; cerr << "Error: You requested that filenames be based upon the name field. However, it appears to be empty. Exiting!" << endl;
exit (1); exit (1);
} }
} }
// goto // goto
cout << "goto " << locus << endl; cout << "goto " << locus << endl;
...@@ -264,6 +263,7 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s ...@@ -264,6 +263,7 @@ void ProcessBed(istream &bedInput, BedFile *bed, string path, string sortType, s
// snapshot // snapshot
cout << "snapshot " << filename << "." << imageType << endl; cout << "snapshot " << filename << "." << imageType << endl;
// reset
bedEntry = nullBed; bedEntry = nullBed;
} }
} }
......
...@@ -32,7 +32,7 @@ int main(int argc, char* argv[]) { ...@@ -32,7 +32,7 @@ int main(int argc, char* argv[]) {
// input files // input files
string bedFile; string bedFile;
string genomeFile; string genomeFile;
int max = 999999999; int max = INT_MAX;
bool haveBed = false; bool haveBed = false;
bool bamInput = false; bool bamInput = false;
......
/*****************************************************************************
overlap.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0+ license.
******************************************************************************/
#include <vector>
#include <map>
#include <numeric>
#include <iterator>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <stdlib.h>
#include <math.h>
#include "version.h"
#include "lineFileUtilities.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "groupBy"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void ShowHelp(void);
void DetermineInput(string &inFile, const vector<int> &groupColumns, int opColumn, string op);
void GroupBy(istream &input, const vector<int> &groupColumns, int opColumn, string op);
void ReportSummary(const vector<string> &group, const vector<string> &data, string op);
float ToFloat (string element);
void TabPrint (string element);
int main(int argc, char* argv[]) {
// input files
string inFile;
string groupColumnsString = "1,2,3";
string opColumnString;
string op = "sum";
// our configuration variables
bool showHelp = false;
bool haveInFile = false;
bool haveGroupColumns = false;
bool haveOpColumn = false;
bool haveOp = true;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
haveInFile = true;
inFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-grp", 4, parameterLength)) {
haveGroupColumns = true;
groupColumnsString = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-opCol", 6, parameterLength)) {
haveOpColumn = true;
opColumnString = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-op", 3, parameterLength)) {
haveOp = true;
op = argv[i + 1];
i++;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have an input files
if (!haveInFile) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i file. " << endl << "*****" << endl;
showHelp = true;
}
if (!haveOpColumn) {
cerr << endl << "*****" << endl << "*****ERROR: Need -opCol." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
// Split the column string sent by the user into discrete column numbers
// A comma separated string is expected.
vector<int> groupColumnsInt;
Tokenize(groupColumnsString, groupColumnsInt, ",");
int opColumnInt = atoi(opColumnString.c_str());
DetermineInput(inFile, groupColumnsInt, opColumnInt, op);
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
cerr << "Summary: Summarizes a dataset column based upon" << endl;
cerr << "\t common column groupings. Akin to the SQL \"group by\" command." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input> -opCol <> " << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-i\t" << "Input file. Use \"stdin\" for pipes." << endl << endl;
cerr << "\t-grp\t" << "Specify the columns (1-based) for the grouping." << endl;
cerr << "\t\tThe columns must be comma separated." << endl;
cerr << "\t\tDefault: 1,2,3" << endl << endl;
cerr << "\t-opCol\t" << "Specify the column (1-based) that should be summarized." << endl;
cerr << "\t\tRequired." << endl << endl;
cerr << "\t-op\t" << "Specify the operation that should be applied to opCol." << endl;
cerr << "\t\tValid operations: sum, count, min, max, mean, median," << endl;
cerr << "\t\tmode, antimode, stdev" << endl;
cerr << "\t\tDefault: sum" << endl;
cerr << "Examples: " << endl;
cerr << "\t$ cat test.out" << endl;
cerr << "\tchr1 10 20 A chr1 15 25 1000" << endl;
cerr << "\tchr1 10 20 A chr1 25 35 10000" << endl << endl;
cerr << "\t$ cat test.out | groupBy -i stdin -grpCols 1,2,3,4 -opCol 8 -op sum" << endl;
cerr << "\tchr1 10 20 A 11000" << endl << endl;
cerr << "\t$ cat test.out | groupBy -i stdin -grpCols 1,2,3,4 -opCol 8 -op max" << endl;
cerr << "\tchr1 10 20 A 1000" << endl << endl;
cerr << "\t$ cat test.out | groupBy -i stdin -grpCols 1,2,3,4 -opCol 8 -op mean" << endl;
cerr << "\tchr1 10 20 A 5500" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The input file/stream should be sorted/grouped by the -grpCols." << endl << endl;
// end the program here
exit(1);
}
void DetermineInput(string &inFile, const vector<int> &groupColumns, int opColumn, string op) {
if (inFile != "stdin") { // process a file
ifstream in(inFile.c_str(), ios::in);
if ( !in ) {
cerr << "Error: The requested input file (" << inFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
GroupBy(in, groupColumns, opColumn, op);
}
else GroupBy(cin, groupColumns, opColumn, op);
}
void GroupBy(istream &input, const vector<int> &groupColumns, int opColumn, string op) {
int lineNum = 0;
string inLine;
vector<string> inFields;
vector<string> prevGroup(0);
vector<string> currGroup(0);
vector<string> values;
values.reserve(100000);
while (getline(input, inLine)) {
lineNum++;
Tokenize(inLine, inFields);
currGroup.clear();
vector<int>::const_iterator gIt = groupColumns.begin();
vector<int>::const_iterator gEnd = groupColumns.end();
for (; gIt != gEnd; ++gIt)
currGroup.push_back(inFields[*gIt-1]);
if ((currGroup != prevGroup) && (prevGroup.size() > 0)) {
// Summarize this group
ReportSummary(prevGroup, values, op);
values.clear();
values.push_back(inFields[opColumn-1].c_str());
}
else
values.push_back(inFields[opColumn-1].c_str());
prevGroup = currGroup;
inFields.clear();
}
values.clear();
values.push_back(inFields[opColumn-1].c_str());
ReportSummary(currGroup, values, op);
}
void ReportSummary(const vector<string> &group, const vector<string> &data, string op) {
vector<double> dataF;
// convert to floats
transform(data.begin(), data.end(), back_inserter(dataF), ToFloat);
if (op == "sum") {
// sum them up
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << total << endl;
}
else if (op == "min") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << *min_element( dataF.begin(), dataF.end() ) << endl;
}
else if (op == "max") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << *max_element( dataF.begin(), dataF.end() ) << endl;
}
else if (op == "mean") {
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << mean << endl;
}
else if (op == "median") {
double median = 0.0;
sort(dataF.begin(), dataF.end());
int totalLines = dataF.size();
if ((totalLines % 2) > 0) {
long mid;
mid = totalLines / 2;
median = dataF[mid];
}
else {
long midLow, midHigh;
midLow = (totalLines / 2) - 1;
midHigh = (totalLines / 2);
median = (dataF[midLow] + dataF[midHigh]) / 2.0;
}
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << median << endl;
}
else if (op == "count") {
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << data.size() << endl;
}
else if ((op == "mode") || (op == "antimode")) {
// compute the frequency of each unique value
map<string, int> freqs;
vector<string>::const_iterator dIt = data.begin();
vector<string>::const_iterator dEnd = data.end();
for (; dIt != dEnd; ++dIt) {
freqs[*dIt]++;
}
// grab the mode and the anti mode
string mode, antiMode;
int count = 0;
int minCount = INT_MAX;
for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
if (iter->second > count) {
mode = iter->first;
count = iter->second;
}
if (iter->second < minCount) {
antiMode = iter->first;
minCount = iter->second;
}
}
// report
for_each(group.begin(), group.end(), TabPrint);
if (op == "mode")
cout << setprecision (7) << mode << endl;
else if (op == "antimode")
cout << setprecision (7) << antiMode << endl;
}
else if (op == "stdev") {
// get the mean
double total = accumulate(dataF.begin(), dataF.end(), 0.0);
double mean = total / dataF.size();
// get the variance
double totalVariance = 0.0;
vector<double>::const_iterator dIt = dataF.begin();
vector<double>::const_iterator dEnd = dataF.end();
for (; dIt != dEnd; ++dIt) {
totalVariance += pow((*dIt - mean),2);
}
double variance = totalVariance / dataF.size();
double stddev = sqrt(variance);
// report
for_each(group.begin(), group.end(), TabPrint);
cout << setprecision (7) << stddev << endl;
}
}
float ToFloat (string element) {
return atof(element.c_str());
}
void TabPrint (string element) {
cout << element << "\t";
}
...@@ -124,7 +124,7 @@ void ShowHelp(void) { ...@@ -124,7 +124,7 @@ void ShowHelp(void) {
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input> -cols s1,e1,s2,e2 " << endl << endl; cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input> -cols s1,e1,s2,e2 " << endl << endl;
cerr << "Notes: " << endl; cerr << "Options: " << endl;
cerr << "\t-i\t" << "Input file. Use \"stdin\" for pipes." << endl << endl; cerr << "\t-i\t" << "Input file. Use \"stdin\" for pipes." << endl << endl;
cerr << "\t-cols\t" << "Specify the columns (1-based) for the starts and ends of the" << endl; cerr << "\t-cols\t" << "Specify the columns (1-based) for the starts and ends of the" << endl;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment