Commit 761d4a58 authored by Aaron's avatar Aaron
Browse files

added cluster tool. both cluster and merge require sorted files for efficiency.

parent c045c254
......@@ -23,6 +23,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/bedToIgv \
$(SRC_DIR)/bed12ToBed6 \
$(SRC_DIR)/closestBed \
$(SRC_DIR)/clusterBed \
$(SRC_DIR)/complementBed \
$(SRC_DIR)/coverageBed \
$(SRC_DIR)/fastaFromBed \
......
......@@ -14,7 +14,8 @@ def main():
'bed12tobed6': 'bed12ToBed6',
'bedpetobam': 'bedpeToBam',
'bedtobam': 'bedToBam',
'closest': 'closestBed',
'closest': 'closestBed',
'cluster': 'clusterBed',
'complement': 'complementBed',
'coverage': 'coverageBed',
'flank': 'flankBed',
......
......@@ -41,6 +41,7 @@ int bedtobam_main(int argc, char* argv[]);//
int bedtoigv_main(int argc, char* argv[]);//
int bedpetobam_main(int argc, char* argv[]);//
int closest_main(int argc, char* argv[]); //
int cluster_main(int argc, char* argv[]); //
int complement_main(int argc, char* argv[]);//
int coverage_main(int argc, char* argv[]); //
int fastafrombed_main(int argc, char* argv[]);//
......@@ -82,6 +83,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "coverage") return coverage_main(argc-1, argv+1);
else if (sub_cmd == "genomecov") return genomecoverage_main(argc-1, argv+1);
else if (sub_cmd == "merge") return merge_main(argc-1, argv+1);
else if (sub_cmd == "cluster") return cluster_main(argc-1, argv+1);
else if (sub_cmd == "complement") return complement_main(argc-1, argv+1);
else if (sub_cmd == "subtract") return subtract_main(argc-1, argv+1);
else if (sub_cmd == "slop") return slop_main(argc-1, argv+1);
......@@ -171,6 +173,7 @@ int bedtools_help(void)
cout << " coverage " << "Compute the coverage over defined intervals.\n";
cout << " genomecov " << "Compute the coverage over an entire genome.\n";
cout << " merge " << "Combine overlapping/nearby intervals into a single interval.\n";
cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n";
cout << " complement " << "Extract intervals _not_ represented by an interval file.\n";
cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n";
cout << " slop " << "Adjust the size of intervals.\n";
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= clusterMain.cpp clusterBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
\ No newline at end of file
/*****************************************************************************
clusterBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "clusterBed.h"
// = Constructor =
BedCluster::BedCluster(string &bedFile,
int maxDistance,
bool forceStrand)
:
_bedFile(bedFile),
_forceStrand(forceStrand),
_maxDistance(maxDistance)
{
_bed = new BedFile(bedFile);
if (_forceStrand == false)
ClusterBed();
else
ClusterBedStranded();
}
// = Destructor =
BedCluster::~BedCluster(void)
{}
// = Cluster overlapping or nearby BED entries =
void BedCluster::ClusterBed() {
uint32_t cluster_id = 0;
BED prev, curr;
int end = -1;
_bed->Open();
while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
if (_bed->_status != BED_VALID)
continue;
// new cluster, no overlap
if ( (((int) curr.start - end) > _maxDistance) ||
(curr.chrom != prev.chrom)
)
{
cluster_id++;
end = curr.end;
}
prev = curr;
_bed->reportBedTab(curr);
printf("%d\n", cluster_id);
}
}
// = Cluster overlapping BED entries, accounting for strandedness =
void BedCluster::ClusterBedStranded() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
// make a list of the two strands to merge separately.
vector<string> strands(2);
strands[0] = "+";
strands[1] = "-";
uint32_t cluster_id = 0;
// do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) {
// cluster overlapping features for this chromosome.
int end = -1;
BED prev;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit
// is not on the current strand.
if (bedItr->strand != strands[s])
continue;
// new cluster, no overlap
if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0))
{
cluster_id++;
end = bedItr->end;
}
// same cluster, overlaps
else {
if ((int) bedItr->end > end)
end = bedItr->end;
}
prev = *bedItr;
_bed->reportBedTab(prev);
printf("%d\n", cluster_id);
}
}
}
}
/*****************************************************************************
clusterBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <fstream>
#include <limits.h>
#include <stdlib.h>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedCluster {
public:
// constructor
BedCluster(string &bedFile, int maxDistance, bool forceStrand);
// destructor
~BedCluster(void);
// find clusters
void ClusterBed();
// find clusters based on strand
void ClusterBedStranded();
private:
string _bedFile;
bool _forceStrand;
int _maxDistance;
// instance of a bed file class.
BedFile *_bed;
};
/*****************************************************************************
clusterMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "clusterBed.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools cluster"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void cluster_help(void);
int cluster_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile = "stdin";
int maxDistance = 0;
// input arguments
bool haveBed = true;
bool haveMaxDistance = false;
bool forceStrand = false;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) cluster_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
bedFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
if ((i+1) < argc) {
haveMaxDistance = true;
maxDistance = atoi(argv[i + 1]);
i++;
}
}
else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
forceStrand = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedCluster *bc = new BedCluster(bedFile, maxDistance, forceStrand);
delete bc;
}
else {
cluster_help();
}
return 0;
}
void cluster_help(void) {
cerr << "\nTool: bedtools cluster" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Clusters overlapping/nearby BED/GFF/VCF intervals." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl;
cerr << "\t\tthat are the same strand." << endl;
cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl;
cerr << "\t\tto be merged." << endl;
cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
cerr << "\t will in BED format with zero-based starts" << endl << endl;
// end the program here
exit(1);
}
......@@ -241,54 +241,47 @@ void BedMerge::ReportStranded(string chrom, int start, int end,
// = Merge overlapping BED entries into a single entry =
// =====================================================
void BedMerge::MergeBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
int mergeCount = 1;
vector<string> names;
vector<string> scores;
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// new block, no overlap
if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
if (start >= 0) {
Report(chrom, start, end, names, scores, mergeCount);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = bedItr->start;
end = bedItr->end;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
}
// same block, overlaps
else {
if ((int) bedItr-> end > end) end = bedItr->end;
mergeCount++;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
int mergeCount = 1;
vector<string> names;
vector<string> scores;
int start = -1;
int end = -1;
BED prev, curr;
_bed->Open();
while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
if (_bed->_status != BED_VALID)
continue;
// new block, no overlap
if ( (((int) curr.start - end) > _maxDistance) || (curr.chrom != prev.chrom)) {
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = curr.start;
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
}
if (start >= 0) {
Report(chrom, start, end, names, scores, mergeCount);
// same block, overlaps
else {
if ((int) curr.end > end)
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
mergeCount++;
}
prev = curr;
}
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
}
}
......
......@@ -242,7 +242,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
_prev_start = bed.start;
}
else if (forceSorted) {
cerr << "ERROR: input file: (" << bedFile << ") is not sorted by chrom then start" << endl;
cerr << "ERROR: input file: (" << bedFile
<< ") is not sorted by chrom then start." << endl
<< " The start coordinate at line " << _lineNum
<< " is less than the start at line " << _lineNum-1
<< endl;
exit(1);
}
}
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment