diff --git a/src/closestBed/Makefile b/src/closestBed/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..abe0eee0a0d1a612afe17c1310a535b16a0031a5 --- /dev/null +++ b/src/closestBed/Makefile @@ -0,0 +1,44 @@ +CXX = g++ +CXXFLAGS = -O3 +LDFLAGS = + +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= closestMain.cpp closestBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= closestBed +LIBS= + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/closestBed/closestBed.cpp b/src/closestBed/closestBed.cpp new file mode 100755 index 0000000000000000000000000000000000000000..34725f56806887050db57f59f942cdb0edc1fff1 --- /dev/null +++ b/src/closestBed/closestBed.cpp @@ -0,0 +1,261 @@ +// +// closestBed.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Looks for the closest features in two BED files. +// + +/* + Includes +*/ +#include "closestBed.h" + +const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. + // We don't want to keep looking if we + // can't find a nearby feature within 512 Mb. +const int SLOPGROWTH = 2048000; + + +/* + Constructor +*/ +BedClosest::BedClosest(string &bedAFile, string &bedBFile) { + + this->bedAFile = bedAFile; + this->bedBFile = bedBFile; + + this->bedA = new BedFile(bedAFile); + this->bedB = new BedFile(bedBFile); +} + +/* + Destructor +*/ +BedClosest::~BedClosest(void) { +} + + + +/* + reportA + + Writes the _original_ BED entry for A. + Works for BED3 - BED6. +*/ +void BedClosest::reportA(const BED &a) { + + if (bedA->bedType == 3) { + cout << a.chrom << "\t" << a.start << "\t" << a.end; + } + else if (bedA->bedType == 4) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name; + } + else if (bedA->bedType == 5) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score; + } + else if (bedA->bedType == 6) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score << "\t" << a.strand; + } +} + + + +/* + reportB + + Writes the _original_ BED entry for B. + Works for BED3 - BED6. +*/ +void BedClosest::reportB(const BED &b) { + if (bedB->bedType == 3) { + cout << b.chrom << "\t" << b.start << "\t" << b.end; + } + else if (bedB->bedType == 4) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name; + } + else if (bedB->bedType == 5) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name << "\t" << b.score; + } + else if (bedB->bedType == 6) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name << "\t" << b.score << "\t" << b.strand; + } +} + + + + +/* + reportNullB + + Writes a NULL B entry for cases where no closest BED was found + Works for BED3 - BED6. +*/ +void BedClosest::reportNullB() { + if (bedB->bedType == 3) { + cout << "none" << "\t" << "-1" << "\t" << "-1"; + } + else if (bedB->bedType == 4) { + cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t" + << "-1"; + } + else if (bedB->bedType == 5) { + cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t" + << "-1" << "\t" << "-1"; + } + else if (bedB->bedType == 6) { + cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t" + << "-1" << "\t" << "-1" << "\t" << "-1"; + } +} + + + + +void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { + + int slop = SLOPGROWTH; // start out just looking for overlaps + // within the current bin (~128Kb) + + // update the current feature's start and end + + int aFudgeStart = 0; + int aFudgeEnd; + int numOverlaps = 0; + BED closestB; + float maxOverlap = 0; + int minDistance = 999999999; + + + if(bedB->bedMap.find(a.chrom) != bedB->bedMap.end()) { + + while ((numOverlaps == 0) && (slop <= MAXSLOP)) { + + if ((a.start - slop) > 0) { + aFudgeStart = a.start - slop; + } + else { + aFudgeEnd; + } + aFudgeEnd = a.end + slop; + + bedB->binKeeperFind(bedB->bedMap[a.chrom], aFudgeStart, aFudgeEnd, hits); + + for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + numOverlaps++; + + // do the actual features overlap? + int s = max(a.start, h->start); + int e = min(a.end, h->end); + + if (s < e) { + + // is there enough overlap (default ~ 1bp) + float overlap = (float)(e-s) / (float)(a.end - a.start); + + if ( overlap > 0 ) { + + // is this hit the closest? + if (overlap > maxOverlap) { + closestB = *h; + maxOverlap = overlap; + } + } + } + else if (h->end < a.start){ + if ((a.start - h->end) < minDistance) { + closestB = *h; + minDistance = a.start - h->end; + } + } + else { + if ((h->start - a.end) < minDistance) { + closestB = *h; + minDistance = h->start - a.end; + } + } + + } + slop += SLOPGROWTH; // if there were no overlaps, + // we'll widen the range by 64Kb in each direction + } + } + else { + reportA(a); + cout << "\t"; + reportNullB(); + cout << "\n"; + } + + if (numOverlaps > 0) { + reportA(a); + cout << "\t"; + reportB(closestB); + cout << "\n"; + } +} + + +void BedClosest::ClosestBed() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bedB->loadBedFileIntoMap(); + + + string bedLine; + BED bedEntry; + int lineNum = 0; + + // open the BED file for reading + if (bedA->bedFile != "stdin") { + + ifstream bed(bedA->bedFile.c_str(), ios::in); + if ( !bed ) { + cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + BED a; + while (getline(bed, bedLine)) { + + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + if (bedA->parseBedLine(a, bedFields, lineNum)) { + vector<BED> hits; + FindWindowOverlaps(a, hits); + } + } + } + // "A" is being passed via STDIN. + else { + + BED a; + while (getline(cin, bedLine)) { + + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + if (bedA->parseBedLine(a, bedFields, lineNum)) { + + vector<BED> hits; + FindWindowOverlaps(a, hits); + } + } + } +} +// END ClosestBed + + diff --git a/src/closestBed/closestBed.h b/src/closestBed/closestBed.h new file mode 100755 index 0000000000000000000000000000000000000000..641f0df3c8008e722261a04953e5250f9b6473f3 --- /dev/null +++ b/src/closestBed/closestBed.h @@ -0,0 +1,49 @@ +#ifndef CLOSESTBED_H +#define CLOSESTBED_H + +#include "bedFile.h" +#include <vector> +#include <map> +#include <list> +#include <iostream> +#include <fstream> + +using namespace std; + + +//*********************************************** +// Typedefs +//*********************************************** +typedef list<BED> bedList; + + +//************************************************ +// Class methods and elements +//************************************************ +class BedClosest { + +public: + + // constructor + BedClosest(string &, string &); + + // destructor + ~BedClosest(void); + + void reportA(const BED &); + void reportB(const BED &); + void reportNullB(); + + void ClosestBed(); + void FindWindowOverlaps(BED &, vector<BED> &); + +private: + + string bedAFile; + string bedBFile; + + // instance of a bed file class. + BedFile *bedA, *bedB; + +}; +#endif /* CLOSEST_H */ diff --git a/src/closestBed/closestMain.cpp b/src/closestBed/closestMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d36dbfa08628025205313db85ab0358977caaeb4 --- /dev/null +++ b/src/closestBed/closestMain.cpp @@ -0,0 +1,99 @@ +#include "closestBed.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "closestBed" + +// define the version +#define VERSION "1.1.0" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + int slop = 0; + + bool haveBedA = false; + bool haveBedB = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-h", 2, parameterLength) || + PARAMETER_CHECK("--help", 5, parameterLength)) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedClosest *bc = new BedClosest(bedAFile, bedBFile); + bc->ClosestBed(); + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << "===============================================" << endl; + cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; + cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; + cerr << " Hall Laboratory, University of Virginia" << endl; + cerr << "===============================================" << endl << endl; + cerr << "Description: For each feature in BED A, finds the closest feature (upstream or downstream) in BED B" << endl; + + cerr << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + + cerr << "NOTES: " << endl; + cerr << "\t" << "-i stdin " << "allows closestBed to read BED A from stdin. E.g.: cat a.bed | closestBed -a stdin -b B.bed" << endl << endl; + cerr << "\t" << "Reports \"none\" for chrom and \"-1\" for all other fields when a feature is not found in B on the same chromosome as the feature in A. E.g. none -1 -1" << endl; + + // end the program here + exit(1); + +}