Skip to content
Snippets Groups Projects
Commit 7d3a1c23 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Added closestBed

parent e5a491c1
No related branches found
No related tags found
No related merge requests found
CXX = g++
CXXFLAGS = -O3
LDFLAGS =
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= closestMain.cpp closestBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= closestBed
LIBS=
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
//
// closestBed.cpp
// BEDTools
//
// Created by Aaron Quinlan Spring 2009.
// Copyright 2009 Aaron Quinlan. All rights reserved.
//
// Summary: Looks for the closest features in two BED files.
//
/*
Includes
*/
#include "closestBed.h"
const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases.
// We don't want to keep looking if we
// can't find a nearby feature within 512 Mb.
const int SLOPGROWTH = 2048000;
/*
Constructor
*/
BedClosest::BedClosest(string &bedAFile, string &bedBFile) {
this->bedAFile = bedAFile;
this->bedBFile = bedBFile;
this->bedA = new BedFile(bedAFile);
this->bedB = new BedFile(bedBFile);
}
/*
Destructor
*/
BedClosest::~BedClosest(void) {
}
/*
reportA
Writes the _original_ BED entry for A.
Works for BED3 - BED6.
*/
void BedClosest::reportA(const BED &a) {
if (bedA->bedType == 3) {
cout << a.chrom << "\t" << a.start << "\t" << a.end;
}
else if (bedA->bedType == 4) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name;
}
else if (bedA->bedType == 5) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name << "\t" << a.score;
}
else if (bedA->bedType == 6) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name << "\t" << a.score << "\t" << a.strand;
}
}
/*
reportB
Writes the _original_ BED entry for B.
Works for BED3 - BED6.
*/
void BedClosest::reportB(const BED &b) {
if (bedB->bedType == 3) {
cout << b.chrom << "\t" << b.start << "\t" << b.end;
}
else if (bedB->bedType == 4) {
cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t"
<< b.name;
}
else if (bedB->bedType == 5) {
cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t"
<< b.name << "\t" << b.score;
}
else if (bedB->bedType == 6) {
cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t"
<< b.name << "\t" << b.score << "\t" << b.strand;
}
}
/*
reportNullB
Writes a NULL B entry for cases where no closest BED was found
Works for BED3 - BED6.
*/
void BedClosest::reportNullB() {
if (bedB->bedType == 3) {
cout << "none" << "\t" << "-1" << "\t" << "-1";
}
else if (bedB->bedType == 4) {
cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t"
<< "-1";
}
else if (bedB->bedType == 5) {
cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t"
<< "-1" << "\t" << "-1";
}
else if (bedB->bedType == 6) {
cout << "none" << "\t" << "-1" << "\t" << "-1" << "\t"
<< "-1" << "\t" << "-1" << "\t" << "-1";
}
}
void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) {
int slop = SLOPGROWTH; // start out just looking for overlaps
// within the current bin (~128Kb)
// update the current feature's start and end
int aFudgeStart = 0;
int aFudgeEnd;
int numOverlaps = 0;
BED closestB;
float maxOverlap = 0;
int minDistance = 999999999;
if(bedB->bedMap.find(a.chrom) != bedB->bedMap.end()) {
while ((numOverlaps == 0) && (slop <= MAXSLOP)) {
if ((a.start - slop) > 0) {
aFudgeStart = a.start - slop;
}
else {
aFudgeEnd;
}
aFudgeEnd = a.end + slop;
bedB->binKeeperFind(bedB->bedMap[a.chrom], aFudgeStart, aFudgeEnd, hits);
for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) {
numOverlaps++;
// do the actual features overlap?
int s = max(a.start, h->start);
int e = min(a.end, h->end);
if (s < e) {
// is there enough overlap (default ~ 1bp)
float overlap = (float)(e-s) / (float)(a.end - a.start);
if ( overlap > 0 ) {
// is this hit the closest?
if (overlap > maxOverlap) {
closestB = *h;
maxOverlap = overlap;
}
}
}
else if (h->end < a.start){
if ((a.start - h->end) < minDistance) {
closestB = *h;
minDistance = a.start - h->end;
}
}
else {
if ((h->start - a.end) < minDistance) {
closestB = *h;
minDistance = h->start - a.end;
}
}
}
slop += SLOPGROWTH; // if there were no overlaps,
// we'll widen the range by 64Kb in each direction
}
}
else {
reportA(a);
cout << "\t";
reportNullB();
cout << "\n";
}
if (numOverlaps > 0) {
reportA(a);
cout << "\t";
reportB(closestB);
cout << "\n";
}
}
void BedClosest::ClosestBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bedB->loadBedFileIntoMap();
string bedLine;
BED bedEntry;
int lineNum = 0;
// open the BED file for reading
if (bedA->bedFile != "stdin") {
ifstream bed(bedA->bedFile.c_str(), ios::in);
if ( !bed ) {
cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
BED a;
while (getline(bed, bedLine)) {
vector<string> bedFields;
Tokenize(bedLine,bedFields);
lineNum++;
if (bedA->parseBedLine(a, bedFields, lineNum)) {
vector<BED> hits;
FindWindowOverlaps(a, hits);
}
}
}
// "A" is being passed via STDIN.
else {
BED a;
while (getline(cin, bedLine)) {
vector<string> bedFields;
Tokenize(bedLine,bedFields);
lineNum++;
if (bedA->parseBedLine(a, bedFields, lineNum)) {
vector<BED> hits;
FindWindowOverlaps(a, hits);
}
}
}
}
// END ClosestBed
#ifndef CLOSESTBED_H
#define CLOSESTBED_H
#include "bedFile.h"
#include <vector>
#include <map>
#include <list>
#include <iostream>
#include <fstream>
using namespace std;
//***********************************************
// Typedefs
//***********************************************
typedef list<BED> bedList;
//************************************************
// Class methods and elements
//************************************************
class BedClosest {
public:
// constructor
BedClosest(string &, string &);
// destructor
~BedClosest(void);
void reportA(const BED &);
void reportB(const BED &);
void reportNullB();
void ClosestBed();
void FindWindowOverlaps(BED &, vector<BED> &);
private:
string bedAFile;
string bedBFile;
// instance of a bed file class.
BedFile *bedA, *bedB;
};
#endif /* CLOSEST_H */
#include "closestBed.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "closestBed"
// define the version
#define VERSION "1.1.0"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void ShowHelp(void);
int main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
// input arguments
int slop = 0;
bool haveBedA = false;
bool haveBedB = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-h", 2, parameterLength) ||
PARAMETER_CHECK("--help", 5, parameterLength)) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedClosest *bc = new BedClosest(bedAFile, bedBFile);
bc->ClosestBed();
return 0;
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cerr << "===============================================" << endl;
cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ;
cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ;
cerr << " Hall Laboratory, University of Virginia" << endl;
cerr << "===============================================" << endl << endl;
cerr << "Description: For each feature in BED A, finds the closest feature (upstream or downstream) in BED B" << endl;
cerr << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl;
cerr << "NOTES: " << endl;
cerr << "\t" << "-i stdin " << "allows closestBed to read BED A from stdin. E.g.: cat a.bed | closestBed -a stdin -b B.bed" << endl << endl;
cerr << "\t" << "Reports \"none\" for chrom and \"-1\" for all other fields when a feature is not found in B on the same chromosome as the feature in A. E.g. none -1 -1" << endl;
// end the program here
exit(1);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment