From d538e8e7b1e9834b667cc7144f146fa0c53c3ac4 Mon Sep 17 00:00:00 2001 From: Aaron Quinlan <aaronquinlan@gmail.com> Date: Wed, 22 Apr 2009 22:50:02 -0400 Subject: [PATCH] Added new windowBed program Added windowBed, which scans for overlaps within a requested widow size. --- src/intersectBed/intersectMain.cpp | 2 +- src/windowBed/Makefile | 44 ++++ src/windowBed/windowBed.cpp | 231 ++++++++++++++++++ src/windowBed/windowBed.h | 51 ++++ .../{intersectMain.cpp => windowMain.cpp} | 66 ++--- 5 files changed, 345 insertions(+), 49 deletions(-) create mode 100644 src/windowBed/Makefile create mode 100755 src/windowBed/windowBed.cpp create mode 100755 src/windowBed/windowBed.h rename src/windowBed/{intersectMain.cpp => windowMain.cpp} (53%) diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index f3bd42e2..a77a6946 100755 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -141,7 +141,7 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "OPTIONS: " << endl; - cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap bed." << endl << "\t\t\t\tIn other words, ignore multiple overlaps." << endl << endl; + cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap with B.bed." << endl << "\t\t\t\tIn other words, just report the fact >=1 hit was found." << endl << endl; cerr << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl; cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; cerr << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B while restricting to -f." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl; diff --git a/src/windowBed/Makefile b/src/windowBed/Makefile new file mode 100644 index 00000000..f1697a77 --- /dev/null +++ b/src/windowBed/Makefile @@ -0,0 +1,44 @@ +CXX = g++ +CXXFLAGS = -O3 +LDFLAGS = -m64 + +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= windowMain.cpp windowBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= windowBed +LIBS= + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp new file mode 100755 index 00000000..79cf04f5 --- /dev/null +++ b/src/windowBed/windowBed.cpp @@ -0,0 +1,231 @@ +// +// windowBed.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Looks for overlaps between features in two BED files. +// + +/* + Includes +*/ +#include "windowBed.h" + + +/* + Constructor +*/ +BedWindow::BedWindow(string &bedAFile, string &bedBFile, int &slop, bool &anyHit, bool &noHit, bool &writeCount) { + + this->bedAFile = bedAFile; + this->bedBFile = bedBFile; + + this->slop = slop; + this->anyHit = anyHit; + this->noHit = noHit; + this->writeCount = writeCount; + + this->bedA = new BedFile(bedAFile); + this->bedB = new BedFile(bedBFile); +} + +/* + Destructor +*/ +BedWindow::~BedWindow(void) { +} + + + +/* + reportA + + Writes the _original_ BED entry for A. + Works for BED3 - BED6. +*/ +void BedWindow::reportA(const BED &a) { + + if (bedA->bedType == 3) { + cout << a.chrom << "\t" << a.start << "\t" << a.end; + } + else if (bedA->bedType == 4) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name; + } + else if (bedA->bedType == 5) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score; + } + else if (bedA->bedType == 6) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score << "\t" << a.strand; + } +} + + + +/* + reportB + + Writes the _original_ BED entry for B. + Works for BED3 - BED6. +*/ +void BedWindow::reportB(const BED &b) { + if (bedB->bedType == 3) { + cout << b.chrom << "\t" << b.start << "\t" << b.end; + } + else if (bedB->bedType == 4) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name; + } + else if (bedB->bedType == 5) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name << "\t" << b.score; + } + else if (bedB->bedType == 6) { + cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" + << b.name << "\t" << b.score << "\t" << b.strand; + } +} + + + + + +void BedWindow::WindowIntersectBed() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bedB->loadBedFileIntoMap(); + + + string bedLine; + BED bedEntry; + int lineNum = 0; + + // open the BED file for reading + if (bedA->bedFile != "stdin") { + + ifstream bed(bedA->bedFile.c_str(), ios::in); + if ( !bed ) { + cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + BED a; + while (getline(bed, bedLine)) { + + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + if (bedA->parseBedLine(a, bedFields, lineNum)) { + + vector<BED> hits; + + // update the current feature's start and end + // according to the slop requested (slop = 0 by default) + int aFudgeStart = 0; + int aFudgeEnd; + if ((a.start - this->slop) > 0) { + aFudgeStart = a.start - this->slop; + } + else { + aFudgeEnd; + } + aFudgeEnd = a.end + this->slop; + + bedB->binKeeperFind(bedB->bedMap[a.chrom], aFudgeStart, aFudgeEnd, hits); + + int numOverlaps = 0; + for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + int s = max(aFudgeStart, h->start); + int e = min(aFudgeEnd, h->end); + + if (s < e) { + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end - a.start)) > 0 ) { + numOverlaps++; + if (!anyHit && !noHit && !writeCount) { + reportA(a); cout << "\t"; + reportB(*h); cout << endl; + } + } + } + } + if (anyHit && (numOverlaps >= 1)) { + reportA(a); cout << endl; + } + else if (writeCount) { + reportA(a); cout << "\t" << numOverlaps << endl; + } + else if (noHit && (numOverlaps == 0)) { + reportA(a); cout << endl; + } + } + } + } + // "A" is being passed via STDIN. + else { + + BED a; + while (getline(cin, bedLine)) { + + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + if (bedA->parseBedLine(a, bedFields, lineNum)) { + + vector<BED> hits; + + // update the current feature's start and end + // according to the slop requested (slop = 0 by default) + int aFudgeStart = 0; + int aFudgeEnd; + if ((a.start - this->slop) > 0) { + aFudgeStart = a.start - this->slop; + } + else { + aFudgeEnd; + } + aFudgeEnd = a.end + this->slop; + + bedB->binKeeperFind(bedB->bedMap[a.chrom], aFudgeStart, aFudgeEnd, hits); + + int numOverlaps = 0; + for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + int s = max(aFudgeStart, h->start); + int e = min(aFudgeEnd, h->end); + + if (s < e) { + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end - a.start)) > 0 ) { + numOverlaps++; + if (!anyHit && !noHit && !writeCount) { + reportA(a); cout << "\t"; + reportB(*h); cout << endl; + } + } + } + } + if (anyHit && (numOverlaps >= 1)) { + reportA(a); cout << endl; + } + else if (writeCount) { + reportA(a); cout << "\t" << numOverlaps << endl; + } + else if (noHit && (numOverlaps == 0)) { + reportA(a); cout << endl; + } + } + } + } +} +// END WindowIntersectBed + + diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h new file mode 100755 index 00000000..61e9b9b2 --- /dev/null +++ b/src/windowBed/windowBed.h @@ -0,0 +1,51 @@ +#ifndef WINDOWBED_H +#define WINDOWBED_H + +#include "bedFile.h" +#include <vector> +#include <map> +#include <list> +#include <iostream> +#include <fstream> + +using namespace std; + + +//*********************************************** +// Typedefs +//*********************************************** +typedef list<BED> bedList; + + +//************************************************ +// Class methods and elements +//************************************************ +class BedWindow { + +public: + + // constructor + BedWindow(string &, string &, int &, bool &, bool &, bool &); + + // destructor + ~BedWindow(void); + + void reportA(const BED &); + void reportB(const BED &); + + void WindowIntersectBed(); + +private: + + string bedAFile; + string bedBFile; + bool anyHit; + bool writeCount; + int slop; + bool noHit; + + // instance of a bed file class. + BedFile *bedA, *bedB; + +}; +#endif /* WINDOWBED_H */ diff --git a/src/windowBed/intersectMain.cpp b/src/windowBed/windowMain.cpp similarity index 53% rename from src/windowBed/intersectMain.cpp rename to src/windowBed/windowMain.cpp index 4cd0393a..0a390b4d 100755 --- a/src/windowBed/intersectMain.cpp +++ b/src/windowBed/windowMain.cpp @@ -1,9 +1,9 @@ -#include "intersectBed.h" +#include "windowBed.h" using namespace std; // define our program name -#define PROGRAM_NAME "intersectBed" +#define PROGRAM_NAME "windowBed" // define the version #define VERSION "1.1.0" @@ -24,17 +24,13 @@ int main(int argc, char* argv[]) { string bedBFile; // input arguments - float overlapFraction = 1E-9; int slop = 0; bool haveBedA = false; bool haveBedB = false; bool noHit = false; bool anyHit = false; - bool writeA = false; - bool writeB = false; bool writeCount = false; - bool haveFraction = false; bool haveSlop = false; // check to see if we should print out some help @@ -69,24 +65,13 @@ int main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-u", 2, parameterLength)) { anyHit = true; } - else if(PARAMETER_CHECK("-f", 2, parameterLength)) { - haveFraction = true; - overlapFraction = atof(argv[i + 1]); - i++; - } - else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { - writeA = true; - } - else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { - writeB = true; - } else if(PARAMETER_CHECK("-c", 2, parameterLength)) { writeCount = true; } else if (PARAMETER_CHECK("-v", 2, parameterLength)) { noHit = true; } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + else if (PARAMETER_CHECK("-w", 2, parameterLength)) { haveSlop = true; slop = atoi(argv[i + 1]); i++; @@ -107,31 +92,20 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; showHelp = true; } - - if (writeB && writeCount) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; - showHelp = true; - } if (anyHit && writeCount) { cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; showHelp = true; } - if (haveSlop && haveFraction) { - cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -f, not both." << endl << "*****" << endl; - showHelp = true; - } - if (haveSlop && (slop < 0)) { cerr << endl << "*****" << endl << "*****ERROR: Slop (-s) must be positive." << endl << "*****" << endl; showHelp = true; } if (!showHelp) { - - BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, overlapFraction, slop, noHit, writeCount); - bi->IntersectBed(); + BedWindow *bi = new BedWindow(bedAFile, bedBFile, slop, anyHit, noHit, writeCount); + bi->WindowIntersectBed(); return 0; } else { @@ -141,30 +115,26 @@ int main(int argc, char* argv[]) { void ShowHelp(void) { - cerr << "=======================================================" << endl; - cerr << PROGRAM_NAME << " v" << VERSION << endl ; - cerr << "Aaron Quinlan, Ph.D." << endl; - cerr << "aaronquinlan@gmail.com" << endl ; - cerr << "Hall Laboratory" << endl; - cerr << "Biochemistry and Molecular Genetics" << endl; - cerr << "University of Virginia" << endl; - cerr << "=======================================================" << endl << endl; - cerr << "Description: Report overlaps between a.bed and b.bed." << endl << endl; + cerr << "===============================================" << endl; + cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; + cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; + cerr << " Hall Laboratory, University of Virginia" << endl; + cerr << "===============================================" << endl << endl; + cerr << "Description: Examines a \"window\" around each feature in A.bed and" << endl; + cerr << "reports all features in B.bed that are within the window. For each overlap the entire entry in A and B are reported." + << endl << endl; cerr << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl; cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "OPTIONS: " << endl; - cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap bed." << endl << "\t\t\t\tIn other words, ignore multiple overlaps." << endl << endl; - cerr << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl; - cerr << "\t" << "-s (100000)\t\t" << "Slop added before and after each entry in A" << endl << "\t\t\t\tUseful for finding overlaps within N bases upstream and downstream." << endl << endl; - cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; - cerr << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B while restricting to -f." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl; - cerr << "\t" << "-wa \t\t\t" << "Write the original entry in A for each overlap." << endl << endl; - cerr << "\t" << "-wb \t\t\t" << "Write the original entry in B for each overlap." << endl << "\t\t\t\tUseful for knowing _what_ A overlaps. Restricted by -f." << endl << endl; + cerr << "\t" << "-w (e.g. 100000)\t" << "Base pairs added before and after each entry in A when searching for overlaps in B" << endl << endl; + cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap with B.bed." << endl << "\t\t\t\tIn other words, just report the fact >=1 hit was found." << endl << endl; + cerr << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B within the requested window." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl; + cerr << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B within the requested window." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl; cerr << "NOTES: " << endl; - cerr << "\t" << "-i stdin\t\t" << "Allows intersectBed to read BED from stdin. E.g.: cat a.bed | intersectBed -a stdin -b B.bed" << endl << endl; + cerr << "\t" << "-i stdin\t\t" << "Allows windowBed to read BED from stdin. E.g.: cat a.bed | windowBed -a stdin -b B.bed" << endl << endl; // end the program here -- GitLab