From ad475658ec34ca7c22fd8aa1a21070d7a10f416f Mon Sep 17 00:00:00 2001 From: Aaron Quinlan <aaronquinlan@gmail.com> Date: Sun, 26 Apr 2009 14:21:50 -0400 Subject: [PATCH] Added subtractBed --- Makefile | 2 +- src/subtractBed/Makefile | 44 +++++++ src/subtractBed/a | 1 + src/subtractBed/b | 1 + src/subtractBed/subtractBed.cpp | 203 +++++++++++++++++++++++++++++++ src/subtractBed/subtractBed.h | 52 ++++++++ src/subtractBed/subtractMain.cpp | 114 +++++++++++++++++ 7 files changed, 416 insertions(+), 1 deletion(-) create mode 100644 src/subtractBed/Makefile create mode 100644 src/subtractBed/a create mode 100644 src/subtractBed/b create mode 100755 src/subtractBed/subtractBed.cpp create mode 100755 src/subtractBed/subtractBed.h create mode 100755 src/subtractBed/subtractMain.cpp diff --git a/Makefile b/Makefile index b3b8aec1..7d9a5c97 100644 --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ export CXX ?= g++ #include includes/$(BLD_PLATFORM).inc # define our source subdirectories -SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed +SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed $(SRC_DIR)/subtractBed UTIL_SUBDIRS = $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/sequenceUtilities all: diff --git a/src/subtractBed/Makefile b/src/subtractBed/Makefile new file mode 100644 index 00000000..ae38802f --- /dev/null +++ b/src/subtractBed/Makefile @@ -0,0 +1,44 @@ +CXX = g++ +CXXFLAGS = -O3 -Wall +LDFLAGS = + +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= subtractMain.cpp subtractBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= subtractBed +LIBS= + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/subtractBed/a b/src/subtractBed/a new file mode 100644 index 00000000..d6c80bc6 --- /dev/null +++ b/src/subtractBed/a @@ -0,0 +1 @@ +chr1 10 20 diff --git a/src/subtractBed/b b/src/subtractBed/b new file mode 100644 index 00000000..c03d074e --- /dev/null +++ b/src/subtractBed/b @@ -0,0 +1 @@ +chr1 0 100 diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp new file mode 100755 index 00000000..c7ef2178 --- /dev/null +++ b/src/subtractBed/subtractBed.cpp @@ -0,0 +1,203 @@ +// +// subtractBed.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Removes overlapping segments from a BED entry. +// + +/* + Includes +*/ +#include "subtractBed.h" + + +/* + Constructor +*/ +BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction) { + + this->bedAFile = bedAFile; + this->bedBFile = bedBFile; + this->overlapFraction = overlapFraction; + + this->bedA = new BedFile(bedAFile); + this->bedB = new BedFile(bedBFile); +} + +/* + Destructor +*/ +BedSubtract::~BedSubtract(void) { +} + + + +/* + reportA + + Writes the _original_ BED entry for A. + Works for BED3 - BED6. +*/ +void BedSubtract::reportA(BED &a) { + + if (bedA->bedType == 3) { + cout << a.chrom << "\t" << a.start << "\t" << a.end; + } + else if (bedA->bedType == 4) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name; + } + else if (bedA->bedType == 5) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score; + } + else if (bedA->bedType == 6) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score << "\t" << a.strand; + } +} + + +/* + reportARemainder + + Writes the base-pair remainder for a BED entry in A after an overlap + with B has been subtracted. + + Works for BED3 - BED6. +*/ +void BedSubtract::reportARemainder(BED &a, int &start, int &end) { + + if (bedA->bedType == 3) { + cout << a.chrom << "\t" << start << "\t" << end; + } + else if (bedA->bedType == 4) { + cout << a.chrom << "\t" << start << "\t" << end << "\t" + << a.name; + } + else if (bedA->bedType == 5) { + cout << a.chrom << "\t" << start << "\t" << end << "\t" + << a.name << "\t" << a.score; + } + else if (bedA->bedType == 6) { + cout << a.chrom << "\t" << start << "\t" << end << "\t" + << a.name << "\t" << a.score << "\t" << a.strand; + } + +} + + + + +void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { + + // find all of the overlaps between a and B. + bedB->binKeeperFind(bedB->bedMap[a.chrom], a.start, a.end, hits); + + int numOverlaps = 0; + for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + int s = max(a.start, h->start); + int e = min(a.end, h->end); + + if (s < e) { + + numOverlaps++; + + // is there enough overlap (default ~ 1bp) + float overlap = ((float)(e-s) / (float)(a.end - a.start)); + + if ( overlap > this->overlapFraction ) { + + if (overlap < 1.0) { // if overlap = 1, then B consumes A and therefore, + // we won't report A. + // A ++++++++++++ + // B ---------- + // Res. ======== + if (h->start > a.start) { + int end = h->start + 1; + reportARemainder(a,a.start,end); cout << "\n"; + } + // A ++++++++++++ + // B ---------- + // Res. ======== + else { + reportARemainder(a,a.start,h->end); cout << "\n"; + } + } + } + } + + } + if (numOverlaps == 0) { + // no overlap found, so just report A as-is. + reportA(a); cout << "\n"; + } +} + + + +void BedSubtract::SubtractBed() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bedB->loadBedFileIntoMap(); + + string bedLine; + BED bedEntry; + int lineNum = 0; + + // are we dealing with a file? + if (bedA->bedFile != "stdin") { + + // open the BED file for reading + ifstream bed(bedA->bedFile.c_str(), ios::in); + if ( !bed ) { + cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + BED a; + // process each entry in A + while (getline(bed, bedLine)) { + + // split the current line into ditinct fields + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + + // find the overlaps with B if it's a valid BED entry. + if (bedA->parseBedLine(a, bedFields, lineNum)) { + vector<BED> hits; + FindOverlaps(a, hits); + } + } + } + // "A" is being passed via STDIN. + else { + + BED a; + // process each entry in A + while (getline(cin, bedLine)) { + + // split the current line into ditinct fields + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + + // find the overlaps with B if it's a valid BED entry. + if (bedA->parseBedLine(a, bedFields, lineNum)) { + vector<BED> hits; + FindOverlaps(a, hits); + } + } + } +} +// END Intersect + + diff --git a/src/subtractBed/subtractBed.h b/src/subtractBed/subtractBed.h new file mode 100755 index 00000000..f46c8022 --- /dev/null +++ b/src/subtractBed/subtractBed.h @@ -0,0 +1,52 @@ +#ifndef SUBTRACTBED_H +#define SUBTRACTBED_H + +#include "bedFile.h" +#include <vector> +#include <map> +#include <list> +#include <iostream> +#include <fstream> + +using namespace std; + + +//*********************************************** +// Typedefs +//*********************************************** +typedef list<BED> bedList; + + +//************************************************ +// Class methods and elements +//************************************************ +class BedSubtract { + +public: + + // constructor + BedSubtract(string &, string &, float &); + + // destructor + ~BedSubtract(void); + + void reportARemainder(BED &, int &, int &); + void reportA(BED &); + + void FindOverlaps(BED &, vector<BED> &); + void SubtractBed(); + + +private: + + string bedAFile; + string bedBFile; + float overlapFraction; + bool noHit; + + // instance of a bed file class. + BedFile *bedA, *bedB; + +}; + +#endif /* SUBTRACTBED_H */ diff --git a/src/subtractBed/subtractMain.cpp b/src/subtractBed/subtractMain.cpp new file mode 100755 index 00000000..83962e0f --- /dev/null +++ b/src/subtractBed/subtractMain.cpp @@ -0,0 +1,114 @@ +#include "subtractBed.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "subtractBed" + +// define the version +#define VERSION "1.1.0" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool haveFraction = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-h", 2, parameterLength) || + PARAMETER_CHECK("--help", 5, parameterLength)) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + } + i++; + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + } + i++; + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction); + bs->SubtractBed(); + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << "===============================================" << endl; + cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; + cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; + cerr << " Hall Laboratory, University of Virginia" << endl; + cerr << "===============================================" << endl << endl; + cerr << "Description: Report overlaps between a.bed and b.bed." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + + cerr << "OPTIONS: " << endl; + cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; + + cerr << "NOTES: " << endl; + cerr << "\t" << "-i stdin\t\t" << "Allows BED file A to be read from stdin. E.g.: cat a.bed | subtractBed -a stdin -b B.bed" << endl << endl; + cerr << "\t***Only BED3 - BED6 formats allowed.***"<< endl << endl; + + // end the program here + exit(1); +} -- GitLab