Skip to content
Snippets Groups Projects
Commit ad475658 authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Added subtractBed

parent 6869dc73
No related branches found
No related tags found
No related merge requests found
......@@ -19,7 +19,7 @@ export CXX ?= g++
#include includes/$(BLD_PLATFORM).inc
# define our source subdirectories
SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed
SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed $(SRC_DIR)/subtractBed
UTIL_SUBDIRS = $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/sequenceUtilities
all:
......
CXX = g++
CXXFLAGS = -O3 -Wall
LDFLAGS =
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= subtractMain.cpp subtractBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= subtractBed
LIBS=
all: $(PROGRAM)
.PHONY: all
$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS)
@echo " * linking $(PROGRAM)"
@$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/* $(BIN_DIR)/*
.PHONY: clean
chr1 10 20
chr1 0 100
//
// subtractBed.cpp
// BEDTools
//
// Created by Aaron Quinlan Spring 2009.
// Copyright 2009 Aaron Quinlan. All rights reserved.
//
// Summary: Removes overlapping segments from a BED entry.
//
/*
Includes
*/
#include "subtractBed.h"
/*
Constructor
*/
BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction) {
this->bedAFile = bedAFile;
this->bedBFile = bedBFile;
this->overlapFraction = overlapFraction;
this->bedA = new BedFile(bedAFile);
this->bedB = new BedFile(bedBFile);
}
/*
Destructor
*/
BedSubtract::~BedSubtract(void) {
}
/*
reportA
Writes the _original_ BED entry for A.
Works for BED3 - BED6.
*/
void BedSubtract::reportA(BED &a) {
if (bedA->bedType == 3) {
cout << a.chrom << "\t" << a.start << "\t" << a.end;
}
else if (bedA->bedType == 4) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name;
}
else if (bedA->bedType == 5) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name << "\t" << a.score;
}
else if (bedA->bedType == 6) {
cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t"
<< a.name << "\t" << a.score << "\t" << a.strand;
}
}
/*
reportARemainder
Writes the base-pair remainder for a BED entry in A after an overlap
with B has been subtracted.
Works for BED3 - BED6.
*/
void BedSubtract::reportARemainder(BED &a, int &start, int &end) {
if (bedA->bedType == 3) {
cout << a.chrom << "\t" << start << "\t" << end;
}
else if (bedA->bedType == 4) {
cout << a.chrom << "\t" << start << "\t" << end << "\t"
<< a.name;
}
else if (bedA->bedType == 5) {
cout << a.chrom << "\t" << start << "\t" << end << "\t"
<< a.name << "\t" << a.score;
}
else if (bedA->bedType == 6) {
cout << a.chrom << "\t" << start << "\t" << end << "\t"
<< a.name << "\t" << a.score << "\t" << a.strand;
}
}
void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) {
// find all of the overlaps between a and B.
bedB->binKeeperFind(bedB->bedMap[a.chrom], a.start, a.end, hits);
int numOverlaps = 0;
for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) {
int s = max(a.start, h->start);
int e = min(a.end, h->end);
if (s < e) {
numOverlaps++;
// is there enough overlap (default ~ 1bp)
float overlap = ((float)(e-s) / (float)(a.end - a.start));
if ( overlap > this->overlapFraction ) {
if (overlap < 1.0) { // if overlap = 1, then B consumes A and therefore,
// we won't report A.
// A ++++++++++++
// B ----------
// Res. ========
if (h->start > a.start) {
int end = h->start + 1;
reportARemainder(a,a.start,end); cout << "\n";
}
// A ++++++++++++
// B ----------
// Res. ========
else {
reportARemainder(a,a.start,h->end); cout << "\n";
}
}
}
}
}
if (numOverlaps == 0) {
// no overlap found, so just report A as-is.
reportA(a); cout << "\n";
}
}
void BedSubtract::SubtractBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
bedB->loadBedFileIntoMap();
string bedLine;
BED bedEntry;
int lineNum = 0;
// are we dealing with a file?
if (bedA->bedFile != "stdin") {
// open the BED file for reading
ifstream bed(bedA->bedFile.c_str(), ios::in);
if ( !bed ) {
cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl;
exit (1);
}
BED a;
// process each entry in A
while (getline(bed, bedLine)) {
// split the current line into ditinct fields
vector<string> bedFields;
Tokenize(bedLine,bedFields);
lineNum++;
// find the overlaps with B if it's a valid BED entry.
if (bedA->parseBedLine(a, bedFields, lineNum)) {
vector<BED> hits;
FindOverlaps(a, hits);
}
}
}
// "A" is being passed via STDIN.
else {
BED a;
// process each entry in A
while (getline(cin, bedLine)) {
// split the current line into ditinct fields
vector<string> bedFields;
Tokenize(bedLine,bedFields);
lineNum++;
// find the overlaps with B if it's a valid BED entry.
if (bedA->parseBedLine(a, bedFields, lineNum)) {
vector<BED> hits;
FindOverlaps(a, hits);
}
}
}
}
// END Intersect
#ifndef SUBTRACTBED_H
#define SUBTRACTBED_H
#include "bedFile.h"
#include <vector>
#include <map>
#include <list>
#include <iostream>
#include <fstream>
using namespace std;
//***********************************************
// Typedefs
//***********************************************
typedef list<BED> bedList;
//************************************************
// Class methods and elements
//************************************************
class BedSubtract {
public:
// constructor
BedSubtract(string &, string &, float &);
// destructor
~BedSubtract(void);
void reportARemainder(BED &, int &, int &);
void reportA(BED &);
void FindOverlaps(BED &, vector<BED> &);
void SubtractBed();
private:
string bedAFile;
string bedBFile;
float overlapFraction;
bool noHit;
// instance of a bed file class.
BedFile *bedA, *bedB;
};
#endif /* SUBTRACTBED_H */
#include "subtractBed.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "subtractBed"
// define the version
#define VERSION "1.1.0"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void ShowHelp(void);
int main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
// input arguments
float overlapFraction = 1E-9;
bool haveBedA = false;
bool haveBedB = false;
bool haveFraction = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-h", 2, parameterLength) ||
PARAMETER_CHECK("--help", 5, parameterLength)) {
showHelp = true;
}
}
if(showHelp) ShowHelp();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
}
i++;
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
}
i++;
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
haveFraction = true;
overlapFraction = atof(argv[i + 1]);
i++;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction);
bs->SubtractBed();
return 0;
}
else {
ShowHelp();
}
}
void ShowHelp(void) {
cerr << "===============================================" << endl;
cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ;
cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ;
cerr << " Hall Laboratory, University of Virginia" << endl;
cerr << "===============================================" << endl << endl;
cerr << "Description: Report overlaps between a.bed and b.bed." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl;
cerr << "OPTIONS: " << endl;
cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl;
cerr << "NOTES: " << endl;
cerr << "\t" << "-i stdin\t\t" << "Allows BED file A to be read from stdin. E.g.: cat a.bed | subtractBed -a stdin -b B.bed" << endl << endl;
cerr << "\t***Only BED3 - BED6 formats allowed.***"<< endl << endl;
// end the program here
exit(1);
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment