diff --git a/FAQ b/FAQ old mode 100644 new mode 100755 diff --git a/Makefile b/Makefile old mode 100644 new mode 100755 index ed2a44b11ce3b6acf65dcfec90dcf1f87f2b260f..e7b741ab8e5464b0bbb33bcba62e66fda56eb9fd --- a/Makefile +++ b/Makefile @@ -19,7 +19,7 @@ export CXX ?= g++ #include includes/$(BLD_PLATFORM).inc # define our source subdirectories -SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed $(SRC_DIR)/subtractBed $(SRC_DIR)/linksBed +SUBDIRS = $(SRC_DIR)/closestBed $(SRC_DIR)/complementBed $(SRC_DIR)/coverageBed $(SRC_DIR)/intersectBed $(SRC_DIR)/mergeBed $(SRC_DIR)/genomeCoverageBed $(SRC_DIR)/fastaFromBed $(SRC_DIR)/sortBed $(SRC_DIR)/windowBed $(SRC_DIR)/subtractBed $(SRC_DIR)/linksBed $(SRC_DIR)/peIntersectBed UTIL_SUBDIRS = $(SRC_DIR)/utils/bedFile $(SRC_DIR)/utils/sequenceUtilities all: diff --git a/README b/README old mode 100644 new mode 100755 diff --git a/RELEASE_HISTORY b/RELEASE_HISTORY new file mode 100644 index 0000000000000000000000000000000000000000..c4c289407334d1bd106fc266571cfa4e2bb7af9b --- /dev/null +++ b/RELEASE_HISTORY @@ -0,0 +1,18 @@ +VERSION 2.0 +1. Sped up the file parsing. ~10-20% increase in speed. +2. Created reportBed() as a common method in the bedFile class. Cleans up the code quite nicely. +3. Added the ability to compare BED files accounting for strandedness. +4. Paired-end intersect. +5. Fixed bug that prevented overlaps from being reported when the overlap fraction requested is 1.0 + + +VERSION 1.2, 04/27/2009. (1eb06115bdf3c49e75793f764a70c3501bb53f33) +1. Added subtractBed. + A. Fixed bug that prevented "split" overlaps from being reported. + B. Prevented A from being reported if >=1 feature in B completely spans it. +2. Added linksBed. +3. Added the ability to define separate windows for upstream and downstream to windowBed. + + +VERSION 1.1, 04/23/2009. (b74eb1afddca9b70bfa90ba763d4f2981a56f432) +Initial release. \ No newline at end of file diff --git a/TODO b/TODO old mode 100644 new mode 100755 diff --git a/USAGE_EXAMPLES b/USAGE_EXAMPLES old mode 100644 new mode 100755 diff --git a/VERSION b/VERSION old mode 100644 new mode 100755 diff --git a/bin/closestBed b/bin/closestBed index b32826c21f63162f3b8c369ba945e4691e98b002..38828d57841498707bfa4e2b4100a552ddeb7379 100755 Binary files a/bin/closestBed and b/bin/closestBed differ diff --git a/bin/complementBed b/bin/complementBed index 9fd53408126a3c8987f74e9dd5858328e4372497..d2d85c9bf399072c69f9f17193661013ee117094 100755 Binary files a/bin/complementBed and b/bin/complementBed differ diff --git a/bin/coverageBed b/bin/coverageBed index 39431e32aa196348f2ab6e01f8ac9d80d76a1120..37b6808c6bc90e0a044c2c62ed0fa6801dafccd0 100755 Binary files a/bin/coverageBed and b/bin/coverageBed differ diff --git a/bin/fastaFromBed b/bin/fastaFromBed index ebe54d0e539f7c68d30f053a0d08206c5040440e..b4ba0479184cfd753b82f2bd5dcb497cfedf253e 100755 Binary files a/bin/fastaFromBed and b/bin/fastaFromBed differ diff --git a/bin/genomeCoverageBed b/bin/genomeCoverageBed index 6a592e3b52a37cff1ca82735678445c8fd533778..fa91bf3ac714a6834d32e11e9ac6a0daf62c3602 100755 Binary files a/bin/genomeCoverageBed and b/bin/genomeCoverageBed differ diff --git a/bin/intersectBed b/bin/intersectBed index bff07c6224c4a36592aec5a1ab0f151f2925c03d..52dee09c309c25f7742a18bbac963cf4c0cb5c8b 100755 Binary files a/bin/intersectBed and b/bin/intersectBed differ diff --git a/bin/linksBed b/bin/linksBed index 545b9be09c4ffdd8e2101a95b7557d01432e94d5..604f04f714e4abcddb0e0783cc12b275b9de54ed 100755 Binary files a/bin/linksBed and b/bin/linksBed differ diff --git a/bin/mergeBed b/bin/mergeBed index 65900fefc3d90d03734b11d01bf2514e06ee0e64..bac48035f4d71ed3502f051a79be5841b7abb32e 100755 Binary files a/bin/mergeBed and b/bin/mergeBed differ diff --git a/bin/peIntersectBed b/bin/peIntersectBed new file mode 100755 index 0000000000000000000000000000000000000000..d206b4f971d84816e93d18b87d1084cb6449edbc Binary files /dev/null and b/bin/peIntersectBed differ diff --git a/bin/sortBed b/bin/sortBed index db1e21c71382b0350eedacf754719edd2942c085..4e17557463da0f0b4839b3c6546cdef24307f838 100755 Binary files a/bin/sortBed and b/bin/sortBed differ diff --git a/bin/subtractBed b/bin/subtractBed index 8cca2fad922a1c2d2d0fa3dc9265653a1e371bfe..9d0b9295965c22d51ad9561fa4b03698a2570cff 100755 Binary files a/bin/subtractBed and b/bin/subtractBed differ diff --git a/bin/windowBed b/bin/windowBed index 440553eb18c34bfbc24b76efc5ceef0c0e06d469..c2a78a47a34f32e142c57ab1735c5c89d13fc18e 100755 Binary files a/bin/windowBed and b/bin/windowBed differ diff --git a/obj/bedFile.o b/obj/bedFile.o index 52fcb5ab9214d64a8ada9585ae8c1239961d621d..605a7f3cf4d3889c114d634f825752d28572a927 100644 Binary files a/obj/bedFile.o and b/obj/bedFile.o differ diff --git a/obj/bedFilePE.o b/obj/bedFilePE.o new file mode 100644 index 0000000000000000000000000000000000000000..44bc7b30813c9677f4e4e780e5146f6e7722a8ee Binary files /dev/null and b/obj/bedFilePE.o differ diff --git a/obj/closestBed.o b/obj/closestBed.o index 000735e3d59a680edd6d5ec904f25cadcd91eebd..8e7085fbad22c6165978c88b1373874b0dd6f1e9 100644 Binary files a/obj/closestBed.o and b/obj/closestBed.o differ diff --git a/obj/closestMain.o b/obj/closestMain.o index d22bf2b04f19a5f253042b232e294357006dc459..a02cd8688d8978535b808e621670c3e6c4d15253 100644 Binary files a/obj/closestMain.o and b/obj/closestMain.o differ diff --git a/obj/coverageBed.o b/obj/coverageBed.o index 5f4869875d858c67566301fd453dc5ecf581a815..354207333394714fcf5fb31971c2330989f42d9b 100644 Binary files a/obj/coverageBed.o and b/obj/coverageBed.o differ diff --git a/obj/genomeCoverageBed.o b/obj/genomeCoverageBed.o index 75364cfb6d582295ab20028a4e7bda1e63d59073..a17a58acf05989b004bffdd3e3e8ee54d36406e1 100644 Binary files a/obj/genomeCoverageBed.o and b/obj/genomeCoverageBed.o differ diff --git a/obj/intersectBed.o b/obj/intersectBed.o index 3c3e8663ea3340289fd315a8c871a18d45e69152..e074a0271b291ae2b4dff3a4315091c09774ae78 100644 Binary files a/obj/intersectBed.o and b/obj/intersectBed.o differ diff --git a/obj/intersectMain.o b/obj/intersectMain.o index a82ff0edfb399f5db4808e809bcbec0982ac9ed4..d9c9b4e19c42324b81a12926e1edc5e6f9dc3753 100644 Binary files a/obj/intersectMain.o and b/obj/intersectMain.o differ diff --git a/obj/lineFileUtilities.o b/obj/lineFileUtilities.o new file mode 100644 index 0000000000000000000000000000000000000000..9bd38530ff0f910da69a6e32fc3e77a7218879f8 Binary files /dev/null and b/obj/lineFileUtilities.o differ diff --git a/obj/mergeBed.o b/obj/mergeBed.o index 1f9aa477fbd3b56cf31e6b988454f281cb08a612..0dc5bb4a9ee203eb48905ac7fc32dd48e0f7a9b8 100644 Binary files a/obj/mergeBed.o and b/obj/mergeBed.o differ diff --git a/obj/mergeMain.o b/obj/mergeMain.o index 549d26ea3e18f01946eed8035867d59f00cd932a..b60a15e3a954ed0e3c8fe3e8a2b97b160f76109e 100644 Binary files a/obj/mergeMain.o and b/obj/mergeMain.o differ diff --git a/obj/peIntersectBed.o b/obj/peIntersectBed.o new file mode 100644 index 0000000000000000000000000000000000000000..a55a31eebf44cf52ef54efe42eba526451557f27 Binary files /dev/null and b/obj/peIntersectBed.o differ diff --git a/obj/peIntersectMain.o b/obj/peIntersectMain.o new file mode 100644 index 0000000000000000000000000000000000000000..148a967d298a1f79991bdc3f775d59c9710c4b3c Binary files /dev/null and b/obj/peIntersectMain.o differ diff --git a/obj/sortBed.o b/obj/sortBed.o index 5223daf396741aa51079a2432fc8fd3bca42c33b..2b5ea2282412c294b1ddcc9a6e84da2685909537 100644 Binary files a/obj/sortBed.o and b/obj/sortBed.o differ diff --git a/obj/subtractBed.o b/obj/subtractBed.o index bcccf40e406e386445dbe588f206e54c5aa3618b..6a83c1b6e469da2225e0b0150dd1e7db0a5b47a4 100644 Binary files a/obj/subtractBed.o and b/obj/subtractBed.o differ diff --git a/obj/subtractMain.o b/obj/subtractMain.o index 1dc70dbea10d1c2b093ce7c6d60b69c4871e79af..56b174772dd83e4024f1c0d8a88d94aa9f415d43 100644 Binary files a/obj/subtractMain.o and b/obj/subtractMain.o differ diff --git a/obj/windowBed.o b/obj/windowBed.o index 83fa3dbb5835d758181a18bfbc9e09e6960bf816..cd2ebb862ab9fa8705f6eb1687747a97ed57d9fd 100644 Binary files a/obj/windowBed.o and b/obj/windowBed.o differ diff --git a/obj/windowMain.o b/obj/windowMain.o index 96eecbb4563b060f3b9b5671cc6cef4cc2b0c368..c98e0f19f17559425ce3b228e1d124cadb4d2f5f 100644 Binary files a/obj/windowMain.o and b/obj/windowMain.o differ diff --git a/src/closestBed/Makefile b/src/closestBed/Makefile old mode 100644 new mode 100755 index 578687f5db400020f22453bdb03e61c8dc8ca4a2..56a3533626a210ad71f0ee8010baf0a4f40c2dad --- a/src/closestBed/Makefile +++ b/src/closestBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= closestMain.cpp closestBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= closestBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/closestBed/closestBed.cpp b/src/closestBed/closestBed.cpp index 96a3498929c83218f920a6307fdb0068a28908c8..b6a1343ab899a740a4a55bdd7786e60f77fac185 100755 --- a/src/closestBed/closestBed.cpp +++ b/src/closestBed/closestBed.cpp @@ -7,10 +7,7 @@ // // Summary: Looks for the closest features in two BED files. // - -/* - Includes -*/ +#include "lineFileUtilities.h" #include "closestBed.h" const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. @@ -22,13 +19,15 @@ const int SLOPGROWTH = 2048000; /* Constructor */ -BedClosest::BedClosest(string &bedAFile, string &bedBFile) { +BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool &forceStrand) { this->bedAFile = bedAFile; this->bedBFile = bedBFile; + this->forceStrand = forceStrand; this->bedA = new BedFile(bedAFile); this->bedB = new BedFile(bedBFile); + } /* @@ -38,61 +37,6 @@ BedClosest::~BedClosest(void) { } - -/* - reportA - - Writes the _original_ BED entry for A. - Works for BED3 - BED6. -*/ -void BedClosest::reportA(const BED &a) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << a.start << "\t" << a.end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } -} - - - -/* - reportB - - Writes the _original_ BED entry for B. - Works for BED3 - BED6. -*/ -void BedClosest::reportB(const BED &b) { - if (bedB->bedType == 3) { - cout << b.chrom << "\t" << b.start << "\t" << b.end; - } - else if (bedB->bedType == 4) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name; - } - else if (bedB->bedType == 5) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score; - } - else if (bedB->bedType == 6) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score << "\t" << b.strand; - } -} - - - - /* reportNullB @@ -155,6 +99,12 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { bedB->binKeeperFind(bedB->bedMap[a.chrom], aFudgeStart, aFudgeEnd, hits); for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + // if forcing strandedness, move on if the hit + // is not on the same strand as A. + if ((this->forceStrand) && (a.strand != h->strand)) { + continue; // continue force the next iteration of the for loop. + } numOverlaps++; @@ -190,21 +140,24 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { } } - slop += SLOPGROWTH; // if there were no overlaps, - // we'll widen the range by 64Kb in each direction + /* if no overlaps were found, we'll + widen the range by in each direction + and search again. + */ + slop += SLOPGROWTH; } } else { - reportA(a); + bedA->reportBed(a); cout << "\t"; reportNullB(); cout << "\n"; } if (numOverlaps > 0) { - reportA(a); + bedA->reportBed(a); cout << "\t"; - reportB(closestB); + bedB->reportBed(closestB); cout << "\n"; } } diff --git a/src/closestBed/closestBed.h b/src/closestBed/closestBed.h index 641f0df3c8008e722261a04953e5250f9b6473f3..ddeb2722f6f0f2420304afa4db655857f91bf887 100755 --- a/src/closestBed/closestBed.h +++ b/src/closestBed/closestBed.h @@ -3,20 +3,11 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ @@ -25,7 +16,7 @@ class BedClosest { public: // constructor - BedClosest(string &, string &); + BedClosest(string &, string &, bool &); // destructor ~BedClosest(void); @@ -41,7 +32,8 @@ private: string bedAFile; string bedBFile; - + bool forceStrand; + // instance of a bed file class. BedFile *bedA, *bedB; diff --git a/src/closestBed/closestMain.cpp b/src/closestBed/closestMain.cpp old mode 100644 new mode 100755 index 5c860657224f66311843a1855c6c86fa2467add8..529a64f84b128d82d2299f75223706ed41e0b243 --- a/src/closestBed/closestMain.cpp +++ b/src/closestBed/closestMain.cpp @@ -25,6 +25,7 @@ int main(int argc, char* argv[]) { bool haveBedA = false; bool haveBedB = false; + bool forceStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -54,6 +55,9 @@ int main(int argc, char* argv[]) { haveBedB = true; bedBFile = argv[i + 1]; i++; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; } } @@ -64,7 +68,7 @@ int main(int argc, char* argv[]) { } if (!showHelp) { - BedClosest *bc = new BedClosest(bedAFile, bedBFile); + BedClosest *bc = new BedClosest(bedAFile, bedBFile, forceStrand); bc->ClosestBed(); return 0; } @@ -84,9 +88,13 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + cerr << "OPTIONS: " << endl; + cerr << "\t" << "-s\t\t\t" << "Force strandedness. Only report hits in B that overlap A on the same strand." << endl << "\t\t\t\tBy default, overlaps are reported without respect to strand." << endl << endl; + cerr << "NOTES: " << endl; cerr << "\t" << "-i stdin\t\t" << "Allows BED file A to be read from stdin. E.g.: cat a.bed | closestBed -a stdin -b B.bed" << endl << endl; - cerr << "\t" << "Reports \"none\" for chrom and \"-1\" for all other fields when a feature is not found in B on the same chromosome as the feature in A. E.g. none -1 -1" << endl; + cerr << "\t" << "Reports \"none\" for chrom and \"-1\" for all other fields when a feature is not found in B on the same chromosome" << endl; + cerr << "\t" << "as the feature in A. E.g. none\t-1\t-1" << endl << endl; cerr << "\t***Only BED3 - BED6 formats allowed.***" << endl << endl; // end the program here diff --git a/src/complementBed/Makefile b/src/complementBed/Makefile index 87429c8e78b91037563e398f50ff5f8d92ed75b0..72aee529e2d40344215ecdbba7312dcea671713b 100755 --- a/src/complementBed/Makefile +++ b/src/complementBed/Makefile @@ -10,14 +10,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= complementMain.cpp complementBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= complementBed @@ -37,7 +37,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/complementBed/complementBed.cpp b/src/complementBed/complementBed.cpp index f0799f200e2ceca470b9b8d9220c1a4d15fc62a3..c6d58be61b888adf686cfaa09d970ff6ec044636 100755 --- a/src/complementBed/complementBed.cpp +++ b/src/complementBed/complementBed.cpp @@ -8,6 +8,7 @@ // Summary: Return the intervals NOT spanned by a BED file. // +#include "lineFileUtilities.h" #include "complementBed.h" //========================== diff --git a/src/complementBed/complementBed.h b/src/complementBed/complementBed.h index 1e9cb7889861fe9c5e0199828981a0855620b849..f34326942d94ec846afe3c78a7d715641e41c665 100755 --- a/src/complementBed/complementBed.h +++ b/src/complementBed/complementBed.h @@ -1,8 +1,6 @@ #include "bedFile.h" #include <vector> #include <algorithm> -#include <map> -#include <list> #include <iostream> #include <fstream> #include <limits.h> @@ -11,12 +9,6 @@ using namespace std; -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ diff --git a/src/coverageBed/Makefile b/src/coverageBed/Makefile old mode 100644 new mode 100755 index 5217675296cfba6265ae5208bfb9a1eaa122e8c8..fd84f6d541eb4ba4d273ccf9cacc521114fa0be9 --- a/src/coverageBed/Makefile +++ b/src/coverageBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= coverageMain.cpp coverageBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= coverageBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index 95eb1a683ca9a6672b8c8270ffe0f33f7b6bb533..a10f2d23cf7ac958731b17c95af41f8bd48bfb7a 100755 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -6,7 +6,7 @@ // Copyright 2009 Aaron Quinlan. All rights reserved. // // - +#include "lineFileUtilities.h" #include "coverageBed.h" @@ -68,7 +68,6 @@ void BedGraph::GraphBed() { } } } - } diff --git a/src/coverageBed/coverageBed.h b/src/coverageBed/coverageBed.h index 8504c337158fab8653b347ab38570ecac3183885..5497ac8e78cae8376a66905eb228a92c9a25db1d 100755 --- a/src/coverageBed/coverageBed.h +++ b/src/coverageBed/coverageBed.h @@ -3,21 +3,12 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> #include <stdlib.h> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ diff --git a/src/fastaFromBed/Makefile b/src/fastaFromBed/Makefile index 5ae30b62424a8ed7833bdda56e162019e2ec0402..b9f32efdfa19edf7a85d585d01ff09837b94a4e3 100755 --- a/src/fastaFromBed/Makefile +++ b/src/fastaFromBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/sequenceUtilities/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/sequenceUtilities/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= fastaFromBedMain.cpp fastaFromBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o sequenceUtils.o +_EXT_OBJECTS=bedFile.o sequenceUtils.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= fastaFromBed @@ -35,9 +35,11 @@ $(BUILT_OBJECTS): $(SOURCES) @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) $(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/sequenceUtilities/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index d630527f41f72e38d2b4d868b344c997770f8b3b..dba99dcc2ac20c1598f3ba5f317e037c13aaf6bf 100755 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -7,7 +7,7 @@ // // Summary: Extract fasta sequences based on BED intervals. // - +#include "lineFileUtilities.h" #include "fastaFromBed.h" Bed2Fa::Bed2Fa(bool &useName, string &dbFile, string &bedFile, string &fastaOutFile) { diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h index 661e436c0181f239f19e5482aca0b250101b6797..0c115fe8c5385fc42cd20f3c8e4da72dac2651b3 100755 --- a/src/fastaFromBed/fastaFromBed.h +++ b/src/fastaFromBed/fastaFromBed.h @@ -1,5 +1,6 @@ #ifndef FASTAFROMBED_H #define FASTAFROMBED_H + #include "bedFile.h" #include "sequenceUtils.h" #include <vector> diff --git a/src/genomeCoverageBed/Makefile b/src/genomeCoverageBed/Makefile index 7e9e794065e9c239958eae222682c3c9859dfb9b..1d150650eea3204610c28536b6a7879188e35dcd 100755 --- a/src/genomeCoverageBed/Makefile +++ b/src/genomeCoverageBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= genomeCoverageMain.cpp genomeCoverageBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= genomeCoverageBed @@ -35,8 +35,9 @@ $(BUILT_OBJECTS): $(SOURCES) @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) $(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 12778bdaed58f8145082cd8b3d8fad25819d5787..8e3d899abb91127ef3f070fae99929778774f612 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -6,7 +6,7 @@ // Copyright 2009 Aaron Quinlan. All rights reserved. // // - +#include "lineFileUtilities.h" #include "genomeCoverageBed.h" /* diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index f68ff5034ee145fd46f88329ec1bad76d47cf827..e1e0571a88eafd85963344befa4584b4243f1b72 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -1,7 +1,5 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> @@ -10,8 +8,6 @@ using namespace std; //*********************************************** // Typedefs //*********************************************** -typedef list<BED> bedList; - typedef map<int, int, less<int> > depthMap; typedef map<string, depthMap, less<string> > chromDepthMap; diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile old mode 100644 new mode 100755 index 75c31350bdd6caa8774ddb6d442a61f80a2737e3..b039979eb7e59bb257913dfc48ff1bddae6a6878 --- a/src/intersectBed/Makefile +++ b/src/intersectBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= intersectMain.cpp intersectBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= intersectBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index 879e82a5f6a43e13de83d75ddfad60319ed4f269..a177b3d86498982124be5a816acd4549f86a0db3 100755 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -7,10 +7,7 @@ // // Summary: Looks for overlaps between features in two BED files. // - -/* - Includes -*/ +#include "lineFileUtilities.h" #include "intersectBed.h" @@ -19,7 +16,7 @@ */ BedIntersect::BedIntersect(string &bedAFile, string &bedBFile, bool &anyHit, bool &writeA, bool &writeB, float &overlapFraction, - bool &noHit, bool &writeCount) { + bool &noHit, bool &writeCount, bool &forceStrand) { this->bedAFile = bedAFile; this->bedBFile = bedBFile; @@ -29,11 +26,13 @@ BedIntersect::BedIntersect(string &bedAFile, string &bedBFile, bool &anyHit, this->writeB = writeB; this->writeCount = writeCount; this->overlapFraction = overlapFraction; + this->forceStrand = forceStrand; this->bedA = new BedFile(bedAFile); this->bedB = new BedFile(bedBFile); } + /* Destructor */ @@ -41,87 +40,6 @@ BedIntersect::~BedIntersect(void) { } - -/* - reportA - - Writes the _original_ BED entry for A. - Works for BED3 - BED6. -*/ -void BedIntersect::reportA(const BED &a) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << a.start << "\t" << a.end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } -} - - -/* - reportAIntersect - - Writes the base-pair _overlap_ for a BED entry in A. - Works for BED3 - BED6. -*/ -void BedIntersect::reportAIntersect(const BED &a, int &start, int &end) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << start << "\t" << end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } - -} - - - -/* - reportB - - Writes the _original_ BED entry for B. - Works for BED3 - BED6. -*/ -void BedIntersect::reportB(const BED &b) { - if (bedB->bedType == 3) { - cout << b.chrom << "\t" << b.start << "\t" << b.end; - } - else if (bedB->bedType == 4) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name; - } - else if (bedB->bedType == 5) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score; - } - else if (bedB->bedType == 6) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score << "\t" << b.strand; - } -} - - - void BedIntersect::FindOverlaps(BED &a, vector<BED> &hits) { // find all of the overlaps between a and B. @@ -131,45 +49,55 @@ void BedIntersect::FindOverlaps(BED &a, vector<BED> &hits) { for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + // if forcing strandedness, move on if the hit + // is not on the same strand as A. + if ((this->forceStrand) && (a.strand != h->strand)) { + continue; // continue force the next iteration of the for loop. + } + int s = max(a.start, h->start); int e = min(a.end, h->end); if (s < e) { - // is there enough overlap (default ~ 1bp) - if ( ((float)(e-s) / (float)(a.end - a.start)) > this->overlapFraction ) { + // is there enough overlap relative to the user's request? + // (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end - a.start)) >= this->overlapFraction ) { + numOverlaps++; + if (!anyHit && !noHit && !writeCount) { if (!writeB) { if (writeA) { - reportA(a); cout << endl; + bedA->reportBed(a); cout << endl; } else { - reportAIntersect(a,s,e); cout << endl; + bedA->reportBedRange(a,s,e); cout << endl; } } else { if (writeA) { - reportA(a); cout << "\t"; - reportB(*h); cout << endl; + bedA->reportBed(a); cout << "\t"; + bedB->reportBed(*h); cout << endl; } else { - reportAIntersect(a,s,e); cout << "\t"; - reportB(*h); cout << endl; + bedA->reportBedRange(a,s,e); cout << "\t"; + bedB->reportBed(*h); cout << endl; } } } + } } } if (anyHit && (numOverlaps >= 1)) { - reportA(a); cout << endl; + bedA->reportBed(a); cout << endl; } else if (writeCount) { - reportA(a); cout << "\t" << numOverlaps << endl; + bedA->reportBed(a); cout << "\t" << numOverlaps << endl; } else if (noHit && (numOverlaps == 0)) { - reportA(a); cout << endl; + bedA->reportBed(a); cout << endl; } } diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index ddf1e95678204732f42d0004ae92ae0dc34e65e5..0b78b849d4ffd0dd53daebea936bafd5b8692b64 100755 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -3,20 +3,11 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ @@ -25,7 +16,7 @@ class BedIntersect { public: // constructor - BedIntersect(string &, string &, bool &, bool &, bool &, float &, bool &, bool &); + BedIntersect(string &, string &, bool &, bool &, bool &, float &, bool &, bool &, bool &); // destructor ~BedIntersect(void); @@ -47,6 +38,7 @@ private: bool writeA; bool writeB; bool writeCount; + bool forceStrand; float overlapFraction; bool noHit; diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 71329254a0e465f73dc87a52530fefa0e74f33f1..17fe0c9ebc78bfb9b5fc37a8f0620d24180fb12a 100755 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -34,6 +34,7 @@ int main(int argc, char* argv[]) { bool writeB = false; bool writeCount = false; bool haveFraction = false; + bool forceStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -88,6 +89,9 @@ int main(int argc, char* argv[]) { else if (PARAMETER_CHECK("-v", 2, parameterLength)) { noHit = true; } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -117,7 +121,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { - BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, overlapFraction, noHit, writeCount); + BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, overlapFraction, noHit, writeCount, forceStrand); bi->IntersectBed(); return 0; } @@ -138,6 +142,7 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "OPTIONS: " << endl; + cerr << "\t" << "-s\t\t\t" << "Force strandedness. Only report hits in B that overlap A on the same strand." << endl << "\t\t\t\tBy default, overlaps are reported without respect to strand." << endl << endl; cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap with B.bed." << endl << "\t\t\t\tIn other words, just report the fact >=1 hit was found." << endl << endl; cerr << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl; cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; diff --git a/src/linksBed/Makefile b/src/linksBed/Makefile index 74856d00a475497483a4d2aff05b0c8ced719e10..2fa4c67158691d52e2cea3280fdd873c6d041963 100755 --- a/src/linksBed/Makefile +++ b/src/linksBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= linksMain.cpp linksBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= linksBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/linksBed/linksBed.cpp b/src/linksBed/linksBed.cpp index b54d9c3abbca6dc04c177640bde557571dd972de..f92b2193b3524dab4f3dc3fa1fbed613817e2d78 100755 --- a/src/linksBed/linksBed.cpp +++ b/src/linksBed/linksBed.cpp @@ -7,7 +7,7 @@ // // Summary: Creates HTML or UCSC browser coordinates from a BED file. // - +#include "lineFileUtilities.h" #include "linksBed.h" // diff --git a/src/linksBed/linksBed.h b/src/linksBed/linksBed.h index f104a999a6f90df62c42ab7b2f06b6b8c04cf177..a5ff327b5b117a5076241c2b54ddf57ee28d9787 100755 --- a/src/linksBed/linksBed.h +++ b/src/linksBed/linksBed.h @@ -1,20 +1,11 @@ #include "bedFile.h" #include <vector> #include <algorithm> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ diff --git a/src/mergeBed/Makefile b/src/mergeBed/Makefile index a316933dc4fe419a99e094b1b7951b23c4da756e..0bffbdf01820d22c0506da89ebd8f6bd2011a474 100755 --- a/src/mergeBed/Makefile +++ b/src/mergeBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= mergeMain.cpp mergeBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= mergeBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 51d27a1cc36ec9f127cfb7aae27a415c902a5a6f..d88e2725770933e2a28b0f0702333ec5a91c3882 100755 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -7,37 +7,34 @@ // // Summary: Combines overlapping BED entries into a single entry. // - - +#include "lineFileUtilities.h" #include "mergeBed.h" -//:::::::::::::::::::::::::::::::::: -// Constructor -//:::::::::::::::::::::::::::::::::: - -BedMerge::BedMerge(string &bedFile, bool &numEntries, int &maxDistance) { +// =============== +// = Constructor = +// =============== +BedMerge::BedMerge(string &bedFile, bool &numEntries, int &maxDistance, bool &forceStrand) { this->bedFile = bedFile; this->numEntries = numEntries; this->maxDistance = -1 * maxDistance; + this->forceStrand = forceStrand; + this->bed = new BedFile(bedFile); } -//:::::::::::::::::::::::::::::::::: -// Destructor -//:::::::::::::::::::::::::::::::::: - +// ================= +// = Destructor = +// ================= BedMerge::~BedMerge(void) { } - -//:::::::::::::::::::::::::::::::::::::::::::::::::: -// Merge overlapping BED entries into a single entry -//:::::::::::::::::::::::::::::::::::::::::::::::::: - +// ===================================================== +// = Merge overlapping BED entries into a single entry = +// ===================================================== void BedMerge::MergeBed() { // load the "B" bed file into a map so @@ -50,58 +47,59 @@ void BedMerge::MergeBed() { // bedList is already sorted by start position. vector<BED> bedList = m->second; - int minStart = 999999999; + int minStart = INT_MAX; int maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. - unsigned int i; + unsigned int prev = 0; + unsigned int curr = 0; int mergeCount = 1; + // loop through the BED entries for this chromosome // and look for overlaps - for (i = 1; i < bedList.size(); ++i) { + for (curr = 0; curr < bedList.size(); ++curr) { // Is there an overlap between the current and previous entries? - if ( overlaps(bedList[i-1].start, bedList[i-1].end, - bedList[i].start, bedList[i].end) >= this->maxDistance) { + if ( overlaps(bedList[prev].start, bedList[prev].end, + bedList[curr].start, bedList[curr].end) >= this->maxDistance) { OIP = true; mergeCount++; - minStart = min(bedList[i-1].start, min(minStart, bedList[i].start)); - maxEnd = max(bedList[i-1].end, max(maxEnd, bedList[i].end)); + minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); + maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); } else if ( overlaps(minStart, maxEnd, - bedList[i].start, bedList[i].end) >= this->maxDistance) { + bedList[curr].start, bedList[curr].end) >= this->maxDistance) { mergeCount++; - minStart = min(minStart, bedList[i].start); - maxEnd = max(maxEnd, bedList[i].end); - + minStart = min(minStart, bedList[curr].start); + maxEnd = max(maxEnd, bedList[curr].end); } else { // was there an overlap befor the current entry broke it? if (OIP) { if (this->numEntries) { - cout << bedList[i-1].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; } else { - cout << bedList[i-1].chrom << "\t" << minStart << "\t" << maxEnd << endl; + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; } } else { if (this->numEntries) { - cout << bedList[i-1].chrom << "\t" << bedList[i-1].start << "\t" << bedList[i-1].end << "\t" << mergeCount << endl; + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; } else { - cout << bedList[i-1].chrom << "\t" << bedList[i-1].start << "\t" << bedList[i-1].end << endl; + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; } } // reset things for the next overlapping "block" OIP = false; - mergeCount = 1; - minStart = 999999999; + mergeCount = 1; + minStart = INT_MAX; maxEnd = 0; } @@ -110,20 +108,140 @@ void BedMerge::MergeBed() { // clean up based on the last entry for the current chromosome if (OIP) { if (this->numEntries) { - cout << bedList[i-1].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl; } else { - cout << bedList[i-1].chrom << "\t" << minStart << "\t" << maxEnd << endl; + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl; } } else { if (this->numEntries) { - cout << bedList[i-1].chrom << "\t" << bedList[i-1].start << "\t" << bedList[i-1].end << "\t" << mergeCount << endl; + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl; } else { - cout << bedList[i-1].chrom << "\t" << bedList[i-1].start << "\t" << bedList[i-1].end << endl; + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl; } } + prev = curr; } } + +// ================================================================================== +// = Merge overlapping BED entries into a single entry, accounting for strandedness = +// ================================================================================== +void BedMerge::MergeBedStranded() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // make a list of the two strands to merge separately. + vector<string> strands(2); + strands[0] = "+"; + strands[1] = "-"; + + // do two passes, one for each strand. + for (unsigned int s = 0; s < strands.size(); s++) { + + int minStart = INT_MAX; + int maxEnd = 0; + bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. + int prev = -1; + unsigned int curr = 0; + int mergeCount = 1; + int numOnStrand = 0; + + // loop through the BED entries for this chromosome + // and look for overlaps + for (curr = 0; curr < bedList.size(); ++curr) { + + // if forcing strandedness, move on if the hit + // is not on the current strand. + if (bedList[curr].strand != strands[s]) { + continue; // continue force the next iteration of the for loop. + } + else { + numOnStrand++; + } + + // make sure prev points to an actual element on the + // current strand + if (prev < 0) { + if (bedList[curr].strand == strands[s]) { + prev = curr; + } + continue; + } + + if ( overlaps(bedList[prev].start, bedList[prev].end, + bedList[curr].start, bedList[curr].end) >= this->maxDistance) { + + OIP = true; + mergeCount++; + minStart = min(bedList[prev].start, min(minStart, bedList[curr].start)); + maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end)); + + } + else if ( overlaps(minStart, maxEnd, + bedList[curr].start, bedList[curr].end) >= this->maxDistance) { + + mergeCount++; + minStart = min(minStart, bedList[curr].start); + maxEnd = max(maxEnd, bedList[curr].end); + } + else { + + // was there an overlap befor the current entry broke it? + if (OIP) { + if (this->numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; + } + } + else { + if ((this->numEntries) && (numOnStrand > 0)) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if (numOnStrand > 0) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; + } + } + + // reset things for the next overlapping "block" + OIP = false; + mergeCount = 1; + minStart = INT_MAX; + maxEnd = 0; + } + } + + // clean up based on the last entry for the current chromosome + if (OIP) { + if (this->numEntries) { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else { + cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl; + } + } + else { + if ((this->numEntries) && (numOnStrand > 0)) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl; + } + else if (numOnStrand > 0) { + cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl; + } + } + prev = curr; + } + } +} diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h index 159b914131905697a690a1e13da7d9a9b701d2bc..f93f04f3e03380fe75c151634437c7c3a27a02bf 100755 --- a/src/mergeBed/mergeBed.h +++ b/src/mergeBed/mergeBed.h @@ -1,20 +1,13 @@ #include "bedFile.h" #include <vector> #include <algorithm> -#include <map> -#include <list> #include <iostream> #include <fstream> +#include <limits.h> +#include <stdlib.h> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ @@ -23,17 +16,19 @@ class BedMerge { public: // constructor - BedMerge(string &, bool &, int&); + BedMerge(string &, bool &, int&, bool &); // destructor ~BedMerge(void); void MergeBed(); + void MergeBedStranded(); private: string bedFile; bool numEntries; + bool forceStrand; int maxDistance; // instance of a bed file class. BedFile *bed; diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp index 62cbc10caa23f38d4dc7ef1aba0432947c0ec4d3..19df09f55d3ecb716a12d52947c5b8d932a8d929 100755 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeBed/mergeMain.cpp @@ -28,6 +28,7 @@ int main(int argc, char* argv[]) { bool numEntries = false; bool haveMaxDistance = false; bool haveBed = false; + bool forceStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -55,13 +56,15 @@ int main(int argc, char* argv[]) { } else if(PARAMETER_CHECK("-n", 2, parameterLength)) { numEntries = true; - i++; } else if(PARAMETER_CHECK("-d", 2, parameterLength)) { haveMaxDistance = true; maxDistance = atoi(argv[i + 1]); i++; } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -75,8 +78,14 @@ int main(int argc, char* argv[]) { } if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance); - bm->MergeBed(); + BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance, forceStrand); + + if (!forceStrand) { + bm->MergeBed(); + } + else { + bm->MergeBedStranded(); + } return 0; } else { @@ -96,8 +105,9 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl; cerr << "OPTIONS: " << endl; + cerr << "\t" << "-s\t\t\t" << "Force strandedness. Only report hits in B that overlap A on the same strand." << endl << "\t\t\t\tBy default, overlaps are reported without respect to strand." << endl << endl; cerr << "\t" << "-n\t\t\t" << "Report the number of BED entries that were merged. (=1 if no merging occured)" << endl << endl; - cerr << "\t" << "-d\t\t\t" << "Maximum distance between features allowed for features to be merged. (Default=0)" << endl << endl; + cerr << "\t" << "-d\t\t\t" << "Maximum distance between features allowed for features to be merged. (Default=0)" << endl << endl; cerr << "NOTES: " << endl; cerr << "\t" << "-i stdin\t\t" << "Allows BED file A to be read from stdin. E.g.: cat a.bed | mergeBed -a stdin -b B.bed" << endl << endl; diff --git a/src/peIntersectBed/Makefile b/src/peIntersectBed/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..9dc7bc5d8bd642691929b9419dd26ef553aa40db --- /dev/null +++ b/src/peIntersectBed/Makefile @@ -0,0 +1,46 @@ +CXX = g++ +CXXFLAGS = -O3 -Wall +LDFLAGS = + +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFilePE/ -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= peIntersectMain.cpp peIntersectBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFilePE.o bedFile.o lineFileUtilities.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= peIntersectBed +LIBS= + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFilePE/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/peIntersectBed/peIntersectBed.cpp b/src/peIntersectBed/peIntersectBed.cpp new file mode 100755 index 0000000000000000000000000000000000000000..e6bc0850c9be630568b873fff72c565234057a6e --- /dev/null +++ b/src/peIntersectBed/peIntersectBed.cpp @@ -0,0 +1,262 @@ +// +// peIntersectBed.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Looks for overlaps between paired-end reads / SVs (BEDPE format) and a BED file. +// +#include "lineFileUtilities.h" +#include "peIntersectBed.h" + + +/* + Constructor +*/ + +BedIntersectPE::BedIntersectPE(string &bedAFilePE, string &bedBFile, float &overlapFraction, + string &searchType) { + + this->bedAFilePE = bedAFilePE; + this->bedBFile = bedBFile; + this->overlapFraction = overlapFraction; + this->searchType = searchType; + + this->bedA = new BedFilePE(bedAFilePE); + this->bedB = new BedFile(bedBFile); +} + + +/* + Destructor +*/ + +BedIntersectPE::~BedIntersectPE(void) { +} + + + +void BedIntersectPE::FindOverlaps(BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, string &type) { + + // list of hits on each end of BEDPE + // that exceed the requested overlap fraction + vector<BED> qualityHits1; + vector<BED> qualityHits2; + + // count of hits on each end of BEDPE + // that exceed the requested overlap fraction + int numOverlapsEnd1 = 0; + int numOverlapsEnd2 = 0; + + + /* + Find the quality hits between ***end1*** of the BEDPE and the B BED file + */ + bedB->binKeeperFind(bedB->bedMap[a.chrom1], a.start1, a.end1, hits1); + + for (vector<BED>::iterator h = hits1.begin(); h != hits1.end(); ++h) { + + int s = max(a.start1, h->start); + int e = min(a.end1, h->end); + + if (s < e) { + + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end1 - a.start1)) >= this->overlapFraction ) { + numOverlapsEnd1++; + + if (type == "either") { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*h); cout << endl; + } + else { + qualityHits1.push_back(*h); + } + } + } + } + + + /* + Now find the quality hits between ***end2*** of the BEDPE and the B BED file + */ + bedB->binKeeperFind(bedB->bedMap[a.chrom2], a.start2, a.end2, hits2); + + for (vector<BED>::iterator h = hits2.begin(); h != hits2.end(); ++h) { + + int s = max(a.start2, h->start); + int e = min(a.end2, h->end); + + if (s < e) { + + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.end2 - a.start2)) >= this->overlapFraction ) { + numOverlapsEnd2++; + + if (type == "either") { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*h); cout << endl; + } + else { + qualityHits2.push_back(*h); + } + } + } + } + + + /* + Now report the hits depending on what the user has requested. + */ + if (type == "neither") { + if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { + bedA->reportBedPE(a); cout << endl; + } + } + else if (type == "xor") { + if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { + for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*q); cout << endl; + } + } + else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { + for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*q); cout << endl; + } + } + } + else if (type == "both") { + if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) { + for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*q); cout << endl; + } + for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*q); cout << endl; + } + } + } +} + + + + + +void BedIntersectPE::FindSpanningOverlaps(BEDPE &a, vector<BED> &hits, string &type) { + + // count of hits on _between_ end of BEDPE + // that exceed the requested overlap fraction + int numOverlaps = 0; + + + /* + Find the hits between end1 and start2 of the BEDPE and the B BED file + + In other words, find the hits between the "span" of the pair + */ + bedB->binKeeperFind(bedB->bedMap[a.chrom1], a.end1, a.start2, hits); + + for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + int s = max(a.end1, h->start); + int e = min(a.start2, h->end); + + if (s < e) { + + // is there enough overlap (default ~ 1bp) + if ( ((float)(e-s) / (float)(a.start2 - a.end1)) >= this->overlapFraction ) { + numOverlaps++; + + bedA->reportBedPE(a); cout << "\t"; + bedB->reportBed(*h); cout << endl; + } + } + } +} + + + + +void BedIntersectPE::IntersectBedPE() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bedB->loadBedFileIntoMap(); + + string bedLine; + int lineNum = 0; + + // are we dealing with a file? + if (bedA->bedFile != "stdin") { + + // open the BEDPE file for reading + ifstream bed(bedA->bedFile.c_str(), ios::in); + if ( !bed ) { + cerr << "Error: The requested bed file (" <<bedA->bedFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + BEDPE a; + // process each entry in A + while (getline(bed, bedLine)) { + + // split the current line into ditinct fields + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + + // find the overlaps with B if it's a valid BED entry. + if (bedA->parseBedPELine(a, bedFields, lineNum)) { + + if (this->searchType == "span") { + vector<BED> hits; + if (a.chrom1 == a.chrom2) { + FindSpanningOverlaps(a, hits, this->searchType); + } + } + else { + vector<BED> hits1, hits2; + FindOverlaps(a, hits1, hits2, this->searchType); + } + + } + } + } + // "A" is being passed via STDIN. + else { + + BEDPE a; + // process each entry in A + while (getline(cin, bedLine)) { + + // split the current line into ditinct fields + vector<string> bedFields; + Tokenize(bedLine,bedFields); + + lineNum++; + + // find the overlaps with B if it's a valid BED entry. + if (bedA->parseBedPELine(a, bedFields, lineNum)) { + + if (this->searchType == "span") { + vector<BED> hits; + if (a.chrom1 == a.chrom2) { + FindSpanningOverlaps(a, hits, this->searchType); + } + } + else { + vector<BED> hits1, hits2; + FindOverlaps(a, hits1, hits2, this->searchType); + } + } + } + } +} +// END IntersectPE + + diff --git a/src/peIntersectBed/peIntersectBed.h b/src/peIntersectBed/peIntersectBed.h new file mode 100755 index 0000000000000000000000000000000000000000..62054733d3b85ff05eec268be10c166c2686be2b --- /dev/null +++ b/src/peIntersectBed/peIntersectBed.h @@ -0,0 +1,46 @@ +#ifndef INTERSECTBED_H +#define INTERSECTBED_H + +#include "bedFile.h" +#include "bedFilePE.h" +#include <vector> +#include <iostream> +#include <fstream> + +using namespace std; + +//************************************************ +// Class methods and elements +//************************************************ +class BedIntersectPE { + +public: + + // constructor + BedIntersectPE(string &, string &, float &, string &); + + // destructor + ~BedIntersectPE(void); + + void FindOverlaps(BEDPE &, vector<BED> &, vector<BED> &, string &); + void FindSpanningOverlaps(BEDPE &, vector<BED> &, string &); + + void IntersectBedPE (); + + +private: + + string bedAFilePE; + string bedBFile; + + float overlapFraction; + string searchType; + + // instance of a paired-end bed file class. + BedFilePE *bedA; + + // instance of a bed file class. + BedFile *bedB; +}; + +#endif /* PEINTERSECTBED_H */ diff --git a/src/peIntersectBed/peIntersectMain.cpp b/src/peIntersectBed/peIntersectMain.cpp new file mode 100755 index 0000000000000000000000000000000000000000..0e0b16b820a68e8c322249694e00f59258746b3b --- /dev/null +++ b/src/peIntersectBed/peIntersectMain.cpp @@ -0,0 +1,170 @@ +#include "peIntersectBed.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "peIntersectBed" + +// define the version +#define VERSION "1.2.0" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + string searchType = "either"; + + bool haveBedA = false; + bool haveBedB = false; + bool haveSearchType = false; + bool haveFraction = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-h", 2, parameterLength) || + PARAMETER_CHECK("--help", 5, parameterLength)) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + } + i++; + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + } + i++; + } + else if(PARAMETER_CHECK("-type", 5, parameterLength)) { + if ((i+1) < argc) { + haveSearchType = true; + searchType = argv[i + 1]; + } + i++; + } + //else if(PARAMETER_CHECK("-u", 2, parameterLength)) { + // anyHit = true; + //} + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + //else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { + // writeA = true; + //} + //else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { + // writeB = true; + //} + //else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + // writeCount = true; + //} + //else if (PARAMETER_CHECK("-v", 2, parameterLength)) { + // noHit = true; + //} + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (haveSearchType && (searchType != "either") && (searchType != "neither") && (searchType != "both") && (searchType != "xor") && (searchType != "span")) { + cerr << endl << "*****" << endl << "*****ERROR: Request \"either\" or \"both\" or \"neither\" or \"xor\" or \"span\"" << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedIntersectPE *bi = new BedIntersectPE(bedAFile, bedBFile, overlapFraction, searchType); + bi->IntersectBedPE(); + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << "===============================================" << endl; + cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; + cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; + cerr << " Hall Laboratory, University of Virginia" << endl; + cerr << "===============================================" << endl << endl; + cerr << "Description: Report overlaps between a.bed and b.bed." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; + + cerr << "OPTIONS: " << endl; + cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap with B.bed." << endl << "\t\t\t\tIn other words, just report the fact >=1 hit was found." << endl << endl; + cerr << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B while restricting to -f and -type." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl; + cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; + cerr << "\t" << "-type \t\t\t" << "either (default)\tReport overlaps if _either_ end of BEDPE (A) overlaps BED B." << endl; + cerr << "\t\t\t\tneither\t\t\tReport overlaps if _neither_ end of BEDPE (A) overlaps BED B." << endl; + cerr << "\t\t\t\tboth\t\t\tReport overlaps if _both_ ends of BEDPE (A) overlap BED B." << endl; + cerr << "\t\t\t\txor\t\t\tReport overlaps if _one and only one_ end of BEDPE (A) overlap BED B." << endl; + cerr << "\t\t\t\tspan\t\t\tReport overlaps start1:end2 of BEDPE (A) overlaps BED B." << endl; + cerr << "\t\t\t\t\t\t\tNOTE: Will only report overlaps for BEDPE entries with chrom1 = chrom2" << endl << endl; + + + cerr << "NOTES: " << endl; + cerr << "\t" << "-i stdin\t\t" << "Allows BEDPE file A to be read from stdin. E.g.: cat a.bedpe | peIntersectBed -a stdin -b B.bed" << endl << endl; + cerr << "\t***Only BEDPE formats allowed (6,7,8, or 10 column format. See below) for -a.***"<< endl << endl; + cerr << "\t***Only BED3 - BED6 formats allowed for -b.***"<< endl << endl; + + cerr << "BEDPE FORMAT (tab-delimited): " << endl; + cerr << "\t" << "1. chrom for end 1 (req'd)" << endl; + cerr << "\t" << "2. start for end 1 (req'd)" << endl; + cerr << "\t" << "3. end for end 1 (req'd)" << endl; + cerr << "\t" << "4. chrom for end 2 (req'd)" << endl; + cerr << "\t" << "5. start for end 2 (req'd)" << endl; + cerr << "\t" << "6. end for end 2 (req'd)" << endl; + cerr << "\t" << "7. name (opt.)" << endl; + cerr << "\t" << "8. score (opt.)" << endl; + cerr << "\t" << "9. strand for end 1 (opt.)" << endl; + cerr << "\t" << "10. strand for end 2 (opt.)" << endl; + cerr << "\t" << "Note: Strands for each end must be provided if you choose to include strand information." << endl << endl; + + cerr << "Example BEDPE record:" << endl; + cerr << "chr1\t1000000\t1000100\tchr1\t1001000\t1001100\tsomename\t100\t+\t-" << endl; + + // end the program here + exit(1); + +} diff --git a/src/sortBed/Makefile b/src/sortBed/Makefile index 8cbea605eb4595b4a00f0f43b7580445e8e1d576..fefca26e16874c644dcae04029160713834531c7 100755 --- a/src/sortBed/Makefile +++ b/src/sortBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= sortMain.cpp sortBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= sortBed @@ -36,6 +36,7 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ clean: @echo "Cleaning up." diff --git a/src/sortBed/sortBed.cpp b/src/sortBed/sortBed.cpp index 13551e0f2ca38b5cc2ea5024f1626b0dd7fcaf71..bacc744e6d3ea55f3ab129ee1688bede71fbaad0 100755 --- a/src/sortBed/sortBed.cpp +++ b/src/sortBed/sortBed.cpp @@ -7,7 +7,7 @@ // // Summary: Sorts a BED file in ascending order by chrom then by start position. // - +#include "lineFileUtilities.h" #include "sortBed.h" // @@ -25,31 +25,6 @@ BedSort::~BedSort(void) { } -/* - reportBed - - Writes the _original_ BED entry for A. - Works for BED3 - BED6. -*/ -void BedSort::reportBed(const BED &a) { - - if (bed->bedType == 3) { - cout << a.chrom << "\t" << a.start << "\t" << a.end; - } - else if (bed->bedType == 4) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name; - } - else if (bed->bedType == 5) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score; - } - else if (bed->bedType == 6) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } -} - void BedSort::SortBed() { // load the "B" bed file into a map so @@ -63,8 +38,7 @@ void BedSort::SortBed() { vector<BED> bedList = m->second; for (unsigned int i = 0; i < bedList.size(); ++i) { - reportBed(bedList[i]); cout << "\n"; - ///cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + bed->reportBed(bedList[i]); cout << "\n"; } } } @@ -96,8 +70,7 @@ void BedSort::SortBedBySizeAsc() { // report the entries in ascending order for (unsigned int i = 0; i < masterList.size(); ++i) { - reportBed(masterList[i]); cout << "\n"; - //cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl; + bed->reportBed(masterList[i]); cout << "\n"; } } @@ -128,8 +101,7 @@ void BedSort::SortBedBySizeDesc() { // report the entries in ascending order for (unsigned int i = 0; i < masterList.size(); ++i) { - reportBed(masterList[i]); cout << "\n"; - //cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl; + bed->reportBed(masterList[i]); cout << "\n"; } } @@ -147,8 +119,7 @@ void BedSort::SortBedByChromThenSizeAsc() { sort(bedList.begin(), bedList.end(), sortBySizeAsc); for (unsigned int i = 0; i < bedList.size(); ++i) { - reportBed(bedList[i]); cout << "\n"; - //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + bed->reportBed(bedList[i]); cout << "\n"; } } } @@ -169,8 +140,7 @@ void BedSort::SortBedByChromThenSizeDesc() { sort(bedList.begin(), bedList.end(), sortBySizeDesc); for (unsigned int i = 0; i < bedList.size(); ++i) { - reportBed(bedList[i]); cout << "\n"; - //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + bed->reportBed(bedList[i]); cout << "\n"; } } } @@ -191,8 +161,7 @@ void BedSort::SortBedByChromThenScoreAsc() { sort(bedList.begin(), bedList.end(), sortByScoreAsc); for (unsigned int i = 0; i < bedList.size(); ++i) { - reportBed(bedList[i]); cout << "\n"; - //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + bed->reportBed(bedList[i]); cout << "\n"; } } } @@ -218,8 +187,7 @@ void BedSort::SortBedByChromThenScoreDesc() { sort(bedList.begin(), bedList.end(), sortByScoreDesc); for (unsigned int i = 0; i < bedList.size(); ++i) { - reportBed(bedList[i]); cout << "\n"; - //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + bed->reportBed(bedList[i]); cout << "\n"; } } } diff --git a/src/sortBed/sortBed.h b/src/sortBed/sortBed.h index 7f3fcb3b1e168837aebe5329720260794ba7d703..60c6067cfa3ae52ffec723729af5f00a45cf98de 100755 --- a/src/sortBed/sortBed.h +++ b/src/sortBed/sortBed.h @@ -1,20 +1,12 @@ #include "bedFile.h" #include <vector> #include <algorithm> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ diff --git a/src/subtractBed/Makefile b/src/subtractBed/Makefile old mode 100644 new mode 100755 index ae38802f94c6d0cfb9a18d23fc9b82c0f618f046..42e8bd6d3e9dd79b410152bb6fd27bc94011abc9 --- a/src/subtractBed/Makefile +++ b/src/subtractBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= subtractMain.cpp subtractBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= subtractBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp index cb9565d4cf5e2335bd565ce3036e692eba4fca1d..44624a5b8ae9c0691c01a549ef8a5799e3632808 100755 --- a/src/subtractBed/subtractBed.cpp +++ b/src/subtractBed/subtractBed.cpp @@ -7,26 +7,25 @@ // // Summary: Removes overlapping segments from a BED entry. // - -/* - Includes -*/ +#include "lineFileUtilities.h" #include "subtractBed.h" /* Constructor */ -BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction) { +BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand) { this->bedAFile = bedAFile; this->bedBFile = bedBFile; this->overlapFraction = overlapFraction; + this->forceStrand = forceStrand; this->bedA = new BedFile(bedAFile); this->bedB = new BedFile(bedBFile); } + /* Destructor */ @@ -35,63 +34,6 @@ BedSubtract::~BedSubtract(void) { -/* - reportA - - Writes the _original_ BED entry for A. - Works for BED3 - BED6. -*/ -void BedSubtract::reportA(BED &a) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << a.start << "\t" << a.end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } -} - - -/* - reportARemainder - - Writes the base-pair remainder for a BED entry in A after an overlap - with B has been subtracted. - - Works for BED3 - BED6. -*/ -void BedSubtract::reportARemainder(BED &a, int &start, int &end) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << start << "\t" << end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << start << "\t" << end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } - -} - - - - void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { // find all of the overlaps between a and B. @@ -102,6 +44,12 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { int numConsumedByB = 0; for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + + // if forcing strandedness, move on if the hit + // is not on the same strand as A. + if ((this->forceStrand) && (a.strand != h->strand)) { + continue; // continue force the next iteration of the for loop. + } int s = max(a.start, h->start); int e = min(a.end, h->end); @@ -119,9 +67,16 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { // report the subtraction with each entry in B // if A was not "consumed" by any entry in B if (numConsumedByB == 0) { + int numOverlaps = 0; for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + // if forcing strandedness, move on if the hit + // is not on the same strand as A. + if ((this->forceStrand) && (a.strand != h->strand)) { + continue; // continue force the next iteration of the for loop. + } + int s = max(a.start, h->start); int e = min(a.end, h->end); @@ -132,7 +87,7 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { // is there enough overlap (default ~ 1bp) float overlap = ((float)(e-s) / (float)(a.end - a.start)); - if ( overlap > this->overlapFraction ) { + if ( overlap >= this->overlapFraction ) { if (overlap < 1.0) { // if overlap = 1, then B consumes A and therefore, // we won't report A. @@ -140,26 +95,26 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { // B ---- // Res. ==== ==== if ( (h->start > a.start) && (h->end < a.end) ) { - reportARemainder(a,a.start,h->start); cout << "\n"; - reportARemainder(a,h->end,a.end); cout << "\n"; + bedA->reportBedRange(a,a.start,h->start); cout << "\n"; + bedA->reportBedRange(a,h->end,a.end); cout << "\n"; } // A ++++++++++++ // B ---------- // Res. == else if (h->start == a.start) { - reportARemainder(a,h->end,a.end); cout << "\n"; + bedA->reportBedRange(a,h->end,a.end); cout << "\n"; } // A ++++++++++++ // B ---------- // Res. ==== else if (h->start < a.start) { - reportARemainder(a,h->end,a.end); cout << "\n"; + bedA->reportBedRange(a,h->end,a.end); cout << "\n"; } // A ++++++++++++ // B ---------- // Res. ======= else if (h->start > a.start) { - reportARemainder(a,a.start,h->start); cout << "\n"; + bedA->reportBedRange(a,a.start,h->start); cout << "\n"; } } } @@ -168,7 +123,7 @@ void BedSubtract::FindOverlaps(BED &a, vector<BED> &hits) { } if (numOverlaps == 0) { // no overlap found, so just report A as-is. - reportA(a); cout << "\n"; + bedA->reportBed(a); cout << "\n"; } } } diff --git a/src/subtractBed/subtractBed.h b/src/subtractBed/subtractBed.h index f46c80226bd96cdc81049bd2629e8d1d16b5ee2c..a7eb54ebccdfcbb4f9df5b11034de51fe9c92570 100755 --- a/src/subtractBed/subtractBed.h +++ b/src/subtractBed/subtractBed.h @@ -3,20 +3,11 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ @@ -25,7 +16,7 @@ class BedSubtract { public: // constructor - BedSubtract(string &, string &, float &); + BedSubtract(string &, string &, float &, bool &); // destructor ~BedSubtract(void); @@ -43,7 +34,8 @@ private: string bedBFile; float overlapFraction; bool noHit; - + bool forceStrand; + // instance of a bed file class. BedFile *bedA, *bedB; diff --git a/src/subtractBed/subtractMain.cpp b/src/subtractBed/subtractMain.cpp index 1c356622a7eb2b2a11a5e6b5c7af093afed3c02d..8121108cb71089b80923f1dfbb35ddf722a6f8a9 100755 --- a/src/subtractBed/subtractMain.cpp +++ b/src/subtractBed/subtractMain.cpp @@ -29,6 +29,7 @@ int main(int argc, char* argv[]) { bool haveBedA = false; bool haveBedB = false; bool haveFraction = false; + bool forceStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -68,6 +69,9 @@ int main(int argc, char* argv[]) { overlapFraction = atof(argv[i + 1]); i++; } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -82,7 +86,7 @@ int main(int argc, char* argv[]) { if (!showHelp) { - BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction); + BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, overlapFraction, forceStrand); bs->SubtractBed(); return 0; } @@ -103,6 +107,7 @@ void ShowHelp(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "OPTIONS: " << endl; + cerr << "\t" << "-s\t\t\t" << "Force strandedness. Only report hits in B that overlap A on the same strand." << endl << "\t\t\t\tBy default, overlaps are reported without respect to strand." << endl << endl; cerr << "\t" << "-f (e.g. 0.05)\t\t" << "Minimum overlap req'd as fraction of a.bed." << endl << "\t\t\t\tDefault is 1E-9 (effectively 1bp)." << endl << endl; cerr << "NOTES: " << endl; diff --git a/src/utils/bedFile/Makefile b/src/utils/bedFile/Makefile index f870ded74246524b769356cac3a62b180714657a..900d6a422ae15f70bcdda37efb5857199ef24e11 100755 --- a/src/utils/bedFile/Makefile +++ b/src/utils/bedFile/Makefile @@ -1,10 +1,32 @@ CXX = g++ -c CXXFLAGS = -O3 -Wall LDFLAGS = -OBJS = ../../../obj/ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ -bedFile: bedFile.cpp bedFile.h - $(CXX) $(CXXFLAGS) $(LDFLAGS) bedFile.cpp -o $(OBJS)bedFile.o +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= bedFile.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=lineFileUtilities.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) clean: - rm bedFile.o + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 19929909fc56f4e72d750c616359ad3d98f4cfc0..fbdf44f38ba03c0daaa8bdb57e9620a2d11aa82e 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -11,7 +11,7 @@ // BED processing code. I am grateful for his elegant // genome binning algorithm and therefore use it extensively. - +#include "lineFileUtilities.h" #include "bedFile.h" static int binOffsetsExtended[] = @@ -20,27 +20,6 @@ static int binOffsetsExtended[] = #define _binFirstShift 17/* How much to shift to get to finest bin. */ #define _binNextShift 3/* How much to shift to get to next larger bin. */ -//*********************************************** -// Common functions -//*********************************************** - -void Tokenize(const string& str, vector<string>& tokens) -{ - // Skip delimiters at beginning. - string::size_type lastPos = str.find_first_not_of("\t", 0); - // Find first "non-delimiter". - string::size_type pos = str.find_first_of("\t", lastPos); - - while (string::npos != pos || string::npos != lastPos) - { - // Found a token, add it to the vector. - tokens.push_back(str.substr(lastPos, pos - lastPos)); - // Skip delimiters. Note the "not_of" - lastPos = str.find_first_not_of("\t", pos); - // Find next "non-delimiter" - pos = str.find_first_of("\t", lastPos); - } -} int overlaps(const int aS, const int aE, const int bS, const int bE) { return min(aE, bE) - max(aS, bS); @@ -214,27 +193,27 @@ void BedFile::countHits(map<int, vector<BED>, std::less<int> > &bk, const int st } - - // Constructor BedFile::BedFile(string &bedFile) { this->bedFile = bedFile; } -// Destructorc +// Destructor BedFile::~BedFile(void) { } bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const int &lineNum) { - if ((lineNum == 1) && (lineVector.size() >= 3)) { - this->bedType = lineVector.size(); + if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { if (this->bedType == 3) { bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()); bed.end = atoi(lineVector[2].c_str()); + bed.name = ""; + bed.score = 0; + bed.strand = "+"; return true; } else if (this->bedType == 4) { @@ -242,6 +221,8 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in bed.start = atoi(lineVector[1].c_str()); bed.end = atoi(lineVector[2].c_str()); bed.name = lineVector[3]; + bed.score = 0; + bed.strand = "+"; return true; } else if (this->bedType ==5) { @@ -250,6 +231,7 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in bed.end = atoi(lineVector[2].c_str()); bed.name = lineVector[3]; bed.score = atoi(lineVector[4].c_str()); + bed.strand = "+"; return true; } else if (this->bedType == 6) { @@ -261,8 +243,22 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in bed.strand = lineVector[5]; return true; } + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start > bed.end) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than End. Ignoring it and moving on." << endl; + return false; + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + return false; + } } - else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + else if ((lineNum == 1) && (lineVector.size() >= 3)) { + this->bedType = lineVector.size(); if (this->bedType == 3) { bed.chrom = lineVector[0]; @@ -294,7 +290,11 @@ bool BedFile::parseBedLine (BED &bed, const vector<string> &lineVector, const in bed.strand = lineVector[5]; return true; } - + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BED file has 3,4,5 or 6 fields. Exiting..." << endl; + exit(1); + } + if (bed.start > bed.end) { cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than End. Ignoring it and moving on." << endl; return false; @@ -333,12 +333,12 @@ void BedFile::loadBedFileIntoMap() { BED bedEntry; int lineNum = 0; - //while (bed >> bedEntry.chrom >> bedEntry.start >> bedEntry.end) { + vector<string> bedFields; // vector of strings for each column in BED file. + bedFields.reserve(6); // reserve space for worst case (BED 6) + while (getline(bed, bedLine)) { - vector<string> bedFields; Tokenize(bedLine,bedFields); - lineNum++; if (parseBedLine(bedEntry, bedFields, lineNum)) { @@ -346,12 +346,18 @@ void BedFile::loadBedFileIntoMap() { bedEntry.count = 0; this->bedMap[bedEntry.chrom][bin].push_back(bedEntry); } + bedFields.clear(); } } void BedFile::loadBedFileIntoMapNoBin() { - // Are we dealing with a BED file or a BED passed via stdin? + string bedLine; + BED bedEntry; + int lineNum = 0; + + vector<string> bedFields; // vector of strings for each column in BED file. + bedFields.reserve(6); // reserve space for worst case (BED 6) // Case 1: Proper BED File. if ( (this->bedFile != "") && (this->bedFile != "stdin") ) { @@ -363,13 +369,8 @@ void BedFile::loadBedFileIntoMapNoBin() { exit (1); } - string bedLine; - BED bedEntry; - int lineNum = 0; - while (getline(bed, bedLine)) { - vector<string> bedFields; Tokenize(bedLine,bedFields); lineNum++; @@ -378,14 +379,12 @@ void BedFile::loadBedFileIntoMapNoBin() { bedEntry.count = 0; this->bedMapNoBin[bedEntry.chrom].push_back(bedEntry); } + bedFields.clear(); } } // Case 2: STDIN. else { - string bedLine; - BED bedEntry; - int lineNum = 0; - + while (getline(cin, bedLine)) { vector<string> bedFields; @@ -397,6 +396,7 @@ void BedFile::loadBedFileIntoMapNoBin() { bedEntry.count = 0; this->bedMapNoBin[bedEntry.chrom].push_back(bedEntry); } + bedFields.clear(); } } @@ -408,5 +408,56 @@ void BedFile::loadBedFileIntoMapNoBin() { } +/* + reportBed + + Writes the _original_ BED entry. + Works for BED3 - BED6. +*/ +void BedFile::reportBed(const BED &bed) { + + if (this->bedType == 3) { + cout << bed.chrom << "\t" << bed.start << "\t" << bed.end; + } + else if (this->bedType == 4) { + cout << bed.chrom << "\t" << bed.start << "\t" << bed.end << "\t" + << bed.name; + } + else if (this->bedType == 5) { + cout << bed.chrom << "\t" << bed.start << "\t" << bed.end << "\t" + << bed.name << "\t" << bed.score; + } + else if (this->bedType == 6) { + cout << bed.chrom << "\t" << bed.start << "\t" << bed.end << "\t" + << bed.name << "\t" << bed.score << "\t" << bed.strand; + } +} + + +/* + reportBedRange + + Writes a custom start->end for a BED entry. + Works for BED3 - BED6. +*/ +void BedFile::reportBedRange(const BED &bed, int &start, int &end) { + + if (this->bedType == 3) { + cout << bed.chrom << "\t" << start << "\t" << end; + } + else if (this->bedType == 4) { + cout << bed.chrom << "\t" << start << "\t" << end << "\t" + << bed.name; + } + else if (this->bedType == 5) { + cout << bed.chrom << "\t" << start << "\t" << end << "\t" + << bed.name << "\t" << bed.score; + } + else if (this->bedType == 6) { + cout << bed.chrom << "\t" << start << "\t" << end << "\t" + << bed.name << "\t" << bed.score << "\t" << bed.strand; + } + +} diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index d0009798c5dbb8c3347b6eb43ae174816fc14490..611665c2c01096c3192fe5445963462d8474c644 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -16,6 +16,9 @@ using namespace std; // Common data structures //************************************************* +/* + Structure for regular BED3-BED6 records +*/ struct BED { // UCSC BED fields @@ -57,7 +60,6 @@ std::string ToString(const T & value) return ss.str(); } -void Tokenize(const string& str, vector<string>& tokens); // BED Sorting Methods bool sortByChrom(BED const &, BED const &); @@ -75,7 +77,6 @@ typedef std::map<string, map<int, vector<BED>, std::less<int> >, std::less<strin typedef std::map<string, vector<BED>, std::less<string> > masterBedMapNoBin; - //************************************************ // BedFile Class methods and elements //************************************************ @@ -100,6 +101,8 @@ public: const int, vector<BED> &); void countHits(map<int, vector<BED>, std::less<int> > &, const int, const int); + void reportBed(const BED &); + void reportBedRange(const BED &, int &, int &); // a vector of the BED entries in the BED file. vector<BED> bedVector; diff --git a/src/utils/bedFilePE/Makefile b/src/utils/bedFilePE/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..b99a1600cc3bc20ea33f65195e5cda83346e4e8f --- /dev/null +++ b/src/utils/bedFilePE/Makefile @@ -0,0 +1,32 @@ +CXX = g++ -c +CXXFLAGS = -O3 -Wall +LDFLAGS = +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= bedFilePE.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=lineFileUtilities.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp new file mode 100755 index 0000000000000000000000000000000000000000..04be9e2041f6615036086682a27a85529844a621 --- /dev/null +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -0,0 +1,224 @@ +// +// bedFilePE.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Contains common functions for finding BED overlaps. +// +// Acknowledgments: Much of the code herein is taken from Jim Kent's +// BED processing code. I am grateful for his elegant +// genome binning algorithm and therefore use it extensively. + + +#include "bedFilePE.h" + +//*********************************************** +// Common functions +//*********************************************** + +// Constructor +BedFilePE::BedFilePE(string &bedFile) { + this->bedFile = bedFile; +} + +// Destructorc +BedFilePE::~BedFilePE(void) { +} + +/* + reportBedPE + + Writes the _original_ BED entry for A. + Works for BEDPE only. +*/ +void BedFilePE::reportBedPE(const BEDPE &a) { + + if (this->bedType == 6) { + cout << a.chrom1 << "\t" << a.start1 << "\t" << a.end1 << "\t" + << a.chrom2 << "\t" << a.start2 << "\t" << a.end2; + } + else if (this->bedType == 7) { + cout << a.chrom1 << "\t" << a.start1 << "\t" << a.end1 << "\t" + << a.chrom2 << "\t" << a.start2 << "\t" << a.end2 << "\t" + << a.name; + } + else if (this->bedType == 8) { + cout << a.chrom1 << "\t" << a.start1 << "\t" << a.end1 << "\t" + << a.chrom2 << "\t" << a.start2 << "\t" << a.end2 << "\t" + << a.name << "\t" << a.score; + } + else if (this->bedType == 10) { + cout << a.chrom1 << "\t" << a.start1 << "\t" << a.end1 << "\t" + << a.chrom2 << "\t" << a.start2 << "\t" << a.end2 << "\t" + << a.name << "\t" << a.score << "\t" << a.strand1 << "\t" << a.strand2; + } +} + +bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) { + + if ((lineNum == 1) && (lineVector.size() >= 3)) { + + this->bedType = lineVector.size(); + + if (this->bedType == 6) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + return true; + } + else if (this->bedType == 7) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + return true; + } + else if (this->bedType == 8) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = atoi(lineVector[7].c_str()); + return true; + } + else if (this->bedType == 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = atoi(lineVector[7].c_str()); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + return true; + } + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start1 > bed.end1) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; + return false; + } + else if (bed.start2 > bed.end2) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; + return false; + } + else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + return false; + } + } + else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + if (this->bedType == 6) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + return true; + } + else if (this->bedType == 7) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + return true; + } + else if (this->bedType == 8) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = atoi(lineVector[7].c_str()); + return true; + } + else if (this->bedType == 10) { + bed.chrom1 = lineVector[0]; + bed.start1 = atoi(lineVector[1].c_str()); + bed.end1 = atoi(lineVector[2].c_str()); + + bed.chrom2 = lineVector[3]; + bed.start2 = atoi(lineVector[4].c_str()); + bed.end2 = atoi(lineVector[5].c_str()); + + bed.name = lineVector[6]; + bed.score = atoi(lineVector[7].c_str()); + + bed.strand1 = lineVector[8]; + bed.strand2 = lineVector[9]; + + return true; + } + else { + cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; + exit(1); + } + + if (bed.start1 > bed.end1) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; + return false; + } + else if (bed.start2 > bed.end2) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; + return false; + } + else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; + return false; + } + } + else if (lineVector.size() == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if (lineVector.size() != this->bedType) { + cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if (lineVector.size() < 8) { + cerr << "TAB delimited BED file with 10 fields (chrom1, start1, end1, strand1, chrom2, start2, end2, strand2) is required. Exiting..." << endl; + exit(1); + } + return false; +} + + diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h new file mode 100755 index 0000000000000000000000000000000000000000..0e9d22fb7950f1128d40fed8ecc7836fb48f4255 --- /dev/null +++ b/src/utils/bedFilePE/bedFilePE.h @@ -0,0 +1,66 @@ +#ifndef BEDFILEPE_H +#define BEDFILEPE_H + +#include <vector> +#include <map> +#include <string> +#include <iostream> +#include <fstream> +#include <sstream> +#include <cstring> +#include <algorithm> + +using namespace std; + +//************************************************* +// Common data structures +//************************************************* + +/* + Structure for paired-end records +*/ +struct BEDPE { + + // UCSC BED fields + string chrom1; + int start1; + int end1; + + string chrom2; + int start2; + int end2; + + string name; + unsigned short score; + + string strand1; + string strand2; +}; + + + +//************************************************ +// BedFile Class methods and elements +//************************************************ +class BedFilePE { + +public: + + // Constructor + BedFilePE(string &); + + // Destructor + ~BedFilePE(void); + + // Methods + bool parseBedPELine (BEDPE &, const vector<string> &, const int &); + void reportBedPE(const BEDPE &); + + string bedFile; + unsigned int bedType; + +private: + // none +}; + +#endif /* BEDFILEPE_H */ diff --git a/src/utils/lineFileUtilities/Makefile b/src/utils/lineFileUtilities/Makefile new file mode 100755 index 0000000000000000000000000000000000000000..fe18aa0b3c7f35ca8d20e4eeae03a58464cfb4a7 --- /dev/null +++ b/src/utils/lineFileUtilities/Makefile @@ -0,0 +1,32 @@ +CXX = g++ -c +CXXFLAGS = -O3 -Wall +LDFLAGS = +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= lineFileUtilities.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS= +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/lineFileUtilities/lineFileUtilities.cpp b/src/utils/lineFileUtilities/lineFileUtilities.cpp new file mode 100755 index 0000000000000000000000000000000000000000..c5dd1e6ce6256a60c572f2cdb459a05e106d4a64 --- /dev/null +++ b/src/utils/lineFileUtilities/lineFileUtilities.cpp @@ -0,0 +1,37 @@ +// +// lineFileUtilities.cpp +// BEDTools +// +// Created by Aaron Quinlan Spring 2009. +// Copyright 2009 Aaron Quinlan. All rights reserved. +// +// Summary: Contains common functions for processing text files. +// + +#include "lineFileUtilities.h" + + +//*********************************************** +// lineFileUtilities: +// Common Functions +//*********************************************** + +void Tokenize(const string& str, vector<string>& tokens) +{ + // Skip delimiters at beginning. + string::size_type lastPos = str.find_first_not_of("\t", 0); + // Find first "non-delimiter". + string::size_type pos = str.find_first_of("\t", lastPos); + + while (string::npos != pos || string::npos != lastPos) + { + // Found a token, add it to the vector. + tokens.push_back(str.substr(lastPos, pos - lastPos)); + // Skip delimiters. Note the "not_of" + lastPos = str.find_first_not_of("\t", pos); + // Find next "non-delimiter" + pos = str.find_first_of("\t", lastPos); + } +} + + diff --git a/src/utils/lineFileUtilities/lineFileUtilities.h b/src/utils/lineFileUtilities/lineFileUtilities.h new file mode 100755 index 0000000000000000000000000000000000000000..febac8db55a3de16ae188a46e1585d8bc4a7490a --- /dev/null +++ b/src/utils/lineFileUtilities/lineFileUtilities.h @@ -0,0 +1,14 @@ +#ifndef LINEFILEUTILITIES_H +#define LINEFILEUTILITIES_H + +#include <vector> +#include <string> +#include <algorithm> + +using namespace std; + +// split a line from a file into a vector of strings. token = "\t" +void Tokenize(const string& str, vector<string>& tokens); + + +#endif /* LINEFILEUTILITIES_H */ diff --git a/src/utils/lineFileUtilities/lineFileUtilities.o b/src/utils/lineFileUtilities/lineFileUtilities.o new file mode 100755 index 0000000000000000000000000000000000000000..9bd38530ff0f910da69a6e32fc3e77a7218879f8 Binary files /dev/null and b/src/utils/lineFileUtilities/lineFileUtilities.o differ diff --git a/src/utils/sequenceUtilities/Makefile b/src/utils/sequenceUtilities/Makefile old mode 100644 new mode 100755 index a8e4d8082c88dda521f06a4f85fdd686adb31a51..81d16baeb824c0dff3d3be8efcf391b7ec6c3bb8 --- a/src/utils/sequenceUtilities/Makefile +++ b/src/utils/sequenceUtilities/Makefile @@ -1,11 +1,32 @@ CXX = g++ -c CXXFLAGS = -O3 -Wall LDFLAGS = -OBJS = ../../../obj/ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -sequenceUtils: sequenceUtils.cpp sequenceUtils.h - $(CXX) $(CXXFLAGS) $(LDFLAGS) sequenceUtils.cpp -o $(OBJS)sequenceUtils.o +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= sequenceUtils.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS = +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) clean: - rm $(OBJS) sequenceUtils.o - rm sequenceUtils.o \ No newline at end of file + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/sequenceUtilities/sequenceUtils.cpp b/src/utils/sequenceUtilities/sequenceUtils.cpp old mode 100644 new mode 100755 diff --git a/src/utils/sequenceUtilities/sequenceUtils.h b/src/utils/sequenceUtilities/sequenceUtils.h old mode 100644 new mode 100755 diff --git a/src/windowBed/Makefile b/src/windowBed/Makefile old mode 100644 new mode 100755 index 4a732aa98642844cb61c5efe502b8704ebb7ddba..337401e15c0b18417c24b5b29aaed2ef1f57fcb9 --- a/src/windowBed/Makefile +++ b/src/windowBed/Makefile @@ -9,14 +9,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= windowMain.cpp windowBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= windowBed @@ -36,7 +36,8 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp index 331834579a86a6fda0a73ed39dbe7abd9a7c3ac1..24544d89e68f15e5d54bf78c84ec44539a2c40e3 100755 --- a/src/windowBed/windowBed.cpp +++ b/src/windowBed/windowBed.cpp @@ -7,17 +7,14 @@ // // Summary: Looks for overlaps between features in two BED files. // - -/* - Includes -*/ +#include "lineFileUtilities.h" #include "windowBed.h" /* Constructor */ -BedWindow::BedWindow(string &bedAFile, string &bedBFile, int &leftSlop, int &rightSlop, bool &anyHit, bool &noHit, bool &writeCount) { +BedWindow::BedWindow(string &bedAFile, string &bedBFile, int &leftSlop, int &rightSlop, bool &anyHit, bool &noHit, bool &writeCount, bool &forceStrand) { this->bedAFile = bedAFile; this->bedBFile = bedBFile; @@ -28,68 +25,18 @@ BedWindow::BedWindow(string &bedAFile, string &bedBFile, int &leftSlop, int &rig this->anyHit = anyHit; this->noHit = noHit; this->writeCount = writeCount; + this->forceStrand = forceStrand; this->bedA = new BedFile(bedAFile); this->bedB = new BedFile(bedBFile); } -/* - Destructor -*/ -BedWindow::~BedWindow(void) { -} - - - -/* - reportA - - Writes the _original_ BED entry for A. - Works for BED3 - BED6. -*/ -void BedWindow::reportA(const BED &a) { - - if (bedA->bedType == 3) { - cout << a.chrom << "\t" << a.start << "\t" << a.end; - } - else if (bedA->bedType == 4) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name; - } - else if (bedA->bedType == 5) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score; - } - else if (bedA->bedType == 6) { - cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" - << a.name << "\t" << a.score << "\t" << a.strand; - } -} - /* - reportB - - Writes the _original_ BED entry for B. - Works for BED3 - BED6. + Destructor */ -void BedWindow::reportB(const BED &b) { - if (bedB->bedType == 3) { - cout << b.chrom << "\t" << b.start << "\t" << b.end; - } - else if (bedB->bedType == 4) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name; - } - else if (bedB->bedType == 5) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score; - } - else if (bedB->bedType == 6) { - cout << b.chrom << "\t" << b.start << "\t" << b.end << "\t" - << b.name << "\t" << b.score << "\t" << b.strand; - } +BedWindow::~BedWindow(void) { } @@ -115,6 +62,12 @@ void BedWindow::FindWindowOverlaps(BED &a, vector<BED> &hits) { int numOverlaps = 0; for (vector<BED>::iterator h = hits.begin(); h != hits.end(); ++h) { + // if forcing strandedness, move on if the hit + // is not on the same strand as A. + if ((this->forceStrand) && (a.strand != h->strand)) { + continue; // continue force the next iteration of the for loop. + } + int s = max(aFudgeStart, h->start); int e = min(aFudgeEnd, h->end); @@ -123,20 +76,20 @@ void BedWindow::FindWindowOverlaps(BED &a, vector<BED> &hits) { if ( ((float)(e-s) / (float)(a.end - a.start)) > 0 ) { numOverlaps++; if (!anyHit && !noHit && !writeCount) { - reportA(a); cout << "\t"; - reportB(*h); cout << endl; + bedA->reportBed(a); cout << "\t"; + bedB->reportBed(*h); cout << endl; } } } } if (anyHit && (numOverlaps >= 1)) { - reportA(a); cout << endl; + bedA->reportBed(a); cout << endl; } else if (writeCount) { - reportA(a); cout << "\t" << numOverlaps << endl; + bedA->reportBed(a); cout << "\t" << numOverlaps << endl; } else if (noHit && (numOverlaps == 0)) { - reportA(a); cout << endl; + bedA->reportBed(a); cout << endl; } } diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h index d8ef2b80559b0d32e2f0c3c256ba6ccff457f761..bfc89a9bf2387e324bdad11ba5d0b3346b3c3ceb 100755 --- a/src/windowBed/windowBed.h +++ b/src/windowBed/windowBed.h @@ -3,20 +3,11 @@ #include "bedFile.h" #include <vector> -#include <map> -#include <list> #include <iostream> #include <fstream> using namespace std; - -//*********************************************** -// Typedefs -//*********************************************** -typedef list<BED> bedList; - - //************************************************ // Class methods and elements //************************************************ @@ -25,7 +16,7 @@ class BedWindow { public: // constructor - BedWindow(string &, string &, int &, int &, bool &, bool &, bool &); + BedWindow(string &, string &, int &, int &, bool &, bool &, bool &, bool &); // destructor ~BedWindow(void); @@ -45,6 +36,7 @@ private: int leftSlop; int rightSlop; bool noHit; + bool forceStrand; // instance of a bed file class. BedFile *bedA, *bedB; diff --git a/src/windowBed/windowMain.cpp b/src/windowBed/windowMain.cpp index 21f77f915c2fc66819e11acf23eee5ab3994f69f..0a84e29ce9beb4b48a49962c6658847bf5557245 100755 --- a/src/windowBed/windowMain.cpp +++ b/src/windowBed/windowMain.cpp @@ -35,6 +35,7 @@ int main(int argc, char* argv[]) { bool haveSlop = false; bool haveLeft = false; bool haveRight = false; + bool forceStrand = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -74,6 +75,9 @@ int main(int argc, char* argv[]) { else if (PARAMETER_CHECK("-v", 2, parameterLength)) { noHit = true; } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } else if (PARAMETER_CHECK("-w", 2, parameterLength)) { haveSlop = true; leftSlop = atoi(argv[i + 1]); @@ -133,7 +137,7 @@ int main(int argc, char* argv[]) { } if (!showHelp) { - BedWindow *bi = new BedWindow(bedAFile, bedBFile, leftSlop, rightSlop, anyHit, noHit, writeCount); + BedWindow *bi = new BedWindow(bedAFile, bedBFile, leftSlop, rightSlop, anyHit, noHit, writeCount, forceStrand); bi->WindowIntersectBed(); return 0; } @@ -150,15 +154,16 @@ void ShowHelp(void) { cerr << " Hall Laboratory, University of Virginia" << endl; cerr << "===============================================" << endl << endl; cerr << "Description: Examines a \"window\" around each feature in A.bed and" << endl; - cerr << "reports all features in B.bed that are within the window. For each overlap the entire entry in A and B are reported." - << endl << endl; + cerr << "reports all features in B.bed that are within the window. For each" << endl; + cerr << "overlap the entire entry in A and B are reported." << endl << endl; cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <a.bed> -b <b.bed>" << endl << endl; cerr << "OPTIONS: " << endl; - cerr << "\t" << "-w (def. 1000)\t\t" << "Base pairs added upstream and downstream of each entry in A when searching for overlaps in B." << endl << endl; - cerr << "\t" << "-l (def. 1000)\t\t" << "Base pairs added upstream (left of) of each entry in A when searching for overlaps in B." << endl << endl; - cerr << "\t" << "-r (def. 1000)\t\t" << "Base pairs added downstream (right of) of each entry in A when searching for overlaps in B." << endl << endl; + cerr << "\t" << "-s\t\t\t" << "Force strandedness. Only report hits in B that overlap A on the same strand." << endl << "\t\t\t\tBy default, overlaps are reported without respect to strand." << endl << endl; + cerr << "\t" << "-w (def. 1000)\t\t" << "Base pairs added upstream and downstream of each entry in A when searching for overlaps in B." << endl << endl; + cerr << "\t" << "-l (def. 1000)\t\t" << "Base pairs added upstream (left of) of each entry in A when searching for overlaps in B." << endl << endl; + cerr << "\t" << "-r (def. 1000)\t\t" << "Base pairs added downstream (right of) of each entry in A when searching for overlaps in B." << endl << endl; cerr << "\t" << "-u\t\t\t" << "Write ORIGINAL a.bed entry ONCE if ANY overlap with B.bed." << endl << "\t\t\t\tIn other words, just report the fact >=1 hit was found." << endl << endl; cerr << "\t" << "-v \t\t\t" << "Only report those entries in A that have NO OVERLAP in B within the requested window." << endl << "\t\t\t\tSimilar to grep -v." << endl << endl; cerr << "\t" << "-c \t\t\t" << "For each entry in A, report the number of hits in B within the requested window." << endl << "\t\t\t\tReports 0 for A entries that have no overlap with B." << endl << endl;