From 2db206caea5a3ce69a280b33cc65a0e6b1b50b2b Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Thu, 15 Sep 2011 15:39:47 -0400 Subject: [PATCH] gcB no longer requires a genome file for BAM. New chromsweep tool. --- Makefile | 5 +- src/bedToBam/bedToBam.cpp | 2 +- src/bedToIgv/Makefile | 8 +- src/chromsweep/Makefile | 46 ++++ src/chromsweep/chromsweep.cpp | 176 ++++++++++++ src/chromsweep/chromsweep.h | 96 +++++++ src/chromsweep/chromsweepMain.cpp | 271 +++++++++++++++++++ src/complementBed/Makefile | 8 +- src/flankBed/Makefile | 8 +- src/genomeCoverageBed/genomeCoverageBed.cpp | 21 +- src/genomeCoverageBed/genomeCoverageBed.h | 1 + src/genomeCoverageBed/genomeCoverageMain.cpp | 3 +- src/pairToBed/pairToBed.h | 1 - src/pairToBed/pairToBedMain.cpp | 1 - src/shuffleBed/Makefile | 8 +- src/slopBed/Makefile | 8 +- src/unionBedGraphs/Makefile | 3 +- src/utils/bedFile/bedFile.cpp | 5 + src/utils/bedFile/bedFile.h | 2 + src/utils/genomeFile/Makefile | 13 +- src/utils/genomeFile/genomeFile.cpp | 10 + src/utils/genomeFile/genomeFile.h | 8 +- 22 files changed, 678 insertions(+), 26 deletions(-) create mode 100644 src/chromsweep/Makefile create mode 100644 src/chromsweep/chromsweep.cpp create mode 100644 src/chromsweep/chromsweep.h create mode 100644 src/chromsweep/chromsweepMain.cpp diff --git a/Makefile b/Makefile index 996fa1a2..d1d4924f 100644 --- a/Makefile +++ b/Makefile @@ -18,6 +18,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/bedToBam \ $(SRC_DIR)/bedToIgv \ $(SRC_DIR)/bed12ToBed6 \ + $(SRC_DIR)/chromsweep \ $(SRC_DIR)/closestBed \ $(SRC_DIR)/complementBed \ $(SRC_DIR)/coverageBed \ @@ -44,7 +45,6 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \ $(SRC_DIR)/utils/bedFile \ $(SRC_DIR)/utils/bedGraphFile \ - $(SRC_DIR)/utils/genomeFile \ $(SRC_DIR)/utils/gzstream \ $(SRC_DIR)/utils/fileType \ $(SRC_DIR)/utils/bedFilePE \ @@ -52,7 +52,8 @@ UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \ $(SRC_DIR)/utils/tabFile \ $(SRC_DIR)/utils/BamTools \ $(SRC_DIR)/utils/BamTools-Ancillary \ - $(SRC_DIR)/utils/Fasta + $(SRC_DIR)/utils/Fasta \ + $(SRC_DIR)/utils/genomeFile all: [ -d $(OBJ_DIR) ] || mkdir -p $(OBJ_DIR) diff --git a/src/bedToBam/bedToBam.cpp b/src/bedToBam/bedToBam.cpp index 41509d6f..2843a560 100644 --- a/src/bedToBam/bedToBam.cpp +++ b/src/bedToBam/bedToBam.cpp @@ -203,7 +203,7 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1 bedEntry = nullBed; } } - // close up + //close up bed->Close(); writer->Close(); } diff --git a/src/bedToIgv/Makefile b/src/bedToIgv/Makefile index 382b8830..3ad616b9 100644 --- a/src/bedToIgv/Makefile +++ b/src/bedToIgv/Makefile @@ -5,7 +5,13 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/chromsweep/Makefile b/src/chromsweep/Makefile new file mode 100644 index 00000000..4cd8f20f --- /dev/null +++ b/src/chromsweep/Makefile @@ -0,0 +1,46 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= chromsweepMain.cpp chromsweep.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= chromsweep + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/chromsweep/chromsweep.cpp b/src/chromsweep/chromsweep.cpp new file mode 100644 index 00000000..ce912f4b --- /dev/null +++ b/src/chromsweep/chromsweep.cpp @@ -0,0 +1,176 @@ +/***************************************************************************** + chromsweep.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "chromsweep.h" +#include <queue> +#include <set> + +bool after(const BED &a, const BED &b); +void report_hits(const BED &curr_qy, const vector<BED> &hits); +vector<BED> scan_cache(const BED &curr_qy, BedLineStatus qy_status, const vector<BED> &db_cache, vector<BED> &hits); + + +/* + Constructor +*/ +ChromSweep::ChromSweep(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _anyHit = anyHit; + _noHit = noHit; + _writeA = writeA; + _writeB = writeB; + _writeOverlap = writeOverlap; + _writeAllOverlap = writeAllOverlap; + _writeCount = writeCount; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _reciprocal = reciprocal; + _obeySplits = obeySplits; + _bamInput = bamInput; + _bamOutput = bamOutput; + + if (_anyHit || _noHit || _writeCount) + _printable = false; + else + _printable = true; + + // create new BED file objects for A and B + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + Sweep(); +} + + +/* + Destructor +*/ +ChromSweep::~ChromSweep(void) { +} + + +bool after(const BED &a, const BED &b) { + return (a.start >= b.end); +} + +void ChromSweep::ScanCache(const BED &curr_qy, BedLineStatus qy_status, vector<BED> &db_cache, vector<BED> &hits) { + if (qy_status != BED_INVALID) { + vector<BED>::iterator c = db_cache.begin(); + while (c != db_cache.end()) + { + if ((curr_qy.chrom == c->chrom) && !(after(curr_qy, *c))) { + if (overlaps(curr_qy.start, curr_qy.end, c->start, c->end) > 0) { + hits.push_back(*c); + } + ++c; + } + else { + c = db_cache.erase(c); + } + } + } +} + + +void ChromSweep::ChromCheck(BED &curr_qy, BED &curr_db, + BedLineStatus &qy_status, BedLineStatus &db_status, + int &qy_lineNum, int &db_lineNum, + vector<BED> &db_cache, vector<BED> &hits) +{ + if ((curr_qy.chrom == curr_db.chrom) || (db_status == BED_INVALID) || (qy_status == BED_INVALID)) { + return; + } + + if (curr_qy.chrom > curr_db.chrom) { + while (!_bedB->Empty() && curr_db.chrom < curr_qy.chrom) + { + db_status = _bedB->GetNextBed(curr_db, db_lineNum); + } + db_cache.clear(); + } + else if (curr_qy.chrom < curr_db.chrom) { + // report hits for the remaining queries on this chrom + BED tmp_curr_qy = curr_qy; + while (!_bedA->Empty() && tmp_curr_qy.chrom == curr_qy.chrom) + { + //db_cache = ScanCache(tmp_curr_qy, qy_status, db_cache, hits); + ScanCache(tmp_curr_qy, qy_status, db_cache, hits); + + ReportHits(tmp_curr_qy, hits); + qy_status = _bedA->GetNextBed(tmp_curr_qy, qy_lineNum); + hits.clear(); + } + // now fast forward query to catch up to database + while (!_bedA->Empty() && tmp_curr_qy.chrom < curr_db.chrom) + { + // hits is empty to reflect the fact that no hits are found in catch-up mode + ReportHits(tmp_curr_qy, hits); + qy_status = _bedA->GetNextBed(tmp_curr_qy, qy_lineNum); + } + curr_qy = tmp_curr_qy; + db_cache.clear(); + } +} + + +void ChromSweep::ReportHits(const BED &curr_qy, const vector<BED> &hits) { + _bedA->reportBedTab(curr_qy); + cout << hits.size() << endl; +} + + +void ChromSweep::Sweep() { + + int qy_lineNum = 0; + int db_lineNum = 0; + + // current feature from each file + BED curr_qy, curr_db; + + // status of the current lines + BedLineStatus qy_status, db_status; + vector<BED> db_cache; + vector<BED> hits; + + // open the files; get the first line from each + _bedA->Open(); + _bedB->Open(); + qy_status = _bedA->GetNextBed(curr_qy, qy_lineNum); + db_status = _bedB->GetNextBed(curr_db, db_lineNum); + while (!_bedA->Empty()) { + // have we changed chromosomes? + ChromCheck(curr_qy, curr_db, qy_status, db_status, qy_lineNum, db_lineNum, db_cache, hits); + // scan the database cache for hits + //db_cache = ScanCache(curr_qy, qy_status, db_cache, hits); + ScanCache(curr_qy, qy_status, db_cache, hits); + // advance the db until we are ahead of the query. update hits and cache as necessary + while (!_bedB->Empty() && + curr_qy.chrom == curr_db.chrom && + !(after(curr_db, curr_qy))) + { + if (overlaps(curr_qy.start, curr_qy.end, curr_db.start, curr_db.end) > 0) { + hits.push_back(curr_db); + } + db_cache.push_back(curr_db); + db_status = _bedB->GetNextBed(curr_db, db_lineNum); + } + // report the hits for this query and reset for the next. + ReportHits(curr_qy, hits); + hits.clear(); + qy_status = _bedA->GetNextBed(curr_qy, qy_lineNum); + } +} diff --git a/src/chromsweep/chromsweep.h b/src/chromsweep/chromsweep.h new file mode 100644 index 00000000..0ea74440 --- /dev/null +++ b/src/chromsweep/chromsweep.h @@ -0,0 +1,96 @@ +/***************************************************************************** + chromsweepBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef CHROMSWEEP_H +#define CHROMSWEEP_H + +#include "bedFile.h" +// #include "BamReader.h" +// #include "BamWriter.h" +// #include "BamAncillary.h" +// #include "BamAux.h" +// using namespace BamTools; + + +#include <vector> +#include <queue> +#include <iostream> +#include <fstream> +#include <stdlib.h> +using namespace std; + + + +class ChromSweep { + +public: + + // constructor + ChromSweep(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput); + + // destructor + ~ChromSweep(void); + +private: + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + + bool _writeA; // should the original A feature be reported? + bool _writeB; // should the original B feature be reported? + bool _writeOverlap; + bool _writeAllOverlap; + + bool _forceStrand; + bool _reciprocal; + float _overlapFraction; + + bool _anyHit; + bool _noHit; + bool _writeCount; // do we want a count of the number of overlaps in B? + bool _obeySplits; + bool _bamInput; + bool _bamOutput; + + bool _printable; + + queue<BED*> _outputBuffer; + bool _lastPick; + + map<string, vector<BED*> > _windowA; + map<string, vector<BED*> > _windowB; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; + + //------------------------------------------------ + // private methods + //------------------------------------------------ + void ScanCache(const BED &curr_qy, BedLineStatus qy_status, vector<BED> &db_cache, vector<BED> &hits); + + void ChromCheck(BED &curr_qy, BED &curr_db, + BedLineStatus &qy_status, BedLineStatus &db_status, + int &qy_lineNum, int &db_lineNum, + vector<BED> &db_cache, vector<BED> &hits); + + void Sweep(); + void ReportHits(const BED &curr_qy, const vector<BED> &hits); + + +}; + +#endif /* CHROMSWEEP_H */ diff --git a/src/chromsweep/chromsweepMain.cpp b/src/chromsweep/chromsweepMain.cpp new file mode 100644 index 00000000..913a2a0c --- /dev/null +++ b/src/chromsweep/chromsweepMain.cpp @@ -0,0 +1,271 @@ +/***************************************************************************** + chromsweepMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "chromsweep.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "chromsweep" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool noHit = false; + bool anyHit = false; + bool writeA = false; + bool writeB = false; + bool writeCount = false; + bool writeOverlap = false; + bool writeAllOverlap = false; + bool haveFraction = false; + bool reciprocalFraction = false; + bool forceStrand = false; + bool obeySplits = false; + bool inputIsBam = false; + bool outputIsBam = true; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + outputIsBam = false; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-abam", 5, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + inputIsBam = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + outputIsBam = false; + } + else if(PARAMETER_CHECK("-u", 2, parameterLength)) { + anyHit = true; + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-wa", 3, parameterLength)) { + writeA = true; + } + else if(PARAMETER_CHECK("-wb", 3, parameterLength)) { + writeB = true; + } + else if(PARAMETER_CHECK("-wo", 3, parameterLength)) { + writeOverlap = true; + } + else if(PARAMETER_CHECK("-wao", 4, parameterLength)) { + writeAllOverlap = true; + writeOverlap = true; + } + else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + writeCount = true; + } + else if(PARAMETER_CHECK("-r", 2, parameterLength)) { + reciprocalFraction = true; + } + else if (PARAMETER_CHECK("-v", 2, parameterLength)) { + noHit = true; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else if (PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && noHit) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -v, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeCount && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeA && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wa OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (writeB && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -wb OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (reciprocalFraction && !haveFraction) { + cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeCount) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -c, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeB) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wb, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (anyHit && writeOverlap) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -u OR -wo, not both." << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + + ChromSweep *bi = new ChromSweep(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, + writeAllOverlap, overlapFraction, noHit, writeCount, forceStrand, + reciprocalFraction, obeySplits, inputIsBam, outputIsBam); + delete bi; + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Report overlaps between two feature files." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; + + cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; + cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; + + cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl; + cerr << "\t\t Only A features with overlap are reported." << endl << endl; + + cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; + cerr << "\t\t However, A features w/o overlap are also reported" << endl; + cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; + + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; + cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; + cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only report hits in B that" << endl; + cerr << "\t\toverlap A on the same strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + + + // end the program here + exit(1); + +} diff --git a/src/complementBed/Makefile b/src/complementBed/Makefile index 1e980bd6..2f542063 100644 --- a/src/complementBed/Makefile +++ b/src/complementBed/Makefile @@ -5,7 +5,13 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/flankBed/Makefile b/src/flankBed/Makefile index bb3416e7..1da20693 100644 --- a/src/flankBed/Makefile +++ b/src/flankBed/Makefile @@ -5,7 +5,13 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index cb0dfc95..eecc73ca 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -13,7 +13,7 @@ Licenced under the GNU General Public License 2.0 license. #include "genomeCoverageBed.h" -BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, +BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, bool eachBase, bool startSites, bool bedGraph, bool bedGraphAll, int max, float scale, @@ -43,15 +43,20 @@ BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, _currChromName = ""; _currChromSize = 0 ; - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - + + if (_bamInput == false) { + _genome = new GenomeFile(genomeFile); + } + PrintTrackDefinitionLine(); - if (_bamInput == false) + if (_bamInput == false) { + _bed = new BedFile(bedFile); CoverageBed(); - else - CoverageBam(_bed->bedFile); + } + else { + CoverageBam(_bedFile); + } } void BedGenomeCoverage::PrintTrackDefinitionLine() @@ -214,6 +219,8 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); + // load the BAM header references into a BEDTools "genome file" + _genome = new GenomeFile(refs); // convert each aligned BAM entry to BED // and compute coverage on B BamAlignment bam; diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index 8d2f7458..5e6ef4b4 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -90,6 +90,7 @@ private: // methods void CoverageBed(); void CoverageBam(string bamFile); + void LoadBamHeaderIntoGenomeFile(const string &bamFile); void ReportChromCoverage(const vector<DEPTH> &, const int &chromSize, const string &chrom, chromHistMap&); void ReportGenomeCoverage(chromHistMap &chromDepthHist); void ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom); diff --git a/src/genomeCoverageBed/genomeCoverageMain.cpp b/src/genomeCoverageBed/genomeCoverageMain.cpp index fa57e403..a316e45a 100644 --- a/src/genomeCoverageBed/genomeCoverageMain.cpp +++ b/src/genomeCoverageBed/genomeCoverageMain.cpp @@ -163,7 +163,7 @@ int main(int argc, char* argv[]) { } // make sure we have both input files - if (!haveBed || !haveGenome) { + if (!haveBed && !haveGenome && !bamInput) { cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; showHelp = true; } @@ -197,7 +197,6 @@ int main(int argc, char* argv[]) { } if (!showHelp) { - BedGenomeCoverage *bc = new BedGenomeCoverage(bedFile, genomeFile, eachBase, startSites, bedGraph, bedGraphAll, max, scale, bamInput, obeySplits, diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index d881c565..20ceb426 100644 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -44,7 +44,6 @@ public: // constructor BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, string searchType, bool sameStrand, bool diffStrand, bool bamInput, bool bamOutput, bool uncompressedBam, bool useEditDistance); - // destructor ~BedIntersectPE(void); diff --git a/src/pairToBed/pairToBedMain.cpp b/src/pairToBed/pairToBedMain.cpp index f29cf978..38b8714a 100644 --- a/src/pairToBed/pairToBedMain.cpp +++ b/src/pairToBed/pairToBedMain.cpp @@ -154,7 +154,6 @@ int main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl; showHelp = true; } - if (!showHelp) { BedIntersectPE *bi = new BedIntersectPE(bedAFile, bedBFile, overlapFraction, diff --git a/src/shuffleBed/Makefile b/src/shuffleBed/Makefile index 42e6cf97..0265ae05 100644 --- a/src/shuffleBed/Makefile +++ b/src/shuffleBed/Makefile @@ -5,7 +5,13 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/slopBed/Makefile b/src/slopBed/Makefile index ba798c52..aed0b82d 100644 --- a/src/slopBed/Makefile +++ b/src/slopBed/Makefile @@ -5,7 +5,13 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/unionBedGraphs/Makefile b/src/unionBedGraphs/Makefile index 00b488a6..bca92f26 100755 --- a/src/unionBedGraphs/Makefile +++ b/src/unionBedGraphs/Makefile @@ -10,7 +10,8 @@ INCLUDES = -I$(UTILITIES_DIR)/bedGraphFile/ \ -I$(UTILITIES_DIR)/genomeFile/ \ -I$(UTILITIES_DIR)/version/ \ -I$(UTILITIES_DIR)/gzstream/ \ - -I$(UTILITIES_DIR)/fileType/ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include # ---------------------------------- # define our source and object files diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 970be657..0abb6973 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -156,6 +156,11 @@ void BedFile::Seek(unsigned long offset) { _bedStream->seekg(offset); } +// Jump to a specific byte in the file +bool BedFile::Empty() { + return _bedStream->eof(); +} + // Close the BED file void BedFile::Close(void) { diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 71afd096..425c8fd7 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -407,6 +407,8 @@ public: // Jump to a specific byte in the file void Seek(unsigned long offset); + + bool Empty(); // Close an opened BED file. void Close(void); diff --git a/src/utils/genomeFile/Makefile b/src/utils/genomeFile/Makefile index a48c6fba..e5339c68 100644 --- a/src/utils/genomeFile/Makefile +++ b/src/utils/genomeFile/Makefile @@ -1,10 +1,12 @@ OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ -UTILITIES_DIR = ../../utils/ +UTILITIES_DIR = ../ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/fileType/ +INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include/ # ---------------------------------- # define our source and object files @@ -17,13 +19,14 @@ BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) -L$(BT_ROOT)/lib $(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C -W $(INCLUDES) + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* -.PHONY: clean \ No newline at end of file +.PHONY: clean diff --git a/src/utils/genomeFile/genomeFile.cpp b/src/utils/genomeFile/genomeFile.cpp index 67a280ea..d94e9226 100644 --- a/src/utils/genomeFile/genomeFile.cpp +++ b/src/utils/genomeFile/genomeFile.cpp @@ -18,6 +18,16 @@ GenomeFile::GenomeFile(const string &genomeFile) { loadGenomeFileIntoMap(); } +GenomeFile::GenomeFile(const RefVector &genome) { + for (size_t i = 0; i < genome.size(); ++i) { + string chrom = genome[i].RefName; + int length = genome[i].RefLength; + + _chromSizes[chrom] = length; + _chromList.push_back(chrom); + } +} + // Destructor GenomeFile::~GenomeFile(void) { } diff --git a/src/utils/genomeFile/genomeFile.h b/src/utils/genomeFile/genomeFile.h index 5da8d6b9..415be847 100644 --- a/src/utils/genomeFile/genomeFile.h +++ b/src/utils/genomeFile/genomeFile.h @@ -19,6 +19,9 @@ #include <fstream> #include <cstring> #include <cstdio> +#include "api/BamReader.h" +#include "api/BamAux.h" +using namespace BamTools; using namespace std; @@ -31,8 +34,11 @@ class GenomeFile { public: - // Constructor + // Constructor using a file GenomeFile(const string &genomeFile); + + // Constructor using a vector of BamTools RefVector + GenomeFile(const RefVector &genome); // Destructor ~GenomeFile(void); -- GitLab