diff --git a/src/intersectFile/Makefile b/src/intersectFile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0b37027a789f65e7f584e42e8d359653e3e129fa --- /dev/null +++ b/src/intersectFile/Makefile @@ -0,0 +1,43 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/NewChromsweep \ + -I$(UTILITIES_DIR)/BinTree \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= intersectMain.cpp intersectFile.cpp intersectFile.h +OBJECTS= intersectMain.o intersectFile.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= intersectFile + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/intersectMain.o $(OBJ_DIR)/intersectFile.o + +.PHONY: clean diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..914be97dea108bf69371007697d3e7512f80bea2 --- /dev/null +++ b/src/intersectFile/intersectFile.cpp @@ -0,0 +1,130 @@ +/***************************************************************************** + intersectFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ + +#include "intersectFile.h" +#include "Context.h" +#include "FileRecordMgr.h" +#include "NewChromsweep.h" +#include "BinTree.h" +#include "RecordOutputMgr.h" + + +FileIntersect::FileIntersect(Context *context) +: _context(context), + _blockMgr(NULL), + _recordOutputMgr(NULL) +{ + _blockMgr = new BlockMgr(); + _blockMgr->setContext(context); + _recordOutputMgr = new RecordOutputMgr(); +} + +FileIntersect::~FileIntersect(void) { + if (_blockMgr != NULL) { + delete _blockMgr; + _blockMgr = NULL; + } + delete _recordOutputMgr; +} + +void FileIntersect::processHits(RecordKeyList &hits) { + if (hits.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { + RecordKeyList blockList(hits.getKey()); + bool deleteBlocks = false; + _blockMgr->getBlocks(blockList, deleteBlocks); + _recordOutputMgr->printRecord(hits, &blockList); + if (deleteBlocks) { + _blockMgr->deleteBlocks(blockList); + } + return; + } + _recordOutputMgr->printRecord(hits); +} + +bool FileIntersect::intersectFiles() +{ + if (_context->getSortedInput()) { + return processSortedFiles(); + } + return processUnsortedFiles(); +} + +bool FileIntersect::processSortedFiles() +{ + // use the chromsweep algorithm to detect overlaps on the fly. + NewChromSweep sweep(_context); + + if (!sweep.init()) { + return false; + } + if (!_recordOutputMgr->init(_context)) { + return false; + } + + RecordKeyList hitSet; + while (sweep.next(hitSet)) { + if (_context->getObeySplits()) { + RecordKeyList keySet(hitSet.getKey()); + RecordKeyList resultSet(hitSet.getKey()); + _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); + processHits(resultSet); + } else { + processHits(hitSet); + } + } + return true; +} + +bool FileIntersect::processUnsortedFiles() +{ + const QuickString &databaseFilename = _context->getDatabaseFileName(); + BinTree *binTree = new BinTree(_context->getDatabaseFileIdx(), _context); + + FileRecordMgr *queryFRM = new FileRecordMgr(_context->getQueryFileIdx(), _context); + if (!queryFRM->open()) { + return false; + } + + if (!binTree->loadDB()) { + fprintf(stderr, "Error: Unable to load database file %s.\n", databaseFilename.c_str()); + delete binTree; + exit(1); + } + + + _context->determineOutputType(); + _recordOutputMgr->init(_context); + + while (!queryFRM->eof()) { + Record *queryRecord = queryFRM->allocateAndGetNextRecord(); + if (queryRecord == NULL) { + continue; + } + RecordKeyList hitSet(queryRecord); + binTree->getHits(queryRecord, hitSet); + if (_context->getObeySplits()) { + RecordKeyList keySet(hitSet.getKey()); + RecordKeyList resultSet; + _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); + processHits(resultSet); + } else { + processHits(hitSet); + } + queryFRM->deleteRecord(queryRecord); + } + queryFRM->close(); + + //clean up. + delete queryFRM; + delete binTree; + return true; +} diff --git a/src/intersectFile/intersectFile.h b/src/intersectFile/intersectFile.h new file mode 100644 index 0000000000000000000000000000000000000000..3922aa9120ac05f9046d1f58b4ce77d3ccf4cf48 --- /dev/null +++ b/src/intersectFile/intersectFile.h @@ -0,0 +1,46 @@ +/***************************************************************************** + intersectFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef INTERSECTFILE_H +#define INTERSECTFILE_H + +using namespace std; + +#include "RecordKeyList.h" + +using namespace std; + +class Context; +class BlockMgr; +class RecordOutputMgr; + +class FileIntersect { + +public: + FileIntersect(Context *context); + ~FileIntersect(void); + + bool intersectFiles(); + +private: + Context *_context; + Record *_queryRec; + Record *_databaseRec; + BlockMgr *_blockMgr; + RecordOutputMgr *_recordOutputMgr; + + void processHits(RecordKeyList &hits); + bool processSortedFiles(); + bool processUnsortedFiles(); + +}; + +#endif /* INTERSECTFILE_H */ diff --git a/src/intersectFile/intersectMain.cpp b/src/intersectFile/intersectMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..030d33a854b577ead71aba798d95b85c2320f2f3 --- /dev/null +++ b/src/intersectFile/intersectMain.cpp @@ -0,0 +1,119 @@ +/***************************************************************************** + intersectMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +using namespace std; + +#include "intersectFile.h" +#include "version.h" +#include "Context.h" + +// define our program name +#define PROGRAM_NAME "bedtools intersect" + +void intersect_help(void); + +int intersect_main(int argc, char* argv[]) { + + Context *context = new Context(); + context->parseCmdArgs(argc, argv, 1); + if (!context->isValidState()) { + fprintf(stderr, "%s\n", context->getErrorMsg().c_str()); + intersect_help(); + return 0; + } + + FileIntersect *fileIntersect = new FileIntersect(context); + + bool retVal = fileIntersect->intersectFiles(); + delete fileIntersect; + delete context; + return retVal ? 0 : 1; +} + +void intersect_help(void) { + + cerr << "\nTool: bedtools intersect (aka intersectBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Report overlaps between two feature files." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default writes compressed BAM." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-wa\t" << "Write the original entry in A for each overlap." << endl << endl; + + cerr << "\t-wb\t" << "Write the original entry in B for each overlap." << endl; + cerr << "\t\t- Useful for knowing _what_ A overlaps. Restricted by -f and -r." << endl << endl; + + cerr << "\t-loj\t" << "Perform a \"left outer join\". That is, for each feature in A" << endl; + cerr << "\t\treport each overlap with B. If no overlaps are found, " << endl; + cerr << "\t\treport a NULL feature for B." << endl << endl; + + cerr << "\t-wo\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl; + cerr << "\t\t Only A features with overlap are reported." << endl << endl; + + cerr << "\t-wao\t" << "Write the original A and B entries plus the number of base" << endl; + cerr << "\t\tpairs of overlap between the two features." << endl; + cerr << "\t\t- Overlapping features restricted by -f and -r." << endl; + cerr << "\t\t However, A features w/o overlap are also reported" << endl; + cerr << "\t\t with a NULL B feature and overlap = 0." << endl << endl; + + cerr << "\t-u\t" << "Write the original A entry _once_ if _any_ overlaps found in B." << endl; + cerr << "\t\t- In other words, just report the fact >=1 hit was found." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-c\t" << "For each entry in A, report the number of overlaps with B." << endl; + cerr << "\t\t- Reports 0 for A entries that have no overlap with B." << endl; + cerr << "\t\t- Overlaps restricted by -f and -r." << endl << endl; + + cerr << "\t-v\t" << "Only report those entries in A that have _no overlaps_ with B." << endl; + cerr << "\t\t- Similar to \"grep -v\" (an homage)." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-s\t" << "Require same strandedness. That is, only report hits in B" << endl; + cerr << "\t\tthat overlap A on the _same_ strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-S\t" << "Require different strandedness. That is, only report hits in B" << endl; + cerr << "\t\tthat overlap A on the _opposite_ strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + + cerr << "\t-sorted\t" << "Use the \"chromsweep\" algorithm for sorted (-k1,1 -k2,2n) input" << endl << endl; + + cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) When a BAM file is used for the A file, the alignment is retained if overlaps exist," << endl; + cerr << "\tand exlcuded if an overlap cannot be found. If multiple overlaps exist, they are not" << endl; + cerr << "\treported, as we are only testing for one or more overlaps." << endl << endl; + + // end the program here + exit(1); + +}