diff --git a/Makefile b/Makefile index 4b1f5242e9b4cb6ac7769973c926bce3eca72513..39c2e8e34faca3a0d07630814ae9137d32ce5661 100644 --- a/Makefile +++ b/Makefile @@ -57,6 +57,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/randomBed \ $(SRC_DIR)/regressTest \ $(SRC_DIR)/reldist \ + $(SRC_DIR)/sampleFile \ $(SRC_DIR)/shuffleBed \ $(SRC_DIR)/slopBed \ $(SRC_DIR)/sortBed \ diff --git a/src/bedtools.cpp b/src/bedtools.cpp index c0e98d3c155150b34bd878a5d4e2cca4f58aa312..079c921e8d409639b2d5df96ef6eab28eb51db56 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -66,6 +66,7 @@ int pairtobed_main(int argc, char* argv[]);// int pairtopair_main(int argc, char* argv[]);// int random_main(int argc, char* argv[]); // int reldist_main(int argc, char* argv[]); // +int sample_main(int argc, char* argv[]); // int shuffle_main(int argc, char* argv[]); // int slop_main(int argc, char* argv[]); // int sort_main(int argc, char* argv[]); // @@ -138,6 +139,7 @@ int main(int argc, char *argv[]) else if (sub_cmd == "makewindows") return windowmaker_main(argc-1, argv+1); else if (sub_cmd == "groupby") return groupby_main(argc-1, argv+1); else if (sub_cmd == "expand") return expand_main(argc-1, argv+1); + else if (sub_cmd == "sample") return sample_main(argc-1, argv+1); else if (sub_cmd == "neksb1") return nek_sandbox1_main(argc-1, argv+1); else if (sub_cmd == "regresstest") return regress_test_main(argc, argv); //this command does need all the orig args. // help diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 914be97dea108bf69371007697d3e7512f80bea2..a30ce3ae9b738faa9fb288ec88a552587a422d4e 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -37,16 +37,16 @@ FileIntersect::~FileIntersect(void) { } void FileIntersect::processHits(RecordKeyList &hits) { - if (hits.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { - RecordKeyList blockList(hits.getKey()); - bool deleteBlocks = false; - _blockMgr->getBlocks(blockList, deleteBlocks); - _recordOutputMgr->printRecord(hits, &blockList); - if (deleteBlocks) { - _blockMgr->deleteBlocks(blockList); - } - return; - } +// if (hits.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { +// RecordKeyList blockList(hits.getKey()); +// bool deleteBlocks = false; +// _blockMgr->getBlocks(blockList, deleteBlocks); +// _recordOutputMgr->printRecord(hits, &blockList); +// if (deleteBlocks) { +// _blockMgr->deleteBlocks(blockList); +// } +// return; +// } _recordOutputMgr->printRecord(hits); } diff --git a/src/intersectFile/intersectMain.cpp b/src/intersectFile/intersectMain.cpp index c5ef10b76a7eec5ebe14d93725ccc6544b6de898..4aac270a30a377b74b6e09fa47fd5d3189361820 100644 --- a/src/intersectFile/intersectMain.cpp +++ b/src/intersectFile/intersectMain.cpp @@ -12,7 +12,6 @@ using namespace std; #include "intersectFile.h" -#include "version.h" #include "Context.h" // define our program name @@ -23,7 +22,6 @@ void intersect_help(void); int intersect_main(int argc, char* argv[]) { Context *context = new Context(); - if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { if (!context->getErrorMsg().empty()) { cerr << context->getErrorMsg() << endl; diff --git a/src/nekSandbox1/Makefile b/src/nekSandbox1/Makefile index e63d657fc8c6f8364d652a131efb73afe406edcf..fbe6d8616676f86a02102b981f1d4e5c897ebf71 100644 --- a/src/nekSandbox1/Makefile +++ b/src/nekSandbox1/Makefile @@ -14,7 +14,9 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/NewChromsweep \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools/src + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ + diff --git a/src/sampleFile/Makefile b/src/sampleFile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5c20a9c56e2450c6c67fa35efb79041e576c6525 --- /dev/null +++ b/src/sampleFile/Makefile @@ -0,0 +1,42 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= sampleMain.cpp SampleFile.cpp SampleFile.h +OBJECTS= sampleMain.o SampleFile.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= sample + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/sampleMain.o $(OBJ_DIR)/SampleFile.o + +.PHONY: clean diff --git a/src/sampleFile/SampleFile.cpp b/src/sampleFile/SampleFile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..74792b98b2b520d4039f2dfa3a90298f1ab49018 --- /dev/null +++ b/src/sampleFile/SampleFile.cpp @@ -0,0 +1,127 @@ +/* + * SampleFile.cpp + * + * Created on: Nov 18, 2013 + * Author: nek3d + */ + +#include "SampleFile.h" +#include "Context.h" +#include "FileRecordMgr.h" +#include "RecordOutputMgr.h" + +static const bool SampleRecordLtFn(const Record *rec1, const Record *rec2) { + return (*rec1 < *rec2); +} + +SampleFile::SampleFile(Context *context) +: _context(context), + _inputFile(NULL), + _outputMgr(NULL), + _numSamples(0), + _numCurrSamples(0), + _currRecordNum(0) +{ + _numSamples = context->getNumOutputRecords(); +} + +SampleFile::~SampleFile() { + +} + +bool SampleFile::takeSample() +{ + //we're only operating on one file, so the idx is zero. + _inputFile = new FileRecordMgr(0, _context); + if (!_inputFile->open()) { + return false; // FRM will give the error and die anyway, + //no error message needed here. + } + _samples.resize(_numSamples, NULL); + + + //Context object takes care of the seed, either user given or randomly + //generated, and seeds the call to srand with it, so we don't have to + //here. + if (!_context->hasConstantSeed()) { + _context->getUnspecifiedSeed(); + } + + _context->determineOutputType(); + _outputMgr = new RecordOutputMgr(); + _outputMgr->init(_context); + + + while (!_inputFile->eof()) { + Record *record = _inputFile->allocateAndGetNextRecord(); + if (record == NULL) { + continue; + } + if (!keepRecord(record)) { + _inputFile->deleteRecord(record); + } + _currRecordNum++; + } + + if (_currRecordNum < _numSamples) { + //die with error; + cerr << "Error: Input file has fewer records than the requested number of output records." << endl; + exit(1); + } + + //If the output type is BAM, must sort the output records. + if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + sort(_samples.begin(), _samples.end(), SampleRecordLtFn); + } + // Now output all the kept records, then do cleanup. + for (size_t i=0; i < _numSamples; i++) { + _outputMgr->printRecord(_samples[i]); + } + delete _outputMgr; + _inputFile->close(); + + //clean up. + delete _inputFile; + + return true; +} + +bool SampleFile::keepRecord(Record *record) +{ + if (!strandComplies(record)) { + return false; + } + if (_numCurrSamples < _numSamples) { + _samples[_numCurrSamples] = record; + _numCurrSamples++; + return true; + } + + + // We need a random number in the range [0, _currRecordNum]. + // Must combine two consective calls to rand() + // because RAND_MAX is 2^31 (2147483648), whereas + // the number of input records could be far larger. + size_t idx = ((((long) rand()) << 31) | rand()) % _currRecordNum; + + if (idx < _numSamples) { + //replace old record at idx with this new one. + _inputFile->deleteRecord(_samples[idx]); + _samples[idx] = record; + return true; + } + return false; +} + +bool SampleFile::strandComplies(const Record * record) { + if (!_context->getSameStrand()) { + return true; + } + if (_context->getForwardOnly() && record->getStrand() == Record::FORWARD) { + return true; + } + if (_context->getReverseOnly() && record->getStrand() == Record::REVERSE) { + return true; + } + return false; +} diff --git a/src/sampleFile/SampleFile.h b/src/sampleFile/SampleFile.h new file mode 100644 index 0000000000000000000000000000000000000000..24726b28f795342f089fd83a2266359bd929f732 --- /dev/null +++ b/src/sampleFile/SampleFile.h @@ -0,0 +1,42 @@ +/* + * SampleFile.h + * + * Created on: Nov 18, 2013 + * Author: nek3d + */ + +#ifndef SAMPLEFILE_H_ +#define SAMPLEFILE_H_ + +using namespace std; + +#include "Context.h" +#include "Record.h" +#include <vector> + +class FileRecordMgr; +class Context; +class RecordOutputMgr; + +class SampleFile { +public: + SampleFile(Context *context); + ~SampleFile(); + bool takeSample(); + +private: + Context *_context; + FileRecordMgr *_inputFile; + RecordOutputMgr *_outputMgr; + vector<Record *> _samples; + size_t _numSamples; //the number of samples we ultimately want + size_t _numCurrSamples; //the number of samples kept so far. + size_t _currRecordNum; + int _seed; + + bool keepRecord(Record *record); + bool strandComplies(const Record * record); + +}; + +#endif /* SAMPLEFILE_H_ */ diff --git a/src/sampleFile/sampleMain.cpp b/src/sampleFile/sampleMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..777c775b40f1ebeb643d7859229f65c7d50a9952 --- /dev/null +++ b/src/sampleFile/sampleMain.cpp @@ -0,0 +1,77 @@ +/* + * sampleMain.cpp + * + * Created on: Nov 18, 2013 + * Author: nek3d + */ + +#include <iostream> +#include "Context.h" +#include "SampleFile.h" + +#define PROGRAM_NAME "bedtools sample" + +void sample_help(void); + +int sample_main(int argc, char **argv) +{ + Context *context = new Context(); + if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { + if (!context->getErrorMsg().empty()) { + cerr << context->getErrorMsg() << endl; + } + sample_help(); + delete context; + return 0; + } + SampleFile *sampleFile = new SampleFile(context); + + bool retVal = sampleFile->takeSample(); + delete sampleFile; + delete context; + return retVal ? 0 : 1; + +} + +void sample_help(void) { + + cerr << "\nTool: bedtools sample (aka sampleBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Take sample of input file(s) using reservoir sampling algorithm." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf/bam>" << endl << endl; + + cerr << "WARNING:\tThe current sample algorithm will hold all requested sample records in memory prior to output." << endl; + cerr << "\t\tThe user must ensure that there is adequate memory for this." << endl << endl; + cerr << "Options: " << endl; + + cerr << "\t-n\t" << "The number of records to generate." << endl; + cerr << "\t\t- Default = 1,000,000." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl; + cerr << "\t\t- By default, the seed is chosen automatically." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default writes compressed BAM." << endl << endl; + + cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl; + cerr << "\t\tis to write output in BAM when using -abam." << endl << endl; + + cerr << "\t-s\t" << "Require same strandedness. That is, only give records" << endl; + cerr << "\t\tthat have the same strand. Use '-s forward' or '-s reverse'" << endl; + cerr << "\t\tfor forward or reverse strand records, respectively." << endl; + cerr << "\t\t- By default, records are reported without respect to strand." << endl << endl; + + cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\tTBD: Enter other usage notes here." << endl << endl; + + // end the program here + exit(1); + +} + + diff --git a/src/utils/BinTree/Makefile b/src/utils/BinTree/Makefile index 4bfe2edcbc43dba6bfa25e210e5f7e5ada221a84..de04c816782689fc36ab33275d89bfa8f8d2a61e 100644 --- a/src/utils/BinTree/Makefile +++ b/src/utils/BinTree/Makefile @@ -12,7 +12,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools/src + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ # ---------------------------------- # define our source and object files diff --git a/src/utils/Contexts/Context.cpp b/src/utils/Contexts/Context.cpp index c201fbe54fe154f8fe4a88c514676933bb2c7398..a3e410b7e17db14bb4ec7dff1e3dd35982500e53 100644 --- a/src/utils/Contexts/Context.cpp +++ b/src/utils/Contexts/Context.cpp @@ -6,6 +6,8 @@ */ #include "Context.h" +#include <unistd.h> +#include <sys/types.h> Context::Context() : @@ -43,9 +45,15 @@ Context::Context() _reportCount(false), _maxDistance(0), _reportNames(false), - _reportScores(false) + _reportScores(false), + _numOutputRecords(0), + _hasConstantSeed(false), + _seed(0), + _forwardOnly(false), + _reverseOnly(false) { _programNames["intersect"] = INTERSECT; + _programNames["sample"] = SAMPLE; _validScoreOps.insert("sum"); _validScoreOps.insert("max"); @@ -71,16 +79,36 @@ bool Context::determineOutputType() { } //test whether output should be BED or BAM. //If the user explicitly requested BED, then it's BED. - //Otherwise, if there are any BAM files in the input, - //then the output should be BAM. - if (getExplicitBedOutput() || getQueryFileType() != FileRecordTypeChecker::BAM_FILE_TYPE) { + if (getExplicitBedOutput()) { setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE); - } else { + _outputTypeDetermined = true; + return true; + } + //If this is an intersection, and the query is BAM, then + //the output is BAM. + if (_program == INTERSECT && getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE); + _outputTypeDetermined = true; + return true; + + } + //Otherwise, if there are any BAM files in the input, + //then the output should be BAM. + for (size_t i = 0; i < _inputFiles.size(); i++) { + if (_inputFiles[i]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) { + setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE); + _bamHeaderAndRefIdx = i; + _outputTypeDetermined = true; + return true; + } } + + //Okay, it's bed. + setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE); _outputTypeDetermined = true; return true; + } void Context::openGenomeFile(const QuickString &genomeFilename) @@ -222,6 +250,23 @@ bool Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if (strcmp(argv[i], "-s") == 0) { setSameStrand(true); markUsed(i - skipFirstArgs); + + if (_program == SAMPLE) { + if (argc <= i+1) { + _errorMsg = "Error: -s option given, but \"forward\" or \"reverse\" not specified."; + return false; + } + if (strcmp(argv[i+1], "forward") == 0) { + _forwardOnly = true; + } else if (strcmp(argv[i+1], "reverse") == 0) { + _reverseOnly = true; + } else { + _errorMsg = "Error: -s option given, but \"forward\" or \"reverse\" not specified."; + return false; + } + i++; + markUsed(i - skipFirstArgs); + } } else if (strcmp(argv[i], "-S") == 0) { setDiffStrand(true); @@ -246,6 +291,26 @@ bool Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if(strcmp(argv[i], "-header") == 0) { setPrintHeader(true); markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "-n") == 0) { + if (argc <= i+1) { + _errorMsg = "Error: -n option given, but no number of output records specified."; + return false; + } + setNumOutputRecords(atoi(argv[i + 1])); + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); + } else if (strcmp(argv[i], "-seed") == 0) { + if (argc <= i+1) { + _errorMsg = "Error: -seed option given, but no seed specified."; + return false; + } + _hasConstantSeed = true; + _seed = atoi(argv[i+1]); + srand(_seed); + markUsed(i - skipFirstArgs); + i++; + markUsed(i - skipFirstArgs); } } return true; @@ -257,8 +322,55 @@ bool Context::isValidState() return false; } - if (getProgram() == INTERSECT && (_queryFileIdx == -1 || _databaseFileIdx == -1)) { - _errorMsg = "Error: Intersect program was not given a query and database file."; + if (_program == INTERSECT) { + return isValidIntersectState(); + } + if (_program == SAMPLE) { + return isValidSampleState(); + } + return false; +} + + +bool Context::cmdArgsValid() +{ + bool retval = true; + for (int i = _skipFirstArgs; i < _argc; i++) { + if (!isUsed(i - _skipFirstArgs)) { + _errorMsg += "\nERROR. Unrecognized argument: "; + _errorMsg += _argv[i]; + retval = false; + } + } + return retval; +} + +int Context::getBamHeaderAndRefIdx() { + if (_bamHeaderAndRefIdx != -1) { + //already found which BAM file to use for the header + return _bamHeaderAndRefIdx; + } + if (_inputFiles[_queryFileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) { + _bamHeaderAndRefIdx = _queryFileIdx; + } else { + _bamHeaderAndRefIdx = _databaseFileIdx; + } + return _bamHeaderAndRefIdx; +} + +int Context::getUnspecifiedSeed() +{ + // thanks to Rob Long for the tip. + _seed = (unsigned)time(0)+(unsigned)getpid(); + srand(_seed); + return _seed; +} + + +bool Context::isValidIntersectState() +{ + if (_queryFileIdx == -1 || _databaseFileIdx == -1) { + _errorMsg = "Error: query and database files not specified."; return false; } @@ -317,32 +429,17 @@ bool Context::isValidState() return _inputFiles.size() > 0; } -bool Context::cmdArgsValid() +bool Context::isValidSampleState() { - bool retval = true; - for (int i = _skipFirstArgs; i < _argc; i++) { - if (!isUsed(i - _skipFirstArgs)) { - _errorMsg += "\nERROR. Unrecognized argument: "; - _errorMsg += _argv[i]; - retval = false; - } - } - return retval; -} - -int Context::getBamHeaderAndRefIdx() { - if (_bamHeaderAndRefIdx != -1) { - //already found which BAM file to use for the header - return _bamHeaderAndRefIdx; + if (_inputFiles.size() != 1) { + _errorMsg = "Error: input file not specified."; + // Allow one and only input file for now + return false; } - if (_inputFiles[_queryFileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) { - _bamHeaderAndRefIdx = _queryFileIdx; - } else { - _bamHeaderAndRefIdx = _databaseFileIdx; + if (_numOutputRecords < 1) { + _errorMsg = "Error: number of output records not specified."; + return false; } - return _bamHeaderAndRefIdx; + return true; } - - - diff --git a/src/utils/Contexts/Context.h b/src/utils/Contexts/Context.h index 53000b1ce0be89c4037ef3cee1dc5fc9e1375b50..e41e7cf724e8b7da108d0407259ebecb4c3f3dda 100644 --- a/src/utils/Contexts/Context.h +++ b/src/utils/Contexts/Context.h @@ -9,15 +9,15 @@ #define CONTEXTBASE_H_ -// This is an abstract base class for all context objects. -// -// Context classes handle the settings for an operation, +// The Context class handles the settings for an operation, // such as merge, intersect, jaccard, etc. // // Settings include the input and output parameters, such as input // files, file types (if explicitly provided), genome files, // run options, output format, etc. +#include <cstdlib> +#include "version.h" #include "BedtoolsTypes.h" #include "FileRecordTypeChecker.h" #include "NewGenomeFile.h" @@ -34,7 +34,7 @@ public: typedef FileRecordTypeChecker::RECORD_TYPE ContextRecordType; typedef enum {UNSPECIFIED_PROGRAM, INTERSECT, WINDOW, CLOSEST, COVERAGE, MAP, GENOMECOV, MERGE, CLUSTER, - COMPLEMENT, SUBTRACT, SLOP, FLANK, SORT, RANDOM, SHUFFLE, ANNOTATE, MULTIINTER, UNIONBEDG, PAIRTOBED, + COMPLEMENT, SUBTRACT, SLOP, FLANK, SORT, RANDOM, SAMPLE, SHUFFLE, ANNOTATE, MULTIINTER, UNIONBEDG, PAIRTOBED, PAIRTOPAIR,BAMTOBED, BEDTOBAM, BEDTOFASTQ, BEDPETOBAM, BED12TOBED6, GETFASTA, MASKFASTA, NUC, MULTICOV, TAG, JACCARD, OVERLAP, IGV, LINKS,MAKEWINDOWS, GROUPBY, EXPAND } PROGRAM_TYPE; @@ -53,6 +53,15 @@ public: void setInputFileType(int fileNum, ContextFileType fileType) { _inputFiles[fileNum]._fileType = fileType; } ContextRecordType getInputRecordType(int fileNum) const { return _inputFiles[fileNum]._recordType; } void setInputRecordType(int fileNum, ContextRecordType recordType) { _inputFiles[fileNum]._recordType = recordType; } + //HERE ARE SOME SIMPLER VERSIONS OF THE ABOVE FOR APPS THAT HAVE ONLY ONE INPUT FILE + const QuickString &getInputFileName() const { return _inputFiles[0]._fileName; } + ContextFileType getInputFileType() const { return _inputFiles[0]._fileType; } + void setInputFileType(ContextFileType fileType) { _inputFiles[0]._fileType = fileType; } + ContextRecordType getInputRecordType() const { return _inputFiles[0]._recordType; } + void setInputRecordType(ContextRecordType recordType) { _inputFiles[0]._recordType = recordType; } + int getInputFileIdx() const { return 0; } + + bool determineOutputType(); @@ -149,6 +158,8 @@ public: bool getSameStrand() const {return _sameStrand; } void setSameStrand(bool val) { _sameStrand = val; } + bool getForwardOnly() const { return _forwardOnly; } + bool getReverseOnly() const { return _reverseOnly; } bool getDiffStrand() const {return _diffStrand; } void setDiffStrand(bool val) { _diffStrand = val; } @@ -184,6 +195,21 @@ public: void setScoreOp(const QuickString &op) { _scoreOp = op; } + // METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER + // OF OUTPUT RECORDS. + int getNumOutputRecords() const { return _numOutputRecords; } + void setNumOutputRecords(int val) { _numOutputRecords = val; } + + //SEED OPS FOR APPS WITH RANDOMNESS + //When a seed has been specified on the command line, srand + //will have been already been called during command line parsing. + //For reference, srand is the C function that initializes the + //psuedo-random number generator. If a seed has not been + //specifified, the user may call the + bool hasConstantSeed() const { return _hasConstantSeed; } + int getConstantSeed() const { return _seed; } + int getUnspecifiedSeed(); + protected: PROGRAM_TYPE _program; @@ -235,7 +261,7 @@ protected: bool _explicitBedOutput; int _queryFileIdx; int _databaseFileIdx; - int _bamHeaderAndRefIdx; + size_t _bamHeaderAndRefIdx; int _maxNumDatabaseFields; bool _useFullBamTags; @@ -246,10 +272,18 @@ protected: QuickString _scoreOp; set<QuickString> _validScoreOps; + int _numOutputRecords; + + bool _hasConstantSeed; + int _seed; + bool _forwardOnly; + bool _reverseOnly; + void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); - + bool isValidIntersectState(); + bool isValidSampleState(); }; #endif /* CONTEXT_H_ */ diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index df9a819e5be900882f484cdd033c2aeaf4e47631..c0de287ff1c9b5842da7e5fca2945bc63cb12a6e 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -8,7 +8,9 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ - -I$(UTILITIES_DIR)/BamTools/include + -I$(UTILITIES_DIR)/BamTools/include/ \ + -I$(UTILITIES_DIR)/version/ + # ---------------------------------- # define our source and object files diff --git a/src/utils/FileRecordTools/FileReaders/Makefile b/src/utils/FileRecordTools/FileReaders/Makefile index 3e6ad82db933214ac4188c7e15e4230a8053da3b..37ff6e89f0cc87f1b4597dbc2f6fba8a6b9b4a8e 100644 --- a/src/utils/FileRecordTools/FileReaders/Makefile +++ b/src/utils/FileRecordTools/FileReaders/Makefile @@ -11,6 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/BamTools/include/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/version/ # ---------------------------------- diff --git a/src/utils/FileRecordTools/Makefile b/src/utils/FileRecordTools/Makefile index 69d2f4de81580f6003596cbf81187b2ab5bf3b2d..cacf887f920b03582030d3041be93fbacacf88b8 100644 --- a/src/utils/FileRecordTools/Makefile +++ b/src/utils/FileRecordTools/Makefile @@ -11,7 +11,9 @@ INCLUDES = -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools/src + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ + SUBDIRS = ./FileReaders \ ./Records \ diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp index b6b8fd29aed602caa2e36b4e56efeae820a46fed..74088951184422459b946ff381d7099664b3a465 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.cpp +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -7,7 +7,7 @@ #include "RecordOutputMgr.h" #include "Context.h" - +#include "BlockMgr.h" #include "Bed3Interval.h" #include "Bed4Interval.h" #include "BedGraphInterval.h" @@ -28,9 +28,10 @@ RecordOutputMgr::RecordOutputMgr() : _context(NULL), _printable(true), _bamWriter(NULL), - _currBlockList(NULL) + _currBlockList(NULL), + _blockMgr(NULL) { - + _blockMgr = new BlockMgr(); } RecordOutputMgr::~RecordOutputMgr() @@ -43,10 +44,14 @@ RecordOutputMgr::~RecordOutputMgr() delete _bamWriter; _bamWriter = NULL; } + delete _blockMgr; + _blockMgr = NULL; + } bool RecordOutputMgr::init(Context *context) { _context = context; + _blockMgr->setContext(_context); if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { //set-up BAM writer. _bamWriter = new BamTools::BamWriter(); @@ -57,23 +62,18 @@ bool RecordOutputMgr::init(Context *context) { //for everything but BAM, we'll copy output to an output buffer before printing. _outBuf.reserve(MAX_OUTBUF_SIZE); } - if (_context->getAnyHit() || _context->getNoHit() || _context->getWriteCount()) { - _printable = false; - } - if (!_context->isValidState()) { - fprintf(stderr, "%s\n", context->getErrorMsg().c_str()); - exit(1); - } - if (_context->getPrintHeader()) { - _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); + if (_context->getProgram() == Context::INTERSECT) { + if (_context->getAnyHit() || _context->getNoHit() || _context->getWriteCount()) { + _printable = false; + } } return true; } -void RecordOutputMgr::printHeader(const string &header) -{ - _outBuf.append(header); -} +//void RecordOutputMgr::printHeader(const string &header) +//{ +// _outBuf.append(header); +//} bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { printBamType bamCode = printBamRecord(keyList); @@ -109,60 +109,98 @@ RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &key return NOT_BAM; } +void RecordOutputMgr::printRecord(const Record *record) +{ + RecordKeyList keyList(record); + printRecord(keyList); +} + +void RecordOutputMgr::printRecord(RecordKeyList &keyList) { + if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { + RecordKeyList blockList(keyList.getKey()); + bool deleteBlocks = false; + _blockMgr->getBlocks(blockList, deleteBlocks); + printRecord(keyList, &blockList); + if (deleteBlocks) { + _blockMgr->deleteBlocks(blockList); + } + return; + } + printRecord(keyList, NULL); + +} + void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockList) { if (needsFlush()) { flush(); } -// if (keyList.getKey()->getChrName() == "chr1" && keyList.getKey()->getStartPos() == 11996) { -// printf("Break point here.\n"); -// } //The first time we print a record is when we print any header, because the header //hasn't been read from the query file until after the first record has also been read. if (_context->getPrintHeader()) { - _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); - _context->setPrintHeader(false); + checkForHeader(); } const_cast<Record *>(keyList.getKey())->undoZeroLength(); - _currBlockList = blockList; - if (_printable) { - if (keyList.empty()) { - if (_context->getWriteAllOverlap()) { - // -wao the user wants to force the reporting of 0 overlap - if (printKeyAndTerminate(keyList)) { + if (_context->getProgram() == Context::INTERSECT) { + if (_printable) { + if (keyList.empty()) { + if (_context->getWriteAllOverlap()) { + // -wao the user wants to force the reporting of 0 overlap + if (printKeyAndTerminate(keyList)) { + _currBlockList = NULL; + return; + } + tab(); + null(false, true); + tab(); + _outBuf.append('0'); + newline(); + } + else if (_context->getLeftJoin()) { + if (printKeyAndTerminate(keyList)) { + _currBlockList = NULL; + return; + } + tab(); + null(false, true); + newline(); _currBlockList = NULL; return; } - tab(); - null(false, true); - tab(); - _outBuf.append('0'); - newline(); - } - else if (_context->getLeftJoin()) { - if (printKeyAndTerminate(keyList)) { + } else { + if (printBamRecord(keyList, true) == BAM_AS_BAM) { _currBlockList = NULL; return; } - tab(); - null(false, true); - newline(); - } - } else { - if (printBamRecord(keyList, true) == BAM_AS_BAM) { - _currBlockList = NULL; - return; - } - for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - reportOverlapDetail(keyList.getKey(), iter->value()); + for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + reportOverlapDetail(keyList.getKey(), iter->value()); + } } + } else { // not printable + reportOverlapSummary(keyList); + } + _currBlockList = NULL; + } else if (_context->getProgram() == Context::SAMPLE) { + if (!printKeyAndTerminate(keyList)) { + newline(); + } + _currBlockList = NULL; + return; + } +} + +void RecordOutputMgr::checkForHeader() { + if (_context->getProgram() == Context::INTERSECT) { + if (_context->getPrintHeader()) { + _outBuf.append(_context->getHeader(_context->getQueryFileIdx())); + } + } else if (_context->getProgram() == Context::SAMPLE) { + if (_context->getPrintHeader()) { + _outBuf.append(_context->getHeader(_context->getInputFileIdx())); } - } else { // not printable - reportOverlapSummary(keyList); } - _currBlockList = NULL; } void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord) @@ -285,15 +323,15 @@ void RecordOutputMgr::reportOverlapSummary(RecordKeyList &keyList) void RecordOutputMgr::null(bool queryType, bool dbType) { FileRecordTypeChecker::RECORD_TYPE recordType = FileRecordTypeChecker::UNKNOWN_RECORD_TYPE; - if (queryType) { - recordType = _context->getQueryRecordType(); - } else if (dbType) { - recordType = _context->getDatabaseRecordType(); - } else { - return; //TBD? Implement printNull for records that are neither query nor database. - //Not sure if this would ever be necessary. + if (_context->getProgram() == Context::INTERSECT) { + if (queryType) { + recordType = _context->getQueryRecordType(); + } else if (dbType) { + recordType = _context->getDatabaseRecordType(); + } + } else if (_context->getProgram() == Context::SAMPLE) { + recordType = _context->getInputRecordType(); } - //This is kind of a hack. Need an instance of the correct class of record in order to call it's printNull method. Record *dummyRecord = NULL; diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h index 932be18f04714ee96f98e21d075f55bb032b6733..1bb8bbfcadb4f84e98a9a2039a63f3e33d618bd1 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.h +++ b/src/utils/FileRecordTools/RecordOutputMgr.h @@ -14,7 +14,7 @@ using namespace std; #include "api/BamWriter.h" class Context; - +class BlockMgr; class RecordOutputMgr { public: @@ -24,30 +24,34 @@ public: //The init method must be called after all the input files are open. bool init(Context *context); - typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; - - void printHeader(const string &); - bool printKeyAndTerminate(RecordKeyList &keyList); - printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); - void printRecord(RecordKeyList &keyList, RecordKeyList *blockList = NULL); - void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord); - void reportOverlapSummary(RecordKeyList &keyList); +// void printHeader(const string &); + void printRecord(const Record *record); + void printRecord(RecordKeyList &keyList); private: + typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; + Context *_context; bool _printable; BamTools::BamWriter *_bamWriter; RecordKeyList *_currBlockList; QuickString _outBuf; - + BlockMgr *_blockMgr; //some helper functions to neaten the code. void tab() { _outBuf.append('\t'); } void newline() { _outBuf.append('\n'); } void null(bool queryType, bool dbType); + void printRecord(RecordKeyList &keyList, RecordKeyList *blockList); void printKey(const Record *key); void printKey(const Record *key, const QuickString & start, const QuickString & end); + bool printKeyAndTerminate(RecordKeyList &keyList); + printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); + void checkForHeader(); + void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord); + void reportOverlapSummary(RecordKeyList &keyList); + static const unsigned int MAX_OUTBUF_SIZE = 16384; //16 K bool needsFlush() const { return _outBuf.size() >= MAX_OUTBUF_SIZE *.9; } void flush(); diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile index f1e883217d366ad0a90be082fa2578862bd1b79d..6bfd6106ab52fb820c3759b74fa175639d948511 100644 --- a/src/utils/FileRecordTools/Records/Makefile +++ b/src/utils/FileRecordTools/Records/Makefile @@ -10,7 +10,9 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/Contexts/ \ -I$(UTILITIES_DIR)/GenomeFile/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools/src + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ + # ---------------------------------- # define our source and object files diff --git a/src/utils/NewChromsweep/Makefile b/src/utils/NewChromsweep/Makefile index 7c1808e989f149e4071824fb07a4d62d20452bff..8f4d93105b30488d40a10599c2d1bd0b083576cf 100644 --- a/src/utils/NewChromsweep/Makefile +++ b/src/utils/NewChromsweep/Makefile @@ -12,7 +12,9 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools/src + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/version/ + # ---------------------------------- # define our source and object files