Commit b1aa9ab7 authored by nkindlon's avatar nkindlon
Browse files

Added sample sub-program with needed changes to main, Makefiles, Context,...

Added sample sub-program with needed changes to main, Makefiles, Context, FileRecordMgr, and RecordOutputMgr.
parent 3b9fc3e1
......@@ -57,6 +57,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/randomBed \
$(SRC_DIR)/regressTest \
$(SRC_DIR)/reldist \
$(SRC_DIR)/sampleFile \
$(SRC_DIR)/shuffleBed \
$(SRC_DIR)/slopBed \
$(SRC_DIR)/sortBed \
......
......@@ -66,6 +66,7 @@ int pairtobed_main(int argc, char* argv[]);//
int pairtopair_main(int argc, char* argv[]);//
int random_main(int argc, char* argv[]); //
int reldist_main(int argc, char* argv[]); //
int sample_main(int argc, char* argv[]); //
int shuffle_main(int argc, char* argv[]); //
int slop_main(int argc, char* argv[]); //
int sort_main(int argc, char* argv[]); //
......@@ -138,6 +139,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "makewindows") return windowmaker_main(argc-1, argv+1);
else if (sub_cmd == "groupby") return groupby_main(argc-1, argv+1);
else if (sub_cmd == "expand") return expand_main(argc-1, argv+1);
else if (sub_cmd == "sample") return sample_main(argc-1, argv+1);
else if (sub_cmd == "neksb1") return nek_sandbox1_main(argc-1, argv+1);
else if (sub_cmd == "regresstest") return regress_test_main(argc, argv); //this command does need all the orig args.
// help
......
......@@ -37,16 +37,16 @@ FileIntersect::~FileIntersect(void) {
}
void FileIntersect::processHits(RecordKeyList &hits) {
if (hits.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
RecordKeyList blockList(hits.getKey());
bool deleteBlocks = false;
_blockMgr->getBlocks(blockList, deleteBlocks);
_recordOutputMgr->printRecord(hits, &blockList);
if (deleteBlocks) {
_blockMgr->deleteBlocks(blockList);
}
return;
}
// if (hits.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) {
// RecordKeyList blockList(hits.getKey());
// bool deleteBlocks = false;
// _blockMgr->getBlocks(blockList, deleteBlocks);
// _recordOutputMgr->printRecord(hits, &blockList);
// if (deleteBlocks) {
// _blockMgr->deleteBlocks(blockList);
// }
// return;
// }
_recordOutputMgr->printRecord(hits);
}
......
......@@ -12,7 +12,6 @@
using namespace std;
#include "intersectFile.h"
#include "version.h"
#include "Context.h"
// define our program name
......@@ -23,7 +22,6 @@ void intersect_help(void);
int intersect_main(int argc, char* argv[]) {
Context *context = new Context();
if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
if (!context->getErrorMsg().empty()) {
cerr << context->getErrorMsg() << endl;
......
......@@ -14,7 +14,9 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= sampleMain.cpp SampleFile.cpp SampleFile.h
OBJECTS= sampleMain.o SampleFile.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= sample
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/sampleMain.o $(OBJ_DIR)/SampleFile.o
.PHONY: clean
/*
* SampleFile.cpp
*
* Created on: Nov 18, 2013
* Author: nek3d
*/
#include "SampleFile.h"
#include "Context.h"
#include "FileRecordMgr.h"
#include "RecordOutputMgr.h"
static const bool SampleRecordLtFn(const Record *rec1, const Record *rec2) {
return (*rec1 < *rec2);
}
SampleFile::SampleFile(Context *context)
: _context(context),
_inputFile(NULL),
_outputMgr(NULL),
_numSamples(0),
_numCurrSamples(0),
_currRecordNum(0)
{
_numSamples = context->getNumOutputRecords();
}
SampleFile::~SampleFile() {
}
bool SampleFile::takeSample()
{
//we're only operating on one file, so the idx is zero.
_inputFile = new FileRecordMgr(0, _context);
if (!_inputFile->open()) {
return false; // FRM will give the error and die anyway,
//no error message needed here.
}
_samples.resize(_numSamples, NULL);
//Context object takes care of the seed, either user given or randomly
//generated, and seeds the call to srand with it, so we don't have to
//here.
if (!_context->hasConstantSeed()) {
_context->getUnspecifiedSeed();
}
_context->determineOutputType();
_outputMgr = new RecordOutputMgr();
_outputMgr->init(_context);
while (!_inputFile->eof()) {
Record *record = _inputFile->allocateAndGetNextRecord();
if (record == NULL) {
continue;
}
if (!keepRecord(record)) {
_inputFile->deleteRecord(record);
}
_currRecordNum++;
}
if (_currRecordNum < _numSamples) {
//die with error;
cerr << "Error: Input file has fewer records than the requested number of output records." << endl;
exit(1);
}
//If the output type is BAM, must sort the output records.
if (_context->getOutputFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
sort(_samples.begin(), _samples.end(), SampleRecordLtFn);
}
// Now output all the kept records, then do cleanup.
for (size_t i=0; i < _numSamples; i++) {
_outputMgr->printRecord(_samples[i]);
}
delete _outputMgr;
_inputFile->close();
//clean up.
delete _inputFile;
return true;
}
bool SampleFile::keepRecord(Record *record)
{
if (!strandComplies(record)) {
return false;
}
if (_numCurrSamples < _numSamples) {
_samples[_numCurrSamples] = record;
_numCurrSamples++;
return true;
}
// We need a random number in the range [0, _currRecordNum].
// Must combine two consective calls to rand()
// because RAND_MAX is 2^31 (2147483648), whereas
// the number of input records could be far larger.
size_t idx = ((((long) rand()) << 31) | rand()) % _currRecordNum;
if (idx < _numSamples) {
//replace old record at idx with this new one.
_inputFile->deleteRecord(_samples[idx]);
_samples[idx] = record;
return true;
}
return false;
}
bool SampleFile::strandComplies(const Record * record) {
if (!_context->getSameStrand()) {
return true;
}
if (_context->getForwardOnly() && record->getStrand() == Record::FORWARD) {
return true;
}
if (_context->getReverseOnly() && record->getStrand() == Record::REVERSE) {
return true;
}
return false;
}
/*
* SampleFile.h
*
* Created on: Nov 18, 2013
* Author: nek3d
*/
#ifndef SAMPLEFILE_H_
#define SAMPLEFILE_H_
using namespace std;
#include "Context.h"
#include "Record.h"
#include <vector>
class FileRecordMgr;
class Context;
class RecordOutputMgr;
class SampleFile {
public:
SampleFile(Context *context);
~SampleFile();
bool takeSample();
private:
Context *_context;
FileRecordMgr *_inputFile;
RecordOutputMgr *_outputMgr;
vector<Record *> _samples;
size_t _numSamples; //the number of samples we ultimately want
size_t _numCurrSamples; //the number of samples kept so far.
size_t _currRecordNum;
int _seed;
bool keepRecord(Record *record);
bool strandComplies(const Record * record);
};
#endif /* SAMPLEFILE_H_ */
/*
* sampleMain.cpp
*
* Created on: Nov 18, 2013
* Author: nek3d
*/
#include <iostream>
#include "Context.h"
#include "SampleFile.h"
#define PROGRAM_NAME "bedtools sample"
void sample_help(void);
int sample_main(int argc, char **argv)
{
Context *context = new Context();
if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
if (!context->getErrorMsg().empty()) {
cerr << context->getErrorMsg() << endl;
}
sample_help();
delete context;
return 0;
}
SampleFile *sampleFile = new SampleFile(context);
bool retVal = sampleFile->takeSample();
delete sampleFile;
delete context;
return retVal ? 0 : 1;
}
void sample_help(void) {
cerr << "\nTool: bedtools sample (aka sampleBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Take sample of input file(s) using reservoir sampling algorithm." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf/bam>" << endl << endl;
cerr << "WARNING:\tThe current sample algorithm will hold all requested sample records in memory prior to output." << endl;
cerr << "\t\tThe user must ensure that there is adequate memory for this." << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-n\t" << "The number of records to generate." << endl;
cerr << "\t\t- Default = 1,000,000." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl;
cerr << "\t\t- By default, the seed is chosen automatically." << endl;
cerr << "\t\t- (INTEGER)" << endl << endl;
cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default writes compressed BAM." << endl << endl;
cerr << "\t-bed\t" << "When using BAM input (-abam), write output as BED. The default" << endl;
cerr << "\t\tis to write output in BAM when using -abam." << endl << endl;
cerr << "\t-s\t" << "Require same strandedness. That is, only give records" << endl;
cerr << "\t\tthat have the same strand. Use '-s forward' or '-s reverse'" << endl;
cerr << "\t\tfor forward or reverse strand records, respectively." << endl;
cerr << "\t\t- By default, records are reported without respect to strand." << endl << endl;
cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\tTBD: Enter other usage notes here." << endl << endl;
// end the program here
exit(1);
}
......@@ -12,7 +12,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src
-I$(UTILITIES_DIR)/BamTools/src/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
......
......@@ -6,6 +6,8 @@
*/
#include "Context.h"
#include <unistd.h>
#include <sys/types.h>
Context::Context()
:
......@@ -43,9 +45,15 @@ Context::Context()
_reportCount(false),
_maxDistance(0),
_reportNames(false),
_reportScores(false)
_reportScores(false),
_numOutputRecords(0),
_hasConstantSeed(false),
_seed(0),
_forwardOnly(false),
_reverseOnly(false)
{
_programNames["intersect"] = INTERSECT;
_programNames["sample"] = SAMPLE;
_validScoreOps.insert("sum");
_validScoreOps.insert("max");
......@@ -71,16 +79,36 @@ bool Context::determineOutputType() {
}
//test whether output should be BED or BAM.
//If the user explicitly requested BED, then it's BED.
//Otherwise, if there are any BAM files in the input,
//then the output should be BAM.
if (getExplicitBedOutput() || getQueryFileType() != FileRecordTypeChecker::BAM_FILE_TYPE) {
if (getExplicitBedOutput()) {
setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE);
} else {
_outputTypeDetermined = true;
return true;
}
//If this is an intersection, and the query is BAM, then
//the output is BAM.
if (_program == INTERSECT && getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE);
_outputTypeDetermined = true;
return true;
}
//Otherwise, if there are any BAM files in the input,
//then the output should be BAM.
for (size_t i = 0; i < _inputFiles.size(); i++) {
if (_inputFiles[i]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) {
setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE);
_bamHeaderAndRefIdx = i;
_outputTypeDetermined = true;
return true;
}
}
//Okay, it's bed.
setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE);
_outputTypeDetermined = true;
return true;
}
void Context::openGenomeFile(const QuickString &genomeFilename)
......@@ -222,6 +250,23 @@ bool Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if (strcmp(argv[i], "-s") == 0) {
setSameStrand(true);
markUsed(i - skipFirstArgs);
if (_program == SAMPLE) {
if (argc <= i+1) {
_errorMsg = "Error: -s option given, but \"forward\" or \"reverse\" not specified.";
return false;
}
if (strcmp(argv[i+1], "forward") == 0) {
_forwardOnly = true;
} else if (strcmp(argv[i+1], "reverse") == 0) {
_reverseOnly = true;
} else {
_errorMsg = "Error: -s option given, but \"forward\" or \"reverse\" not specified.";
return false;
}
i++;
markUsed(i - skipFirstArgs);
}
}
else if (strcmp(argv[i], "-S") == 0) {
setDiffStrand(true);
......@@ -246,6 +291,26 @@ bool Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
else if(strcmp(argv[i], "-header") == 0) {
setPrintHeader(true);
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-n") == 0) {
if (argc <= i+1) {
_errorMsg = "Error: -n option given, but no number of output records specified.";
return false;
}
setNumOutputRecords(atoi(argv[i + 1]));
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-seed") == 0) {
if (argc <= i+1) {
_errorMsg = "Error: -seed option given, but no seed specified.";
return false;
}
_hasConstantSeed = true;
_seed = atoi(argv[i+1]);
srand(_seed);
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
}
}
return true;
......@@ -257,8 +322,55 @@ bool Context::isValidState()
return false;
}
if (getProgram() == INTERSECT && (_queryFileIdx == -1 || _databaseFileIdx == -1)) {
_errorMsg = "Error: Intersect program was not given a query and database file.";
if (_program == INTERSECT) {
return isValidIntersectState();
}
if (_program == SAMPLE) {
return isValidSampleState();
}
return false;
}
bool Context::cmdArgsValid()
{
bool retval = true;
for (int i = _skipFirstArgs; i < _argc; i++) {
if (!isUsed(i - _skipFirstArgs)) {
_errorMsg += "\nERROR. Unrecognized argument: ";
_errorMsg += _argv[i];
retval = false;
}
}
return retval;
}
int Context::getBamHeaderAndRefIdx() {
if (_bamHeaderAndRefIdx != -1) {
//already found which BAM file to use for the header
return _bamHeaderAndRefIdx;
}
if (_inputFiles[_queryFileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) {
_bamHeaderAndRefIdx = _queryFileIdx;
} else {
_bamHeaderAndRefIdx = _databaseFileIdx;
}
return _bamHeaderAndRefIdx;
}
int Context::getUnspecifiedSeed()
{
// thanks to Rob Long for the tip.
_seed = (unsigned)time(0)+(unsigned)getpid();
srand(_seed);
return _seed;
}
bool Context::isValidIntersectState()
{
if (_queryFileIdx == -1 || _databaseFileIdx == -1) {
_errorMsg = "Error: query and database files not specified.";
return false;
}
......@@ -317,32 +429,17 @@ bool Context::isValidState()
return _inputFiles.size() > 0;
}
bool Context::cmdArgsValid()
bool Context::isValidSampleState()
{
bool retval = true;
for (int i = _skipFirstArgs; i < _argc; i++) {
if (!isUsed(i - _skipFirstArgs)) {
_errorMsg += "\nERROR. Unrecognized argument: ";
_errorMsg += _argv[i];
retval = false;
}
}
return retval;
}
int Context::getBamHeaderAndRefIdx() {
if (_bamHeaderAndRefIdx != -1) {
//already found which BAM file to use for the header
return _bamHeaderAndRefIdx;
if (_inputFiles.size() != 1) {
_errorMsg = "Error: input file not specified.";
// Allow one and only input file for now
return false;
}
if (_inputFiles[_queryFileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) {
_bamHeaderAndRefIdx = _queryFileIdx;
} else {
_bamHeaderAndRefIdx = _databaseFileIdx;
if (_numOutputRecords < 1) {
_errorMsg = "Error: number of output records not specified.";
return false;
}
return _bamHeaderAndRefIdx;
return true;
}
......@@ -9,15 +9,15 @@
#define CONTEXTBASE_H_
// This is an abstract base class for all context objects.
//
// Context classes handle the settings for an operation,
// The Context class handles the settings for an operation,
// such as merge, intersect, jaccard, etc.
//
// Settings include the input and output parameters, such as input
// files, file types (if explicitly provided), genome files,
// run options, output format, etc.
#include <cstdlib>