Commit b912d88c authored by nkindlon's avatar nkindlon
Browse files

Refactored contexts for validation, added unit tests, enabled -plit for map.

parent 86346f36
......@@ -86,7 +86,8 @@ UTIL_SUBDIRS = $(SRC_DIR)/utils/bedFile \
$(SRC_DIR)/utils/BlockedIntervals \
$(SRC_DIR)/utils/Fasta \
$(SRC_DIR)/utils/VectorOps \
$(SRC_DIR)/utils/GenomeFile
$(SRC_DIR)/utils/GenomeFile \
$(SRC_DIR)/utils/RecordOutputMgr
BUILT_OBJECTS = $(OBJ_DIR)/*.o
......
......@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
......
......@@ -24,18 +24,16 @@ FileIntersect::FileIntersect(ContextIntersect *context)
_recordOutputMgr(NULL)
{
_recordOutputMgr = new RecordOutputMgr();
_recordOutputMgr->init(_context);
if (_context->getObeySplits()) {
_blockMgr = new BlockMgr();
_blockMgr->setContext(context);
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_recordOutputMgr->setSplitInfo(_blockMgr);
}
}
FileIntersect::~FileIntersect(void) {
if (_blockMgr != NULL) {
delete _blockMgr;
_blockMgr = NULL;
}
delete _blockMgr;
_blockMgr = NULL;
delete _recordOutputMgr;
}
......@@ -59,9 +57,6 @@ bool FileIntersect::processSortedFiles()
if (!sweep.init()) {
return false;
}
if (!_recordOutputMgr->init(_context)) {
return false;
}
RecordKeyList hitSet;
while (sweep.next(hitSet)) {
......@@ -79,23 +74,11 @@ bool FileIntersect::processSortedFiles()
bool FileIntersect::processUnsortedFiles()
{
const QuickString &databaseFilename = _context->getDatabaseFileName();
BinTree *binTree = new BinTree(_context->getDatabaseFileIdx(), _context);
FileRecordMgr *queryFRM = new FileRecordMgr(_context->getQueryFileIdx(), _context);
if (!queryFRM->open()) {
return false;
}
if (!binTree->loadDB()) {
fprintf(stderr, "Error: Unable to load database file %s.\n", databaseFilename.c_str());
delete binTree;
exit(1);
}
BinTree *binTree = new BinTree( _context);
binTree->loadDB();
FileRecordMgr *queryFRM = _context->getFile(_context->getQueryFileIdx());
_context->determineOutputType();
_recordOutputMgr->init(_context);
while (!queryFRM->eof()) {
Record *queryRecord = queryFRM->allocateAndGetNextRecord();
......@@ -117,7 +100,6 @@ bool FileIntersect::processUnsortedFiles()
queryFRM->close();
//clean up.
delete queryFRM;
delete binTree;
return true;
}
......@@ -28,6 +28,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
......
......@@ -24,17 +24,16 @@ FileMap::FileMap(ContextMap *context)
_recordOutputMgr(NULL)
{
// FIX ME - block manager only works for intersect
//_blockMgr = new BlockMgr();
//_blockMgr->setContext(context);
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_recordOutputMgr = new RecordOutputMgr();
_recordOutputMgr->init(_context);
}
FileMap::~FileMap(void) {
if (_blockMgr != NULL) {
delete _blockMgr;
_blockMgr = NULL;
}
delete _recordOutputMgr;
delete _blockMgr;
_blockMgr = NULL;
delete _recordOutputMgr;
_recordOutputMgr = NULL;
}
bool FileMap::mapFiles()
......@@ -43,20 +42,18 @@ bool FileMap::mapFiles()
if (!sweep.init()) {
return false;
}
if (!_recordOutputMgr->init(_context)) {
return false;
}
RecordKeyList hitSet;
while (sweep.next(hitSet)) {
//if (_context->getObeySplits()) {
// RecordKeyList keySet(hitSet.getKey());
// RecordKeyList resultSet(hitSet.getKey());
// _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
//} else {
//}
//_recordOutputMgr->printKeyAndTerminate(hitSet);
SummarizeHits(hitSet);
_recordOutputMgr->printRecord(hitSet.getKey(), _output);
if (_context->getObeySplits()) {
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
SummarizeHits(resultSet);
_recordOutputMgr->printRecord(resultSet.getKey(), _output);
} else {
SummarizeHits(hitSet);
_recordOutputMgr->printRecord(hitSet.getKey(), _output);
}
}
return true;
}
......@@ -116,7 +113,7 @@ void FileMap::SummarizeHits(RecordKeyList &hits) {
else if (operation == "distinct")
_tmp_output << vo.GetDistinct();
else {
cerr << "ERROR: " << operation << " is an unrecoginzed operation\n";
cerr << "ERROR: " << operation << " is an unrecognized operation\n";
exit(1);
}
_output.append(_tmp_output.str());
......
......@@ -128,36 +128,36 @@ int nek_sandbox1_main(int argc,char** argv)
//
// return 0;
//
ContextIntersect context;
context.addInputFile(argv[1]);
context.setSortedInput(true);
// context.setObeySplits(true);
FileRecordMgr frm(0, &context);
// frm.getBlockMgr()->setBreakOnSkipOps(true);
if (!frm.open()) {
cerr << "Error: couldn't open file " << argv[1] << ". Exiting." << endl;
exit(1);
}
cout << "File Type is : " << frm.getFileType() << ", " << frm.getFileTypeName() << "." << endl;
cout << "RecordType is : " << frm.getRecordType() << ", " << frm.getRecordTypeName() << "." << endl;
bool headerFound = false;
QuickString outbuf;
while (!frm.eof()) {
Record *record = frm.allocateAndGetNextRecord();
if (!headerFound && frm.hasHeader()) {
cout << frm.getHeader() << endl;
headerFound = true;
}
if (record == NULL) {
break;
}
outbuf.clear();
record->print(outbuf);
printf("%s\n", outbuf.c_str());
// ContextIntersect context;
// context.addInputFile(argv[1]);
// context.setSortedInput(true);
//// context.setObeySplits(true);
//
// FileRecordMgr frm(0, &context);
//// frm.getBlockMgr()->setBreakOnSkipOps(true);
// if (!frm.open()) {
// cerr << "Error: couldn't open file " << argv[1] << ". Exiting." << endl;
// exit(1);
// }
// cout << "File Type is : " << frm.getFileType() << ", " << frm.getFileTypeName() << "." << endl;
// cout << "RecordType is : " << frm.getRecordType() << ", " << frm.getRecordTypeName() << "." << endl;
//
// bool headerFound = false;
// QuickString outbuf;
// while (!frm.eof()) {
// Record *record = frm.allocateAndGetNextRecord();
// if (!headerFound && frm.hasHeader()) {
// cout << frm.getHeader() << endl;
// headerFound = true;
// }
// if (record == NULL) {
// break;
// }
//
// outbuf.clear();
// record->print(outbuf);
// printf("%s\n", outbuf.c_str());
//
// RecordKeyList recList(record);
// int blockCount = frm.getBlockMgr()->getBlocks(recList);
// printf("The %d blocks are:\n", blockCount);
......@@ -168,10 +168,10 @@ int nek_sandbox1_main(int argc,char** argv)
// printf("\n\n");
// frm.getBlockMgr()->deleteBlocks(recList);
frm.deleteRecord(record);
}
// frm.deleteRecord(record);
// }
// cout << "Final header is: " << frm.getHeader() << endl;
frm.close();
// frm.close();
return 0;
}
......
......@@ -17,6 +17,7 @@ INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
......
......@@ -2,11 +2,10 @@
#include "FileRecordMgr.h"
BinTree::BinTree(int databaseFileIdx, ContextIntersect *context)
: _databaseFileIdx(databaseFileIdx),
BinTree::BinTree(ContextIntersect *context)
: _databaseFile(NULL),
_context(context),
_binOffsetsExtended(NULL),
_dbFileMgr(NULL),
_showBinMetrics(false),
_maxBinNumFound(0)
{
......@@ -37,7 +36,7 @@ BinTree::~BinTree() {
}
for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
const Record *record = listIter->value();
_dbFileMgr->deleteRecord(record);
_databaseFile->deleteRecord(record);
}
delete bin;
bin = NULL;
......@@ -46,10 +45,6 @@ BinTree::~BinTree() {
delete [] bins;
bins = NULL;
}
if (_dbFileMgr != NULL) {
delete _dbFileMgr;
_dbFileMgr = NULL;
}
delete [] _binOffsetsExtended;
if (_showBinMetrics) {
......@@ -73,18 +68,13 @@ BinTree::~BinTree() {
}
}
bool BinTree::loadDB()
void BinTree::loadDB()
{
_dbFileMgr = new FileRecordMgr(_databaseFileIdx, _context);
if (!_dbFileMgr->open()) {
fprintf(stderr, "ERROR: Can't open database file %s to build tree.\n", _context->getInputFileName(_databaseFileIdx).c_str());
delete _dbFileMgr;
_dbFileMgr = NULL;
return false;
}
_databaseFile = _context->getFile(_context->getDatabaseFileIdx());
Record *record = NULL;
while (!_dbFileMgr->eof()) {
record = _dbFileMgr->allocateAndGetNextRecord();
while (!_databaseFile->eof()) {
record = _databaseFile->allocateAndGetNextRecord();
//In addition to NULL records, we also don't want to add unmapped reads.
if (record == NULL || record->isUnmapped()) {
continue;
......@@ -92,19 +82,17 @@ bool BinTree::loadDB()
if (!addRecordToTree(record)) {
fprintf(stderr, "ERROR: Unable to add record to tree.\n");
_dbFileMgr->close();
return false;
_databaseFile->close();
exit(1);
}
}
_dbFileMgr->close();
_databaseFile->close();
//TBD: give ERROR and return false if tree is empty.
if (_mainMap.empty()) {
fprintf(stderr, "ERROR: Tree is empty, no records added.\n");
return false;
exit(1);
}
return true;
}
void BinTree::getHits(Record *record, RecordKeyList &hitSet)
......
......@@ -24,15 +24,15 @@ class Record;
class BinTree {
public:
BinTree(int databaseFileIdx, ContextIntersect *context);
BinTree(ContextIntersect *context);
~BinTree();
bool loadDB();
void loadDB();
void getHits(Record *record, RecordKeyList &hitSet);
private:
int _databaseFileIdx;
FileRecordMgr *_databaseFile;
ContextIntersect *_context;
//
......@@ -60,8 +60,6 @@ private:
typedef map<mainKeyType, allBinsType> mainMapType;
mainMapType _mainMap;
FileRecordMgr *_dbFileMgr;
bool _showBinMetrics;
uint32_t _maxBinNumFound;
map<uint32_t, int> _binsHit;
......
/*
* Context.cpp
*
* Created on: Feb 12, 2013
* Author: nek3d
*/
#include "Context.h"
#include <unistd.h>
#include <sys/types.h>
// Default constructor: puts every option into its "not requested" state and
// registers the sub-command names and score operations this context knows.
Context::Context()
    : _program(UNSPECIFIED_PROGRAM),
      _useMergedIntervals(false),
      _genomeFile(NULL),
      _outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE),
      _outputTypeDetermined(false),
      _skipFirstArgs(0),
      _showHelp(false),
      _obeySplits(false),
      _uncompressedBam(false),
      _useBufferedOutput(true),
      _anyHit(false),
      _noHit(false),
      _writeA(false),
      _writeB(false),
      _leftJoin(false),
      _writeCount(false),
      _writeOverlap(false),
      _writeAllOverlap(false),
      _haveFraction(false),
      _overlapFraction(1E-9),
      _reciprocal(false),
      _sameStrand(false),
      _diffStrand(false),
      _sortedInput(false),
      _printHeader(false),
      _printable(true),
      _explicitBedOutput(false),
      _queryFileIdx(-1),
      _databaseFileIdx(-1),
      _bamHeaderAndRefIdx(-1),
      _maxNumDatabaseFields(0),
      _useFullBamTags(false),
      _reportCount(false),
      _maxDistance(0),
      _reportNames(false),
      _reportScores(false),
      _numOutputRecords(0),
      _hasConstantSeed(false),
      _seed(0),
      _forwardOnly(false),
      _reverseOnly(false)
{
    // Sub-command names that parseCmdArgs() looks up to select the program.
    _programNames["intersect"] = INTERSECT;
    _programNames["sample"] = SAMPLE;

    // Score operations accepted as valid, registered in the listed order.
    const char *const scoreOps[] = {
        "sum", "max", "min", "mean", "mode", "median", "antimode", "collapse"
    };
    const size_t numScoreOps = sizeof(scoreOps) / sizeof(scoreOps[0]);
    for (size_t opIdx = 0; opIdx < numScoreOps; opIdx++) {
        _validScoreOps.insert(scoreOps[opIdx]);
    }
}
// Destructor: releases the genome file opened through openGenomeFile(), if any.
Context::~Context()
{
    // Deleting a null pointer is a no-op, so no explicit guard is needed.
    delete _genomeFile;
    _genomeFile = NULL;
}
// Decides whether the output is written as BAM or as delimited text (BED),
// and records the decision so that later calls are no-ops.
//
// Precedence, highest first:
//   1. the user explicitly asked for BED output            -> text
//   2. the program is INTERSECT and the query file is BAM  -> BAM
//   3. any input file is BAM                               -> BAM; the index of
//      the first such file is stored in _bamHeaderAndRefIdx
//   4. otherwise                                           -> text
//
// Always returns true.
bool Context::determineOutputType() {
    if (!_outputTypeDetermined) {
        if (getExplicitBedOutput()) {
            setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE);
        } else if (_program == INTERSECT
                   && getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
            setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE);
        } else {
            // Scan the inputs for the first BAM file; stop at the first match.
            bool foundBam = false;
            for (size_t fileIdx = 0; !foundBam && fileIdx < _inputFiles.size(); fileIdx++) {
                if (_inputFiles[fileIdx]._fileType == FileRecordTypeChecker::BAM_FILE_TYPE) {
                    setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE);
                    _bamHeaderAndRefIdx = fileIdx;
                    foundBam = true;
                }
            }
            if (!foundBam) {
                // No BAM input anywhere: fall back to text output.
                setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE);
            }
        }
        _outputTypeDetermined = true;
    }
    return true;
}
void Context::openGenomeFile(const QuickString &genomeFilename)
{
_genomeFile = new NewGenomeFile(genomeFilename.c_str());
}
// Builds the genome file from a BAM reference vector and makes it this
// context's genome file.
//
// The context owns the genome file (the destructor deletes it), so a genome
// file opened by an earlier call is released here instead of being leaked.
void Context::openGenomeFile(const BamTools::RefVector &refVector)
{
    // Build the replacement first so the current file is only discarded once
    // the new one has been constructed.
    NewGenomeFile *replacement = new NewGenomeFile(refVector);
    delete _genomeFile;
    _genomeFile = replacement;
}
bool Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
_argc = argc;
_argv = argv;
_skipFirstArgs = skipFirstArgs;
if (argc < 2) {
setShowHelp(true);
return false;
}
setProgram(_programNames[argv[0]]);
_argsProcessed.resize(argc - skipFirstArgs, false);
for (int i=skipFirstArgs; i < argc; i++) {
if (isUsed(i - skipFirstArgs)) {
continue;
}
if (strcmp(argv[i], "-i") == 0) {
if (argc <= i+1) {
_errorMsg = "\n***** ERROR: -i option given, but no input file specified. *****";
return false;
}
addInputFile(argv[i+1]);
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-g") == 0) {
if (argc <= i+1) {
_errorMsg = "\n***** ERROR: -g option given, but no genome file specified. *****";
return false;
}
openGenomeFile(argv[i+1]);
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-h") == 0) {
setShowHelp(true);
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "--help") == 0) {
setShowHelp(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-split") == 0) {
setObeySplits(true);
markUsed(i - skipFirstArgs);
}
if (strcmp(argv[i], "-a") == 0) {
if (argc <= i+1) {
_errorMsg = "\n***** ERROR: -a option given, but no query file specified. *****";
return false;
}
addInputFile(argv[i+1]);
_queryFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-abam") == 0) {
if (argc <= i+1) {
_errorMsg = "\n***** ERROR: -abam option given, but no query BAM file specified. *****";
return false;
}
addInputFile(argv[i+1]);
_queryFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
setInputFileType(_queryFileIdx, FileRecordTypeChecker::BAM_FILE_TYPE);
}
else if (strcmp(argv[i], "-b") == 0) {
if (argc <= i+1) {
_errorMsg = "\n***** ERROR: -b option given, but no database file specified. *****";
return false;
}
addInputFile(argv[i+1]);
_databaseFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-u") == 0) {
setAnyHit(true);
markUsed(i - skipFirstArgs);
} else if(strcmp(argv[i], "-f") == 0) {
if ((i+1) < argc) {
setHaveFraction(true);
setOverlapFraction(atof(argv[i + 1]));
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
}
}
else if(strcmp(argv[i], "-bed") == 0) {
setExplicitBedOutput(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wa") == 0) {
setWriteA(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wb") == 0) {
setWriteB(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wo") == 0) {
setWriteOverlap(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wao") == 0) {
setWriteAllOverlap(true);
setWriteOverlap(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-c") == 0) {
setWriteCount(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-r") == 0) {
setReciprocal(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-v") == 0) {
setNoHit(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-s") == 0) {
setSameStrand(true);
markUsed(i - skipFirstArgs);