Commit 1435a95a authored by nkindlon's avatar nkindlon
Browse files

added other util directories

parent a1b9dcff
#include "BinTree.h"
#include "FileRecordMgr.h"
BinTree::BinTree(int databaseFileIdx, Context *context)
: _databaseFileIdx(databaseFileIdx),
_context(context),
_binOffsetsExtended(NULL),
_dbFileMgr(NULL),
_showBinMetrics(false),
_maxBinNumFound(0)
{
_binOffsetsExtended = new uint32_t[NUM_BIN_LEVELS];
memset(_binOffsetsExtended, 0, NUM_BIN_LEVELS * sizeof(uint32_t));
//start at idx 1, because the memset above already initialized
//the first idx to zero, which is what we want.
for (uint32_t i= 1; i < NUM_BIN_LEVELS; i++) {
_binOffsetsExtended[i] = _binOffsetsExtended[i-1] + (1 << ((NUM_BIN_LEVELS - i -1) * 3));
}
}
BinTree::~BinTree() {
//TBD: pass all elements in tree back to FRM for proper cleanup/deletion
for (mainMapType::iterator mainIter = _mainMap.begin(); mainIter != _mainMap.end(); mainIter++) {
allBinsType bins = mainIter->second;
if (bins == NULL) {
fprintf(stderr, "ERROR: In BinTree destructor: found chromosome with NULL bin array.\n");
continue;
}
if (!_showBinMetrics) { //don't clean up bins when simply reporting metrics.
for (uint32_t i=0; i < NUM_BINS; i++) {
binType bin = bins[i];
if (bin == NULL) {
continue;
}
for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
const Record *record = listIter->value();
_dbFileMgr->deleteRecord(record);
}
delete bin;
bin = NULL;
}
}
delete [] bins;
bins = NULL;
}
if (_dbFileMgr != NULL) {
delete _dbFileMgr;
_dbFileMgr = NULL;
}
delete [] _binOffsetsExtended;
if (_showBinMetrics) {
map<int, int> hitsHistogram;
FILE *fp = fopen("BinsHitFile.txt", "w");
fprintf(fp, "The largest bin was %u\n", _maxBinNumFound);
fprintf(fp, "There were %d different bins hit by database.\n", (int)_binsHit.size());
for (map<uint32_t, int>::iterator binIter = _binsHit.begin(); binIter != _binsHit.end(); binIter++) {
uint32_t binNum = binIter->first;
int numHits = binIter->second;
fprintf(fp, "%u\t%d\n", binNum, numHits);
hitsHistogram[numHits]++;
}
fclose(fp);
fp = fopen("BinHitsHistogram.txt", "w");
fprintf(fp, "NumHits\tNumBins\n");
for (map<int, int>::iterator histIter = hitsHistogram.begin(); histIter != hitsHistogram.end(); histIter++) {
fprintf(fp, "%d\t%d\n", histIter->first, histIter->second);
}
fclose(fp);
}
}
bool BinTree::loadDB()
{
_dbFileMgr = new FileRecordMgr(_databaseFileIdx, _context);
if (!_dbFileMgr->open()) {
fprintf(stderr, "ERROR: Can't open database file %s to build tree.\n", _context->getInputFileName(_databaseFileIdx).c_str());
delete _dbFileMgr;
_dbFileMgr = NULL;
return false;
}
Record *record = NULL;
while (!_dbFileMgr->eof()) {
record = _dbFileMgr->allocateAndGetNextRecord();
if (record == NULL) {
continue;
}
if (!addRecordToTree(record)) {
fprintf(stderr, "ERROR: Unable to add record to tree.\n");
_dbFileMgr->close();
return false;
}
}
_dbFileMgr->close();
//TBD: give ERROR and return false if tree is empty.
if (_mainMap.empty()) {
fprintf(stderr, "ERROR: Tree is empty, no records added.\n");
return false;
}
return true;
}
void BinTree::getHits(Record *record, RecordKeyList &hitSet)
{
if (_showBinMetrics) {
return; //don't care about query entries just yet.
}
const QuickString &chr = record->getChrName();
mainMapType::iterator mainIter = _mainMap.find(chr);
if (mainIter == _mainMap.end()) {
//given chrom not even in map.
return;
}
uint32_t startBin = 0;
uint32_t endBin = 0;
uint32_t startPos = record->getStartPos();
uint32_t endPos = record->getEndPos();
startBin = (startPos >> _binFirstShift);
endBin = ((endPos-1) >> _binFirstShift);
const allBinsType bins = mainIter->second;
/* SYNOPSIS:
1. We loop through each UCSC BIN level for feature A's chrom.
2. For each BIN, we loop through each B feature and add it to
hits if it meets all of the user's requests, which include:
(a) overlap fractio, (b) strandedness, (c) reciprocal overlap
*/
for (uint32_t i = 0; i < NUM_BIN_LEVELS; i++) {
uint32_t offset = _binOffsetsExtended[i];
for (uint32_t j = (startBin+offset); j <= (endBin+offset); j++) {
// move to the next bin if this one is empty
const binType &bin = bins[j];
if (bin == NULL) {
//no list of records in this bin.
continue;
}
if (bin->empty()) {
//bin has list, but it's empty.
//Actually, this should never happen, so throw an error.
fprintf(stderr, "ERROR: found empty list for bin %u of chromosome %s\n",
j, chr.c_str());
continue;
}
for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) {
const Record *dbRec = listIter->value();
if (record->intersects(dbRec, _context->getSameStrand(), _context->getDiffStrand(),
_context->getOverlapFraction(), _context->getReciprocal())) {
hitSet.push_back(dbRec);
}
}
}
startBin >>= _binNextShift;
endBin >>= _binNextShift;
}
}
bool BinTree::addRecordToTree(const Record *record)
{
//TBD. get chr, bin. allocate all bins and single bins as needed.
const QuickString &chr = record->getChrName();
uint32_t startPos = (uint32_t)(record->getStartPos());
uint32_t endPos = (uint32_t)(record->getEndPos());
//is this chr currently in the main map?
allBinsType bins = NULL;
mainMapType::iterator mainIter = _mainMap.find(chr);
if (mainIter == _mainMap.end()) {
//this is a new chr NOT currently in the map.
bins = new binType[NUM_BINS];
memset(bins, 0, NUM_BINS * sizeof(binType));
_mainMap[chr] = bins;
} else {
bins = mainIter->second;
}
uint32_t binNum = getBin(startPos, endPos);
if (_showBinMetrics) {
if (binNum > _maxBinNumFound) {
_maxBinNumFound = binNum;
}
_binsHit[binNum]++;
return true;
}
if (binNum < 0 || binNum >= NUM_BINS) {
fprintf(stderr, "ERROR: Received illegal bin number %u from getBin call.\n", binNum);
return false;
}
binType &bin = bins[binNum];
if (bin == NULL) {
bin = new innerListType;
}
bin->push_back(record);
return true;
}
uint32_t BinTree::getBin(const Record *record) const {
return getBin((uint32_t)(record->getStartPos()), (uint32_t)(record->getEndPos()));
}
uint32_t BinTree::getBin(uint32_t start, uint32_t end) const {
--end;
start >>= _binFirstShift;
end >>= _binFirstShift;
for (uint32_t i = 0; i < NUM_BIN_LEVELS; ++i) {
if (start == end) {
return _binOffsetsExtended[i] + start;
}
start >>= _binNextShift;
end >>= _binNextShift;
}
//failure
return -1;
}
/*
* BinTree.h
*
* Created on: Jan 5, 2013
* Author: nek3d
*/
#ifndef BINTREE_H_
#define BINTREE_H_
using namespace std;
#include <stdint.h>
#include <string>
#include <set>
#include <map>
#include "QuickString.h"
#include "RecordKeyList.h"
#include "Context.h"
class FileRecordMgr;
class Record;
class BinTree {
public:
BinTree(int databaseFileIdx, Context *context);
~BinTree();
bool loadDB();
void getHits(Record *record, RecordKeyList &hitSet);
private:
int _databaseFileIdx;
Context *_context;
//
// BIN HANDLING
//
static const uint32_t NUM_BINS = 37450;
static const uint32_t NUM_BIN_LEVELS = 7;
// bins range in size from 16kb to 512Mb
// Bin 0 spans 512Mbp, # Level 1
// Bins 1-8 span 64Mbp, # Level 2
// Bins 9-72 span 8Mbp, # Level 3
// Bins 73-584 span 1Mbp # Level 4
// Bins 585-4680 span 128Kbp # Level 5
// Bins 4681-37449 span 16Kbp # Level 6
uint32_t *_binOffsetsExtended;
static const uint32_t _binFirstShift = 14; /* How much to shift to get to finest bin. */
static const uint32_t _binNextShift = 3; /* How much to shift to get to next larger bin. */
typedef BTlist<const Record *> innerListType;
typedef const BTlistNode<const Record *> * innerListIterType;
typedef innerListType * binType;
typedef binType * allBinsType;
typedef QuickString mainKeyType;
typedef map<mainKeyType, allBinsType> mainMapType;
mainMapType _mainMap;
FileRecordMgr *_dbFileMgr;
bool _showBinMetrics;
uint32_t _maxBinNumFound;
map<uint32_t, int> _binsHit;
bool addRecordToTree(const Record *);
uint32_t getBin(uint32_t start, uint32_t end) const;
uint32_t getBin(const Record *record) const;
};
#endif /* BINTREE_H_ */
OBJ_DIR = ../../../obj/
BIN_DIR = ../../../bin/
UTILITIES_DIR = ../../utils/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/BamTools/include
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= BinTree.cpp BinTree.h
OBJECTS= BinTree.o
_EXT_OBJECTS=
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
$(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/BinTree.o
.PHONY: clean
\ No newline at end of file
/*
* Context.cpp
*
* Created on: Feb 12, 2013
* Author: nek3d
*/
#include "Context.h"
Context::Context()
:
_program(UNSPECIFIED_PROGRAM),
_useMergedIntervals(false),
_genomeFile(NULL),
_outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE),
_outputTypeDetermined(false),
_skipFirstArgs(0),
_showHelp(false),
_obeySplits(false),
_uncompressedBam(false),
_anyHit(false),
_noHit(false),
_writeA(false),
_writeB(false),
_leftJoin(false),
_writeCount(false),
_writeOverlap(false),
_writeAllOverlap(false),
_haveFraction(false),
_overlapFraction(1E-9),
_reciprocal(false),
_sameStrand(false),
_diffStrand(false),
_sortedInput(false),
_printHeader(false),
_printable(true),
_explicitBedOutput(false),
_queryFileIdx(-1),
_databaseFileIdx(-1),
_bamHeaderAndRefIdx(-1),
_maxNumDatabaseFields(0),
_reportCount(false),
_maxDistance(0),
_reportNames(false),
_reportScores(false)
{
_validScoreOps.insert("sum");
_validScoreOps.insert("max");
_validScoreOps.insert("min");
_validScoreOps.insert("mean");
_validScoreOps.insert("mode");
_validScoreOps.insert("median");
_validScoreOps.insert("antimode");
_validScoreOps.insert("collapse");
}
Context::~Context()
{
if (_genomeFile != NULL) {
delete _genomeFile;
_genomeFile = NULL;
}
}
bool Context::determineOutputType() {
if (_outputTypeDetermined) {
return true;
}
//test whether output should be BED or BAM.
//If the user explicitly requested BED, then it's BED.
//Otherwise, if there are any BAM files in the input,
//then the output should be BAM.
if (getExplicitBedOutput() || getQueryFileType() != FileRecordTypeChecker::BAM_FILE_TYPE) {
setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE);
} else {
setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE);
}
_outputTypeDetermined = true;
return true;
}
void Context::openGenomeFile(const QuickString &genomeFilename)
{
_genomeFile = new NewGenomeFile(genomeFilename.c_str());
}
void Context::openGenomeFile(const BamTools::RefVector &refVector)
{
_genomeFile = new NewGenomeFile(refVector);
}
void Context::parseCmdArgs(int argc, char **argv, int skipFirstArgs) {
_argc = argc;
_argv = argv;
_skipFirstArgs = skipFirstArgs;
_argsProcessed.resize(argc - skipFirstArgs, false);
for (int i=skipFirstArgs; i < argc; i++) {
if (isUsed(i - skipFirstArgs)) {
continue;
}
if (strcmp(argv[i], "-i") == 0) {
addInputFile(argv[i+1]);
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-g") == 0) {
openGenomeFile(argv[i+1]);
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-h") == 0) {
setShowHelp(true);
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "--help") == 0) {
setShowHelp(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-split") == 0) {
setObeySplits(true);
markUsed(i - skipFirstArgs);
}
if (strcmp(argv[i], "-a") == 0) {
addInputFile(argv[i+1]);
_queryFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-abam") == 0) {
addInputFile(argv[i+1]);
_queryFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
setInputFileType(_queryFileIdx, FileRecordTypeChecker::BAM_FILE_TYPE);
}
else if (strcmp(argv[i], "-b") == 0) {
addInputFile(argv[i+1]);
_databaseFileIdx = getNumInputFiles() -1;
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
} else if (strcmp(argv[i], "-u") == 0) {
setAnyHit(true);
markUsed(i - skipFirstArgs);
} else if(strcmp(argv[i], "-f") == 0) {
if ((i+1) < argc) {
setHaveFraction(true);
setOverlapFraction(atof(argv[i + 1]));
markUsed(i - skipFirstArgs);
i++;
markUsed(i - skipFirstArgs);
}
}
else if(strcmp(argv[i], "-bed") == 0) {
setExplicitBedOutput(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wa") == 0) {
setWriteA(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wb") == 0) {
setWriteB(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wo") == 0) {
setWriteOverlap(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-wao") == 0) {
setWriteAllOverlap(true);
setWriteOverlap(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-c") == 0) {
setWriteCount(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-r") == 0) {
setReciprocal(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-v") == 0) {
setNoHit(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-s") == 0) {
setSameStrand(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-S") == 0) {
setDiffStrand(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-loj") == 0) {
setLeftJoin(true);
markUsed(i - skipFirstArgs);
}
else if (strcmp(argv[i], "-ubam") == 0) {
setUncompressedBam(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-sorted") == 0) {
setSortedInput(true);
markUsed(i - skipFirstArgs);
}
else if(strcmp(argv[i], "-header") == 0) {
setPrintHeader(true);
markUsed(i - skipFirstArgs);
}
}
}
bool Context::isValidState()
{
if (!Context::cmdArgsValid()) {
return false;
}
if (getAnyHit() && getNoHit()) {
_errorMsg = "Error: request either -u for anyHit OR -v for noHit, not both.";
return false;
}
if (getWriteCount()) {
if (getAnyHit()) {
_errorMsg = "Error: request either -c for writeCount OR -u for anyHit, not both.";
return false;
} else if (getWriteB()) {
_errorMsg = "Error: request either -c for writeCount OR -wb for writeB, not both.";
return false;
} else if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) {
_errorMsg = "Error: writeCount option is not valid with BAM query input, unless bed output is specified with -bed option.";
return false;
}
}
if (getWriteOverlap()) {
if (getWriteA()) {
_errorMsg = "Error: request either -wa for writeA OR -wo for writeOverlap, not both.";
return false;
} else if (getWriteB()) {
_errorMsg = "Error: request either -wb for writeB OR -wo for writeOverlap, not both.";
return false;
} else if (getWriteCount()) {
_errorMsg = "Error: request either -c for writeCount OR -wo for writeOverlap, not both.";
return false;
} else if (getAnyHit()) {
_errorMsg = "Error: request either -u for anyHit OR -wo for writeOverlap, not both.";
return false;
} else if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) {
_errorMsg = "Error: writeAllOverlap option is not valid with BAM query input, unless bed output is specified with -bed option.";
return false;
}
}
if (getWriteB() || getLeftJoin()) {
if (getQueryFileType() == FileRecordTypeChecker::BAM_FILE_TYPE && !getExplicitBedOutput()) {
cerr << endl << "*****" << endl << "*****WARNING: -wb and -loj are ignored with bam input, unless bed output is specified with -bed option." << endl << "*****" << endl;
}
}
if (getSameStrand() && getDiffStrand()) {
_errorMsg = "Error: request -s for sameStrand, or -S for diffStrand, not both.";
return false;
}