Commit 4c7213c5 authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Detect differently ordered files in chromsweep and closest. Merged Fisher changes.

parent 24803222
......@@ -36,6 +36,10 @@ bool ClosestFile::getClosest() {
}
}
if (!_context->hasGenomeFile()) {
sweep.closeOut(true);
}
return true;
}
......@@ -9,9 +9,21 @@ Fisher::Fisher(ContextFisher *context)
_unionVal(0),
_numIntersections(0),
_queryLen(0),
_dbLen(0)
_dbLen(0),
_queryCounts(0),
_dbCounts(0),
_overlapCounts(0)
{
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_haveExclude = false;
if(!(_context->getExcludeFile().empty())){
string ex = _context->getExcludeFile();
exclude = new BedFile(ex);
exclude->loadBedFileIntoMergedMap();
_haveExclude = true;
}
}
Fisher::~Fisher(void) {
......@@ -26,29 +38,41 @@ bool Fisher::calculate() {
}
// header
cout << "# Contingency Table" << endl;
// for fisher's exact test, we need the contingency table
// XXXXXXXX | not in A | in A
// not in B | n11: in neither | n12: only in A
// in B | n21: only in B | n22: in A & B
//
double left, right, two;
long long genomeSize = _context->getGenomeFile()->getGenomeSize();
// bases covered by neither a nor b
long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal;
// bases covered only by -b
long long n12 = _dbLen - _intersectionVal;
// bases covered only by -a
long long n21 = _queryLen - _intersectionVal;
// bases covered by both
long long n22 = _intersectionVal;
if(_haveExclude){
genomeSize -= exclude->getTotalFlattenedLength();
}
// bases covered by neither
long long n22_full_bases = genomeSize;
//long long n22_bases = genomeSize - _queryLen - _dbLen + _intersectionVal;
long double dMean = 1.0 + _dbLen / (long double)_dbCounts;
long double qMean = 1.0 + _queryLen / (long double)_queryCounts;
// heursitic, but seems to work quite well -- better than doing more intuitive sum then divide.
long double bMean = (qMean + dMean);
//bMean = (_unionVal + 2.0 * _intersectionVal) / (long double)(_dbCounts + _queryCounts);
long long n11 = (long)_overlapCounts;
// this could be < 0 because multiple overlaps
long long n12 = (long)max(0L, (long)_queryCounts - (long)_overlapCounts);
long long n21 = max(0L, (long)(_dbCounts - _overlapCounts));
long long n22_full = max(n21 + n21 + n11, (long long)(n22_full_bases / bMean));
long long n22 = max(0L, (long)(n22_full - n12 - n21 - n11));
printf("# Number of query intervals: %lu\n", _queryCounts);
printf("# Number of db intervals: %lu\n", _dbCounts);
printf("# Number of overlaps: %lu\n", _overlapCounts);
printf("# Number of possible intervals (estimated): %lld\n", n22_full);
printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts);
cout << "# Contingency Table Of Counts" << endl;
printf("#_________________________________________\n");
printf("# | %-12s | %-12s |\n", "not in -b", "in -b");
printf("# not in -a | %-12lld | %-12lld |\n", n11, n12);
printf("# in -a | %-12lld | %-12lld |\n", n21, n22);
printf("# | %-12s | %-12s |\n", " in -b", "not in -b");
printf("# in -a | %-12lld | %-12lld |\n", n11, n12);
printf("# not in -a | %-12lld | %-12lld |\n", n21, n22);
printf("#_________________________________________\n");
kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
......@@ -66,6 +90,7 @@ bool Fisher::getFisher() {
if (!sweep.init()) {
return false;
}
RecordKeyVector hitSet;
while (sweep.next(hitSet)) {
if (_context->getObeySplits()) {
......@@ -82,6 +107,9 @@ bool Fisher::getFisher() {
_queryLen = sweep.getQueryTotalRecordLength();
_dbLen = sweep.getDatabaseTotalRecordLength();
_queryCounts = sweep.getQueryTotalRecords();
_dbCounts = sweep.getDatabaseTotalRecords();
_unionVal = _queryLen + _dbLen;
return true;
}
......@@ -93,10 +121,14 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
int keyStart = key->getStartPos();
int keyEnd = key->getEndPos();
_overlapCounts += recList.size();
_qsizes.push_back((keyEnd - keyStart));
int hitIdx = 0;
for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) {
int maxStart = max((*iter)->getStartPos(), keyStart);
int minEnd = min((*iter)->getEndPos(), keyEnd);
_qsizes.push_back((int)(minEnd - maxStart));
if (_context->getObeySplits()) {
intersection += _blockMgr->getOverlapBases(hitIdx);
hitIdx++;
......@@ -107,4 +139,3 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
_numIntersections += (int)recList.size();
return intersection;
}
#ifndef FISHER_H
#define FISHER_H
#include "bedFile.h"
#include "bedFilePE.h"
#include "ContextFisher.h"
class BlockMgr;
......@@ -19,11 +22,17 @@ private:
BlockMgr *_blockMgr;
unsigned long _intersectionVal;
unsigned long _unionVal;
bool _haveExclude;
int _numIntersections;
unsigned long _queryLen;
unsigned long _dbLen;
unsigned long _queryCounts;
unsigned long _dbCounts;
unsigned long _overlapCounts;
bool getFisher();
BedFile *exclude;
vector<int> _qsizes;
unsigned long getTotalIntersection(RecordKeyVector &hits);
};
......
......@@ -9,27 +9,32 @@ BIN_DIR = ../../bin/
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/bedFilePE/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I . \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= fisherMain.cpp Fisher.cpp Fisher.h kfunc.c
SOURCES= fisherMain.cpp Fisher.cpp Fisher.h kfunc.c \
../utils/Contexts/ContextFisher.cpp \
../utils/Contexts/ContextFisher.h
OBJECTS= fisherMain.o Fisher.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= Fisher
......
......@@ -32,13 +32,9 @@ void fisher_help(void) {
cerr << "\nTool: bedtools fisher (aka fisher)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Calculate Fisher statistic b/w two feature files."
<< endl
<< "\t Fisher is the length of the intersection over the union."
<< endl
<< "\t Values range from 0 (no intersection) to 1 (self intersection)."
<< endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome file>" << endl << endl;
cerr << "Options: " << endl;
......@@ -47,6 +43,9 @@ void fisher_help(void) {
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
cerr << "\t-m\t" << "Merge overlapping intervals before" << endl;
cerr << "\t\t- looking at overlap." << endl << endl;
cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl;
cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl;
cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl;
......
......@@ -69,6 +69,9 @@ bool FileIntersect::processSortedFiles()
processHits(hitSet);
}
}
if (!_context->hasGenomeFile()) {
sweep.closeOut(true);
}
return true;
}
......
......@@ -80,6 +80,8 @@ void BinTree::loadDB()
continue;
}
_context->testNameConventions(record);
if (!addRecordToTree(record)) {
fprintf(stderr, "ERROR: Unable to add record to tree.\n");
databaseFile->close();
......
......@@ -57,7 +57,9 @@ ContextBase::ContextBase()
_desiredStrand(FileRecordMergeMgr::ANY_STRAND),
_maxDistance(0),
_useMergedIntervals(false),
_reportPrecision(-1)
_reportPrecision(-1),
_allFilesHaveChrInChromNames(UNTESTED),
_allFileHaveLeadingZeroInChromNames(UNTESTED)
{
_programNames["intersect"] = INTERSECT;
......@@ -599,3 +601,73 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
return true;
}
void ContextBase::testNameConventions(const Record *record) {
int fileIdx = record->getFileIdx();
//
// First test whether chr in chrom names match
//
bool hasChr = record->hasChrInChromName();
testType testChrVal = fileHasChrInChromNames(fileIdx);
if (testChrVal == UNTESTED) {
_fileHasChrInChromNames[fileIdx] = hasChr ? YES : NO;
} else if ((testChrVal == YES && !hasChr) || (testChrVal == NO && hasChr)) {
fprintf(stderr, "ERROR: File %s has inconsistent naming convention for record:\n", _fileNames[fileIdx].c_str());
record->print(stderr, true);
exit(1);
}
if ((_allFilesHaveChrInChromNames == YES && !hasChr) || (_allFilesHaveChrInChromNames == NO && hasChr)) {
fprintf(stderr, "ERROR: File %s has a record where naming convention is inconsistent with other files:\n", _fileNames[fileIdx].c_str());
record->print(stderr, true);
exit(1);
}
if (_allFilesHaveChrInChromNames == UNTESTED) {
_allFilesHaveChrInChromNames = hasChr ? YES : NO;
}
//
// Now test whether leading zero in chrom names match
//
bool zeroVal = record->hasLeadingZeroInChromName();
testChrVal = fileHasLeadingZeroInChromNames(fileIdx);
if (testChrVal == UNTESTED) {
_fileHasLeadingZeroInChromNames[fileIdx] = zeroVal ? YES : NO;
} else if ((testChrVal == YES && !zeroVal) || (testChrVal == NO && zeroVal)) {
fprintf(stderr, "ERROR: File %s has inconsistent naming convention (leading zero) for record:\n", _fileNames[fileIdx].c_str());
record->print(stderr, true);
exit(1);
}
if ((_allFileHaveLeadingZeroInChromNames == YES && !zeroVal) || (_allFileHaveLeadingZeroInChromNames == NO && zeroVal)) {
fprintf(stderr, "ERROR: File %s has a record where naming convention (leading zero) is inconsistent with other files:\n", _fileNames[fileIdx].c_str());
record->print(stderr, true);
exit(1);
}
if (_allFileHaveLeadingZeroInChromNames == UNTESTED) {
_allFileHaveLeadingZeroInChromNames = zeroVal ? YES : NO;
}
}
ContextBase::testType ContextBase::fileHasChrInChromNames(int fileIdx) {
conventionType::iterator iter = _fileHasChrInChromNames.find(fileIdx);
if (iter == _fileHasChrInChromNames.end()) {
return UNTESTED;
}
return iter->second;
}
ContextBase::testType ContextBase::fileHasLeadingZeroInChromNames(int fileIdx) {
conventionType::iterator iter = _fileHasLeadingZeroInChromNames.find(fileIdx);
if (iter == _fileHasLeadingZeroInChromNames.end()) {
return UNTESTED;
}
return iter->second;
}
......@@ -145,6 +145,9 @@ public:
//methods applicable only to column operations.
int getReportPrecision() const { return _reportPrecision; }
void testNameConventions(const Record *);
protected:
PROGRAM_TYPE _program;
......@@ -230,6 +233,16 @@ protected:
char **_argv;
int _i;
//track whether each file has the letters chr in their chrom names.
//this is needed for enforcing consistent naming conventions across
//multiple files, and auto-detecting errors when they're not followed.
typedef enum { YES, NO, UNTESTED } testType;
typedef map<int, testType> conventionType;
conventionType _fileHasChrInChromNames;
// as above, but check whether first digit to appear in
// a chrom name is a zero.
conventionType _fileHasLeadingZeroInChromNames;
static const int MIN_ALLOWED_BUF_SIZE = 8;
virtual bool handle_bed();
......@@ -255,6 +268,11 @@ protected:
bool handle_prec();
bool parseIoBufSize(QuickString bufStr);
testType fileHasChrInChromNames(int fileIdx);
testType fileHasLeadingZeroInChromNames(int fileIdx);
testType _allFilesHaveChrInChromNames;
testType _allFileHaveLeadingZeroInChromNames;
};
#endif /* CONTEXTBASE_H_ */
......@@ -245,3 +245,35 @@ const QuickString &Record::getField(int fieldNum) const
<< endl << "*****" << endl;
exit(1);
}
bool Record::hasChrInChromName() const {
const char *str = _chrName.c_str();
//if the chrom name has at least 3 characters,
//and the first 3 are c, h, r, case-insensitive,
//return true. Otherwise, return false.
return ((_chrName.size() >= 3) &&
(str[0] == 'c' || str[0] == 'C') &&
(str[1] == 'h' || str[1] == 'H') &&
(str[2] == 'r' || str[2] == 'R'));
}
bool Record::hasLeadingZeroInChromName() const {
const char *str = _chrName.c_str();
size_t i=0;
while (i < _chrName.size() && !isdigit(str[i])) {
if (str[i] == '_') return false; //ignore
//leading zero after underscore, as some of the
//random and hap chroms are names that way.
i++;
}
return (i < _chrName.size() && str[i] == '0');
}
void Record::print(FILE *fp, bool newline) const {
QuickString buf;
print(buf);
fprintf(fp, "%s", buf.c_str());
if(newline) fprintf(fp, "\n");
}
......@@ -33,6 +33,7 @@ public:
virtual void print(QuickString &outBuf) const {}
virtual void print(QuickString &outBuf, int start, int end) const {}
virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end) const {}
virtual void print(FILE *fp, bool newline = false) const;
virtual void printNull(QuickString &outBuf) const {}
friend ostream &operator << (ostream &out, const Record &record);
......@@ -135,6 +136,9 @@ public:
// virtual static bool isNumericField(int fieldNum) const = 0;
bool hasChrInChromName() const;
bool hasLeadingZeroInChromName() const;
protected:
virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record.
......
......@@ -18,7 +18,7 @@ CloseSweep::CloseSweep(ContextClosest *context)
_overlapRecs.resize(_numDBs, NULL);
_minUpstreamDist.resize(_numDBs, INT_MAX);
_minDownstreamDist.resize(_numDBs,INT_MAX);
_maxPrevLeftClosestEndPos.resize(_numDBs, 0 );
_maxPrevLeftClosestEndPos.resize(_numDBs, 0);
for (int i=0; i < _numDBs; i++) {
_minUpstreamRecs[i] = new distRecVecType();
......@@ -36,8 +36,14 @@ CloseSweep::~CloseSweep(void) {
}
}
bool CloseSweep::init() {
bool retVal = NewChromSweep::init();
_runToQueryEnd = true;
return retVal;
}
void CloseSweep::masterScan(RecordKeyVector &retList) {
//first clear out everything from the previous scan
if (_context->reportDistance()) {
_finalDistances.clear();
}
......@@ -50,11 +56,12 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
for (int i=0; i < _numDBs; i++) {
//first clear out everything from the previous scan
_minUpstreamRecs[i]->clear();
_minDownstreamRecs[i]->clear();
_overlapRecs[i]->clear();
if (dbFinished(i) || chromChange(i, retList)) {
if (dbFinished(i) || chromChange(i, retList, true)) {
continue;
} else {
......@@ -251,9 +258,21 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
//check all actual overlaps, as well as upstream and downstream hits.
// if all of these are empty, return.
const vector<const Record *> & overlapRecs = (*(_overlapRecs[dbIdx]));
const vector<const Record *> & upRecs = (*(_minUpstreamRecs[dbIdx]));
const vector<const Record *> & downRecs = (*(_minDownstreamRecs[dbIdx]));
if (overlapRecs.empty() && upRecs.empty() && downRecs.empty()) {
return;
}
// If there are actual overlaps, only report those, then stop.
ContextClosest::tieModeType tieMode = _context->getTieMode();
const vector<const Record *> & overlapRecs = (*(_overlapRecs[dbIdx]));
if (!overlapRecs.empty()) {
if (tieMode == ContextClosest::FIRST_TIE) {
retList.push_back(overlapRecs[0]);
......@@ -272,8 +291,6 @@ void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
}
int upStreamDist = _minUpstreamDist[dbIdx];
int downStreamDist = _minDownstreamDist[dbIdx];
const vector<const Record *> & upRecs = (*(_minUpstreamRecs[dbIdx]));
const vector<const Record *> & downRecs = (*(_minDownstreamRecs[dbIdx]));
if (abs(upStreamDist) < abs(downStreamDist)) {
if (tieMode == ContextClosest::FIRST_TIE) {
......@@ -377,36 +394,51 @@ void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
}
}
bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList, bool wantScan)
{
// if (_currQueryRec->getChrName() == "chr1_gl000191_random" || (_currDbRecs[dbIdx] != NULL && _currDbRecs[dbIdx]->getChrName() == "chr1_gl000191_random")) {
// printf("Break point here.\n");
// }
const Record *dbRec = _currDbRecs[dbIdx];
bool haveQuery = _currQueryRec != NULL;
bool haveDB = dbRec != NULL;
\
if (haveQuery && _currQueryChromName != _prevQueryChromName) {
_context->testNameConventions(_currQueryRec);
testChromOrder(_currQueryRec);
}
if (haveDB) {
_context->testNameConventions(dbRec);
testChromOrder(dbRec);
}
// the files are on the same chrom
if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
if (haveQuery && (!haveDB || _currQueryRec->sameChrom(dbRec))) {
//if this is the first time the query's chrom is ahead of the chrom that was in this cache,
//then we have to clear the cache.
if (!_caches[dbIdx].empty() && _currQueryRec->chromAfter(_caches[dbIdx].begin()->value())) {
if (!_caches[dbIdx].empty() && queryChromAfterDbRec(_caches[dbIdx].begin()->value())) {
clearCache(dbIdx);
_maxPrevLeftClosestEndPos[dbIdx] = 0;
}
return false;
}
if (_currDbRecs[dbIdx]->chromAfter(_currQueryRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) {
if (haveDB && haveQuery && dbRecAfterQueryChrom(dbRec) && (!_caches[dbIdx].empty() && (_caches[dbIdx].begin()->value()->sameChrom(_currQueryRec)))) {
//the newest DB record's chrom is ahead of the query, but the cache still
//has old records on that query's chrom
scanCache(dbIdx, retList);
finalizeSelections(dbIdx, retList);
return true;
}
if (!haveQuery || !haveDB) return false;
// the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
if (queryChromAfterDbRec(dbRec)) {
while (_currDbRecs[dbIdx] != NULL &&
_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
_dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
while (dbRec != NULL &&
queryChromAfterDbRec(dbRec)) {
_dbFRMs[dbIdx]->deleteRecord(dbRec);
nextRecord(false, dbIdx);
}
clearCache(dbIdx);
......@@ -416,7 +448,7 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
// the database is ahead of the query.
else {
// 1. scan the cache for remaining hits on the query's current chrom.
scanCache(dbIdx, retList);
if (wantScan) scanCache(dbIdx, retList);
return true;
}
......@@ -424,3 +456,24 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
//control can't reach here, but compiler still wants a return statement.
return true;
}
bool CloseSweep::dbRecAfterQueryChrom(const Record *dbRec)
{
//If using a genome file, compare chrom ids.
//Otherwise, compare global order, inserting as needed.
if (_context->hasGenomeFile()) {
return ( dbRec->getChromId() > _currQueryRec->getChromId() ) ;
}
//see if the db has both it's curr chrom and the query's curr chrom.
const _orderTrackType *track = _fileTracks[dbRec->getFileIdx()];
_orderTrackType::const_iterator iter = track->find(dbRec->getChrName());
if (iter == track->end()) return false; //db file does not contain the curr chrom
int dbOrder = iter->second;
iter = track->find(_currQueryRec->getChrName());
if (iter == track->end()) return false; //db file does not contain the query chrom.
int qOrder = iter->second;
return (dbOrder > qOrder);
}
......@@ -16,7 +16,7 @@ class CloseSweep : public NewChromSweep {
public:
CloseSweep(ContextClosest *context);
~CloseSweep(void);
bool init();