Commit d6544e0e authored by Brent Pedersen's avatar Brent Pedersen
Browse files

change fisher to use numbers of intervals rather than bases of overlap

parent a277bba0
......@@ -15,7 +15,6 @@ scenarios using
`Fisher's Exact Test`_ .
Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
.. code-block:: bash
......@@ -23,9 +22,11 @@ Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
$ cat a.bed
chr1 10 20
chr1 30 40
chr1 51 52
$ cat b.bed
chr1 15 25
chr1 51 52
And a genome of 500 bases:
......@@ -40,43 +41,55 @@ can do this with ``fisher`` as:
.. code-block:: bash
$ bedtools fisher -a a.bed -b b.bed -g t.genome
# Contingency Table
# Number of query intervals: 3
# Number of db intervals: 2
# Number of overlaps: 2
# Number of possible intervals (estimated): 37
# Contingency Table Of Counts
# phyper(2 - 1, 3, 37 - 3, 2, lower.tail=F)
#_________________________________________
# | not in -b | in -b |
# not in -a | 475 | 5 |
# in -a | 15 | 5 |
# | in -b | not in -b |
# in -a | 2 | 1 |
# not in -a | 0 | 34 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1 1.3466e-05 1.3466e-05 31.667
1 0.0045045 0.0045045 inf
Where we can see the constructed contingency table and the pvalues for left, right
and two-tail tests.
From here, we can say that given **500 bases** of genome, it is unlikely that a region of
20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions
were randomly distributed. *Consult your statistician for a more precise definition*.
From here, we can say that given **500 bases** of genome, it is unlikely that we
would see **as many** overlaps as we do if the intervals from `a` and `b` were not
related.
The above was highly significant, but if our genome were only **50 bases**:
The above had a fairly low p-value (0.0045), but if our genome were only **60 bases**:
.. code-block:: bash
$ echo -e "chr1\t50" > t.genome
$ echo -e "chr1\t60" > t.genome
$ bedtools fisher -a a.bed -b b.bed -g t.genome
# Contingency Table
# Number of query intervals: 3
# Number of db intervals: 2
# Number of overlaps: 2
# Number of possible intervals (estimated): 4
# Contingency Table Of Counts
# phyper(2 - 1, 3, 4 - 3, 2, lower.tail=F)
#_________________________________________
# | not in -b | in -b |
# not in -a | 25 | 15 |
# in -a | 5 | 5 |
# | in -b | not in -b |
# in -a | 2 | 1 |
# not in -a | 0 | 1 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
0.86011 0.35497 0.49401 1.667
1 0.5 1 inf
We can see that neither tail is significant. Intuitively, this makes sense;
if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals
within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap.
if we randomly place 3 intervals (from `-a`), and 2 (from `-b`) intervals from
within 60 bases, it doesn't seem unlikely that we'd see 2 overlaps.
.. note::
......
......@@ -2,6 +2,7 @@
#include "BlockMgr.h"
#include "NewChromsweep.h"
#include "kfunc.c"
#include <numeric> // std::accumulate
Fisher::Fisher(ContextFisher *context)
: _context(context),
......@@ -9,7 +10,11 @@ Fisher::Fisher(ContextFisher *context)
_unionVal(0),
_numIntersections(0),
_queryLen(0),
_dbLen(0)
_dbLen(0),
_queryCounts(0),
_dbCounts(0),
_overlapCounts(0)
{
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_haveExclude = false;
......@@ -34,7 +39,6 @@ bool Fisher::calculate() {
}
// header
cout << "# Contingency Table" << endl;
double left, right, two;
......@@ -42,15 +46,30 @@ bool Fisher::calculate() {
if(_haveExclude){
genomeSize -= exclude->getTotalFlattenedLength();
}
// bases covered by both
long long n22 = genomeSize - _queryLen - _dbLen + _intersectionVal;
// bases covered only by -a
long long n21 = _dbLen - _intersectionVal;
// bases covered only by -b
long long n12 = _queryLen - _intersectionVal;
// bases covered by neither a nor b
long long n11 = _intersectionVal;
// bases covered by neither
long long n22_full_bases = genomeSize;
//long long n22_bases = genomeSize - _queryLen - _dbLen + _intersectionVal;
long double dMean = _dbLen / (long double)_dbCounts;
long double qMean = _queryLen / (long double)_queryCounts;
// mean interval size
long double bMean = (_dbLen + _queryLen) / (long double)(_dbCounts + _queryCounts);
bMean = 1 + (qMean + dMean); // / 2.0;
long long n11 = (long)_overlapCounts;
// this could be < 0 because multiple overlaps
long long n12 = (long)max(0L, (long)_queryCounts - (long)_overlapCounts);
long long n21 = (long)(_dbCounts - _overlapCounts);
long long n22_full = n22_full_bases / bMean;
long long n22 = max(0L, (long)(n22_full - n12 - n21 - n11));
printf("# Number of query intervals: %lu\n", _queryCounts);
printf("# Number of db intervals: %lu\n", _dbCounts);
printf("# Number of overlaps: %lu\n", _overlapCounts);
printf("# Number of possible intervals (estimated): %lld\n", n22_full);
cout << "# Contingency Table Of Counts" << endl;
printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts);
printf("#_________________________________________\n");
printf("# | %-12s | %-12s |\n", " in -b", "not in -b");
printf("# in -a | %-12lld | %-12lld |\n", n11, n12);
......@@ -89,6 +108,9 @@ bool Fisher::getFisher() {
_queryLen = sweep.getQueryTotalRecordLength();
_dbLen = sweep.getDatabaseTotalRecordLength();
_queryCounts = sweep.getQueryTotalRecords();
_dbCounts = sweep.getDatabaseTotalRecords();
_unionVal = _queryLen + _dbLen;
return true;
}
......@@ -100,10 +122,14 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
int keyStart = key->getStartPos();
int keyEnd = key->getEndPos();
_overlapCounts += recList.size();
_qsizes.push_back((keyEnd - keyStart));
int hitIdx = 0;
for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) {
int maxStart = max((*iter)->getStartPos(), keyStart);
int minEnd = min((*iter)->getEndPos(), keyEnd);
_qsizes.push_back((int)(minEnd - maxStart));
if (_context->getObeySplits()) {
intersection += _blockMgr->getOverlapBases(hitIdx);
hitIdx++;
......
......@@ -26,9 +26,13 @@ private:
int _numIntersections;
unsigned long _queryLen;
unsigned long _dbLen;
unsigned long _queryCounts;
unsigned long _dbCounts;
unsigned long _overlapCounts;
bool getFisher();
BedFile *exclude;
vector<int> _qsizes;
unsigned long getTotalIntersection(RecordKeyVector &hits);
};
......
......@@ -32,13 +32,9 @@ void fisher_help(void) {
cerr << "\nTool: bedtools fisher (aka fisher)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Calculate Fisher statistic b/w two feature files."
<< endl
<< "\t Fisher is the length of the intersection over the union."
<< endl
<< "\t Values range from 0 (no intersection) to 1 (self intersection)."
<< endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -c <possible counts>" << endl << endl;
cerr << "Options: " << endl;
......
......@@ -7,7 +7,7 @@
ContextFisher::ContextFisher() {
setSortedInput(true);
setUseMergedIntervals(true);
setUseMergedIntervals(false);
}
ContextFisher::~ContextFisher() {
......
......@@ -16,34 +16,36 @@
NewChromSweep::NewChromSweep(ContextIntersect *context,
bool useMergedIntervals)
: _context(context),
_queryFRM(NULL),
_numDBs(_context->getNumDatabaseFiles()),
_useMergedIntervals(false),
_queryRecordsTotalLength(0),
_databaseRecordsTotalLength(0),
_wasInitialized(false),
_currQueryRec(NULL),
_runToQueryEnd(false)
: _context(context),
_queryFRM(NULL),
_numDBs(_context->getNumDatabaseFiles()),
_useMergedIntervals(false),
_queryRecordsTotalLength(0),
_databaseRecordsTotalLength(0),
_queryTotalRecords(0),
_databaseTotalRecords(0),
_wasInitialized(false),
_currQueryRec(NULL),
_runToQueryEnd(false)
{
}
bool NewChromSweep::init() {
//Create new FileRecordMgrs for the input files.
//Open them, and get the first record from each.
//otherwise, return true.
//Create new FileRecordMgrs for the input files.
//Open them, and get the first record from each.
//otherwise, return true.
_queryFRM = _context->getFile(_context->getQueryFileIdx());
_dbFRMs.resize(_numDBs, NULL);
for (int i=0; i < _numDBs; i++) {
_dbFRMs[i] = _context->getDatabaseFile(i);
_dbFRMs[i] = _context->getDatabaseFile(i);
}
_currDbRecs.resize(_numDBs, NULL);
for (int i=0; i < _numDBs; i++) {
nextRecord(false, i);
nextRecord(false, i);
}
_caches.resize(_numDBs);
......@@ -52,50 +54,50 @@ bool NewChromSweep::init() {
//end of the query file is hit as well.
if (_context->getNoHit() || _context->getWriteCount() || _context->getWriteOverlap() || _context->getWriteAllOverlap() || _context->getLeftJoin()) {
_runToQueryEnd = true;
_runToQueryEnd = true;
}
_wasInitialized = true;
return true;
}
void NewChromSweep::closeOut() {
while (!_queryFRM->eof()) {
nextRecord(true);
}
while (!_queryFRM->eof()) {
nextRecord(true);
}
for (int i=0; i < _numDBs; i++) {
while (!_dbFRMs[i]->eof()) {
nextRecord(false, i);
}
while (!_dbFRMs[i]->eof()) {
nextRecord(false, i);
}
}
}
NewChromSweep::~NewChromSweep(void) {
if (!_wasInitialized) {
return;
}
_queryFRM->deleteRecord(_currQueryRec);
_currQueryRec = NULL;
if (!_wasInitialized) {
return;
}
_queryFRM->deleteRecord(_currQueryRec);
_currQueryRec = NULL;
for (int i=0; i < _numDBs; i++) {
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
}
_queryFRM->close();
_queryFRM->close();
for (int i=0; i < _numDBs; i++) {
_dbFRMs[i]->close();
_dbFRMs[i]->close();
}
}
void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
recListIterType cacheIter = _caches[dbIdx].begin();
recListIterType cacheIter = _caches[dbIdx].begin();
while (cacheIter != _caches[dbIdx].end())
{
const Record *cacheRec = cacheIter->value();
const Record *cacheRec = cacheIter->value();
if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) {
if (intersects(_currQueryRec, cacheRec)) {
retList.push_back(cacheRec);
......@@ -104,151 +106,153 @@ void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
}
else {
cacheIter = _caches[dbIdx].deleteCurrent();
_dbFRMs[dbIdx]->deleteRecord(cacheRec);
_dbFRMs[dbIdx]->deleteRecord(cacheRec);
}
}
}
void NewChromSweep::clearCache(int dbIdx)
{
//delete all objects pointed to by cache
recListType &cache = _caches[dbIdx];
for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) {
_dbFRMs[dbIdx]->deleteRecord(iter->value());
}
cache.clear();
//delete all objects pointed to by cache
recListType &cache = _caches[dbIdx];
for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) {
_dbFRMs[dbIdx]->deleteRecord(iter->value());
}
cache.clear();
}
void NewChromSweep::masterScan(RecordKeyVector &retList) {
for (int i=0; i < _numDBs; i++) {
if (dbFinished(i) || chromChange(i, retList)) {
continue;
} else {
// scan the database cache for hits
scanCache(i, retList);
//skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary
while (_currDbRecs[i] != NULL &&
_currQueryRec->sameChrom(_currDbRecs[i]) &&
!(_currDbRecs[i]->after(_currQueryRec))) {
if (intersects(_currQueryRec, _currDbRecs[i])) {
retList.push_back(_currDbRecs[i]);
}
if (_currQueryRec->after(_currDbRecs[i])) {
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
} else {
_caches[i].push_back(_currDbRecs[i]);
_currDbRecs[i] = NULL;
}
nextRecord(false, i);
}
}
}
for (int i=0; i < _numDBs; i++) {
if (dbFinished(i) || chromChange(i, retList)) {
continue;
} else {
// scan the database cache for hits
scanCache(i, retList);
//skip if we hit the end of the DB
// advance the db until we are ahead of the query. update hits and cache as necessary
while (_currDbRecs[i] != NULL &&
_currQueryRec->sameChrom(_currDbRecs[i]) &&
!(_currDbRecs[i]->after(_currQueryRec))) {
if (intersects(_currQueryRec, _currDbRecs[i])) {
retList.push_back(_currDbRecs[i]);
}
if (_currQueryRec->after(_currDbRecs[i])) {
_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
_currDbRecs[i] = NULL;
} else {
_caches[i].push_back(_currDbRecs[i]);
_currDbRecs[i] = NULL;
}
nextRecord(false, i);
}
}
}
}
bool NewChromSweep::chromChange(int dbIdx, RecordKeyVector &retList)
{
// the files are on the same chrom
if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
return false;
}
// the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
while (_currDbRecs[dbIdx] != NULL &&
_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
_dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
nextRecord(false, dbIdx);
}
clearCache(dbIdx);
if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
return false;
}
// the query is ahead of the database. fast-forward the database to catch-up.
if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
while (_currDbRecs[dbIdx] != NULL &&
_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
_dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
nextRecord(false, dbIdx);
}
clearCache(dbIdx);
return false;
}
// the database is ahead of the query.
else {
// 1. scan the cache for remaining hits on the query's current chrom.
scanCache(dbIdx, retList);
scanCache(dbIdx, retList);
return true;
}
//control can't reach here, but compiler still wants a return statement.
return true;
//control can't reach here, but compiler still wants a return statement.
return true;
}
bool NewChromSweep::next(RecordKeyVector &retList) {
retList.clearVector();
if (_currQueryRec != NULL) {
_queryFRM->deleteRecord(_currQueryRec);
}
retList.clearVector();
if (_currQueryRec != NULL) {
_queryFRM->deleteRecord(_currQueryRec);
}
if (!nextRecord(true)) return false; // query EOF hit
if (!nextRecord(true)) return false; // query EOF hit
if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) {
return false;
}
_currChromName = _currQueryRec->getChrName();
if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) {
return false;
}
_currChromName = _currQueryRec->getChrName();
masterScan(retList);
masterScan(retList);
if (_context->getSortOutput()) {
retList.sortVector();
}
if (_context->getSortOutput()) {
retList.sortVector();
}
retList.setKey(_currQueryRec);
return true;
retList.setKey(_currQueryRec);
return true;
}
bool NewChromSweep::nextRecord(bool query, int dbIdx) {
if (query) {
_currQueryRec = _queryFRM->getNextRecord();
if (_currQueryRec != NULL) {
_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
return true;
}
return false;
} else { //database
Record *rec = _dbFRMs[dbIdx]->getNextRecord();
_currDbRecs[dbIdx] = rec;
if (rec != NULL) {
_databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
return true;
}
return false;
}
if (query) {
_currQueryRec = _queryFRM->getNextRecord();
if (_currQueryRec != NULL) {
_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
_queryTotalRecords++;
return true;
}
return false;
} else { //database
Record *rec = _dbFRMs[dbIdx]->getNextRecord();
_currDbRecs[dbIdx] = rec;
if (rec != NULL) {
_databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
_databaseTotalRecords++;
return true;
}
return false;
}
}
bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const
{
return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
_context->getOverlapFraction(), _context->getReciprocal());
return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
_context->getOverlapFraction(), _context->getReciprocal());
}
bool NewChromSweep::allCachesEmpty() {
for (int i=0; i < _numDBs; i++) {
if (!_caches[i].empty()) {
return false;
}
}
return true;
for (int i=0; i < _numDBs; i++) {
if (!_caches[i].empty()) {
return false;
}
}
return true;
}
bool NewChromSweep::allCurrDBrecsNull() {
for (int i=0; i < _numDBs; i++) {
if (_currDbRecs[i] != NULL) {
return false;
}
}
return true;
for (int i=0; i < _numDBs; i++) {
if (_currDbRecs[i] != NULL) {
return false;
}
}
return true;
}
bool NewChromSweep::dbFinished(int dbIdx) {
if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) {
return true;
}
return false;
if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) {
return true;
}
return false;
}
......@@ -49,6 +49,9 @@ public:
unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; }
unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; }
unsigned long getQueryTotalRecords() { return _queryTotalRecords; }
unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; }