From d6544e0e9dceab78257bc547f5dd57c6344dfb2c Mon Sep 17 00:00:00 2001
From: Brent Pedersen <bpederse@gmail.com>
Date: Mon, 24 Nov 2014 16:56:57 -0700
Subject: [PATCH] change fisher to use numbers of intervals rather than bases
 of overlap

---
 docs/content/tools/fisher.rst             |  49 ++--
 src/fisher/Fisher.cpp                     |  48 +++-
 src/fisher/Fisher.h                       |   4 +
 src/fisher/fisherMain.cpp                 |   6 +-
 src/utils/Contexts/ContextFisher.cpp      |   2 +-
 src/utils/NewChromsweep/NewChromsweep.cpp | 262 +++++++++++-----------
 src/utils/NewChromsweep/NewChromsweep.h   |   6 +
 test/fisher/README.md                     |  20 ++
 test/fisher/shuf.sh                       |   4 +
 test/fisher/sim.py                        |  27 +++
 10 files changed, 264 insertions(+), 164 deletions(-)
 create mode 100644 test/fisher/README.md
 create mode 100644 test/fisher/shuf.sh
 create mode 100644 test/fisher/sim.py

diff --git a/docs/content/tools/fisher.rst b/docs/content/tools/fisher.rst
index aa771c07..d027033a 100644
--- a/docs/content/tools/fisher.rst
+++ b/docs/content/tools/fisher.rst
@@ -15,7 +15,6 @@ scenarios using
 `Fisher's Exact Test`_ .
 
 
-
 Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
 
 .. code-block:: bash
@@ -23,9 +22,11 @@ Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
   $ cat a.bed
   chr1  10  20
   chr1  30  40
+  chr1	51	52
 
   $ cat b.bed
   chr1  15   25
+  chr1	51	52
 
 And a genome of 500 bases:
 
@@ -40,43 +41,55 @@ can do this with ``fisher`` as:
 .. code-block:: bash
 
     $ bedtools fisher -a a.bed -b b.bed -g t.genome
-    # Contingency Table
+    # Number of query intervals: 3
+    # Number of db intervals: 2
+    # Number of overlaps: 2
+    # Number of possible intervals (estimated): 37
+    # Contingency Table Of Counts
+    # phyper(2 - 1, 3, 37 - 3, 2, lower.tail=F)
     #_________________________________________
-    #           | not in -b    | in -b        |
-    # not in -a | 475          | 5           |
-    #     in -a | 15           | 5            |
+    #           |  in -b       | not in -b    |
+    #     in -a | 2            | 1            |
+    # not in -a | 0            | 34           |
     #_________________________________________
     # p-values for fisher's exact test
     left    right   two-tail    ratio
-    1       1.3466e-05      1.3466e-05      31.667
+    1   0.0045045   0.0045045   inf
+
+
 
 
 Where we can see the constructed contingency table and the pvalues for left, right
 and two-tail tests.
-From here, we can say that given **500 bases** of genome, it is unlikely that a region of
-20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions
-were randomly distributed. *Consult your statistician for a more precise definition*.
+From here, we can say that given **500 bases** of genome, it is unlikely that we
+would see **as many** overlaps as we do if the intervals from `a` and `b` were not
+related.
 
-The above was highly significant, but if our genome were only **50 bases**:
+The above had a fairly low p-value (0.0045), but if our genome were only **60 bases**:
 
 .. code-block:: bash
 
-    $ echo -e "chr1\t50" > t.genome
+    $ echo -e "chr1\t60" > t.genome
     $ bedtools fisher -a a.bed -b b.bed -g t.genome
-    # Contingency Table
+    # Number of query intervals: 3
+    # Number of db intervals: 2
+    # Number of overlaps: 2
+    # Number of possible intervals (estimated): 4
+    # Contingency Table Of Counts
+    # phyper(2 - 1, 3, 4 - 3, 2, lower.tail=F)
     #_________________________________________
-    #           | not in -b    | in -b        |
-    # not in -a | 25           | 15           |
-    #     in -a | 5            | 5            |
+    #           |  in -b       | not in -b    |
+    #     in -a | 2            | 1            |
+    # not in -a | 0            | 1            |
     #_________________________________________
     # p-values for fisher's exact test
     left    right   two-tail    ratio
-    0.86011 0.35497 0.49401 1.667
+    1   0.5 1   inf
 
 
 We can see that neither tail is significant. Intuitively, this makes sense; 
-if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals
-within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap.
+if we randomly place 3 intervals (from `-a`), and 2 (from `-b`) intervals from
+within 60 bases, it doesn't seem unlikely that we'd see 2 overlaps.
 
 
 .. note::
diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp
index 9eca0771..4cbfa7af 100644
--- a/src/fisher/Fisher.cpp
+++ b/src/fisher/Fisher.cpp
@@ -2,6 +2,7 @@
 #include "BlockMgr.h"
 #include "NewChromsweep.h"
 #include "kfunc.c"
+#include <numeric>      // std::accumulate
 
 Fisher::Fisher(ContextFisher *context)
 : _context(context),
@@ -9,7 +10,11 @@ Fisher::Fisher(ContextFisher *context)
   _unionVal(0),
   _numIntersections(0),
   _queryLen(0),
-  _dbLen(0)
+  _dbLen(0),
+  _queryCounts(0),
+  _dbCounts(0),
+  _overlapCounts(0)
+
 {
     _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
     _haveExclude = false;
@@ -34,7 +39,6 @@ bool Fisher::calculate() {
     }
 
     // header
-    cout << "# Contingency Table" << endl;
 
     double left, right, two;
 
@@ -42,15 +46,30 @@ bool Fisher::calculate() {
     if(_haveExclude){
         genomeSize -= exclude->getTotalFlattenedLength();
     }
-    // bases covered by both
-    long long n22 = genomeSize - _queryLen - _dbLen + _intersectionVal;
-    // bases covered only by -a
-    long long n21 = _dbLen - _intersectionVal;
-    // bases covered only by -b
-    long long n12 = _queryLen - _intersectionVal;
-    // bases covered by neither a nor b
-    long long n11 = _intersectionVal;
-
+    // bases covered by neither
+    long long n22_full_bases = genomeSize;
+    //long long n22_bases = genomeSize - _queryLen - _dbLen + _intersectionVal;
+    long double dMean = _dbLen / (long double)_dbCounts;
+    long double qMean = _queryLen / (long double)_queryCounts;
+
+    // mean interval size
+    long double bMean = (_dbLen + _queryLen) / (long double)(_dbCounts + _queryCounts);
+    bMean = 1 + (qMean + dMean); // / 2.0;
+
+    long long n11 = (long)_overlapCounts;
+    // this could be < 0 because multiple overlaps
+    long long n12 = (long)max(0L, (long)_queryCounts - (long)_overlapCounts);
+    long long n21 = (long)(_dbCounts - _overlapCounts);
+    long long n22_full = n22_full_bases / bMean;
+    long long n22 = max(0L, (long)(n22_full - n12 - n21 - n11));
+
+    printf("# Number of query intervals: %lu\n", _queryCounts);
+    printf("# Number of db intervals: %lu\n", _dbCounts);
+    printf("# Number of overlaps: %lu\n", _overlapCounts);
+    printf("# Number of possible intervals (estimated): %lld\n", n22_full);
+
+    cout << "# Contingency Table Of Counts" << endl;
+    printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts);
     printf("#_________________________________________\n");
     printf("#           | %-12s | %-12s |\n", " in -b", "not in -b");
     printf("#     in -a | %-12lld | %-12lld |\n", n11, n12);
@@ -89,6 +108,9 @@ bool Fisher::getFisher() {
     _queryLen = sweep.getQueryTotalRecordLength();
     _dbLen = sweep.getDatabaseTotalRecordLength();
 
+    _queryCounts = sweep.getQueryTotalRecords();
+    _dbCounts = sweep.getDatabaseTotalRecords();
+
     _unionVal = _queryLen + _dbLen;
     return true;
 }
@@ -100,10 +122,14 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
     int keyStart = key->getStartPos();
     int keyEnd = key->getEndPos();
 
+    _overlapCounts += recList.size();
+    _qsizes.push_back((keyEnd - keyStart));
+
     int hitIdx = 0;
     for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) {
         int maxStart = max((*iter)->getStartPos(), keyStart);
         int minEnd = min((*iter)->getEndPos(), keyEnd);
+        _qsizes.push_back((int)(minEnd - maxStart));
         if (_context->getObeySplits()) {
             intersection += _blockMgr->getOverlapBases(hitIdx);
             hitIdx++;
diff --git a/src/fisher/Fisher.h b/src/fisher/Fisher.h
index e10bc8fb..c9d1850e 100644
--- a/src/fisher/Fisher.h
+++ b/src/fisher/Fisher.h
@@ -26,9 +26,13 @@ private:
     int _numIntersections;
     unsigned long _queryLen;
     unsigned long _dbLen;
+    unsigned long _queryCounts;
+    unsigned long _dbCounts;
+    unsigned long _overlapCounts;
     bool getFisher();
     BedFile *exclude;
 
+    vector<int> _qsizes;
     unsigned long getTotalIntersection(RecordKeyVector &hits);
 };
 
diff --git a/src/fisher/fisherMain.cpp b/src/fisher/fisherMain.cpp
index 755dc97a..d35b2868 100644
--- a/src/fisher/fisherMain.cpp
+++ b/src/fisher/fisherMain.cpp
@@ -32,13 +32,9 @@ void fisher_help(void) {
     cerr << "\nTool:    bedtools fisher (aka fisher)" << endl;
     cerr << "Version: " << VERSION << "\n";
     cerr << "Summary: Calculate Fisher statistic b/w two feature files."
-         << endl
-         << "\t Fisher is the length of the intersection over the union."
-         << endl
-         << "\t Values range from 0 (no intersection) to 1 (self intersection)."
          << endl << endl;
 
-    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl;
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -c <possible counts>" << endl << endl;
 
     cerr << "Options: " << endl;
 
diff --git a/src/utils/Contexts/ContextFisher.cpp b/src/utils/Contexts/ContextFisher.cpp
index 9452c069..fd00f982 100644
--- a/src/utils/Contexts/ContextFisher.cpp
+++ b/src/utils/Contexts/ContextFisher.cpp
@@ -7,7 +7,7 @@
 
 ContextFisher::ContextFisher() {
 	setSortedInput(true);
-	setUseMergedIntervals(true);
+	setUseMergedIntervals(false);
 }
 
 ContextFisher::~ContextFisher() {
diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp
index f7561e8c..82a9860d 100644
--- a/src/utils/NewChromsweep/NewChromsweep.cpp
+++ b/src/utils/NewChromsweep/NewChromsweep.cpp
@@ -16,34 +16,36 @@
 
 NewChromSweep::NewChromSweep(ContextIntersect *context,
                        bool useMergedIntervals)
-:	_context(context),
- 	_queryFRM(NULL),
- 	_numDBs(_context->getNumDatabaseFiles()),
-  	_useMergedIntervals(false),
- 	_queryRecordsTotalLength(0),
- 	_databaseRecordsTotalLength(0),
- 	_wasInitialized(false),
- 	_currQueryRec(NULL),
- 	_runToQueryEnd(false)
+:   _context(context),
+    _queryFRM(NULL),
+    _numDBs(_context->getNumDatabaseFiles()),
+    _useMergedIntervals(false),
+    _queryRecordsTotalLength(0),
+    _databaseRecordsTotalLength(0),
+    _queryTotalRecords(0),
+    _databaseTotalRecords(0),
+    _wasInitialized(false),
+    _currQueryRec(NULL),
+    _runToQueryEnd(false)
 {
 }
 
 
 bool NewChromSweep::init() {
     
-	//Create new FileRecordMgrs for the input files.
-	//Open them, and get the first record from each.
-	//otherwise, return true.
+    //Create new FileRecordMgrs for the input files.
+    //Open them, and get the first record from each.
+    //otherwise, return true.
     _queryFRM = _context->getFile(_context->getQueryFileIdx());
     
     _dbFRMs.resize(_numDBs, NULL);
     for (int i=0; i < _numDBs; i++) {
-    	_dbFRMs[i] = _context->getDatabaseFile(i);
+        _dbFRMs[i] = _context->getDatabaseFile(i);
     }
 
     _currDbRecs.resize(_numDBs, NULL);
     for (int i=0; i < _numDBs; i++) {
-    	nextRecord(false, i);
+        nextRecord(false, i);
     }
 
     _caches.resize(_numDBs);
@@ -52,50 +54,50 @@ bool NewChromSweep::init() {
     //end of the query file is hit as well.
 
     if (_context->getNoHit() || _context->getWriteCount() || _context->getWriteOverlap() || _context->getWriteAllOverlap() || _context->getLeftJoin()) {
-    	_runToQueryEnd = true;
+        _runToQueryEnd = true;
     }
     _wasInitialized = true;
     return true;
  }
 
 void NewChromSweep::closeOut() {
-	while (!_queryFRM->eof()) {
-		nextRecord(true);
-	}
+    while (!_queryFRM->eof()) {
+        nextRecord(true);
+    }
 
     for (int i=0; i < _numDBs; i++) {
-    	while (!_dbFRMs[i]->eof()) {
-    		nextRecord(false, i);
-    	}
+        while (!_dbFRMs[i]->eof()) {
+            nextRecord(false, i);
+        }
     }
 }
 
 NewChromSweep::~NewChromSweep(void) {
-	if (!_wasInitialized) {
-		return;
-	}
-	_queryFRM->deleteRecord(_currQueryRec);
-	_currQueryRec = NULL;
+    if (!_wasInitialized) {
+        return;
+    }
+    _queryFRM->deleteRecord(_currQueryRec);
+    _currQueryRec = NULL;
 
 
     for (int i=0; i < _numDBs; i++) {
-    	_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
-    	_currDbRecs[i] = NULL;
+        _dbFRMs[i]->deleteRecord(_currDbRecs[i]);
+        _currDbRecs[i] = NULL;
     }
 
-	_queryFRM->close();
+    _queryFRM->close();
 
    for (int i=0; i < _numDBs; i++) {
-	   _dbFRMs[i]->close();
+       _dbFRMs[i]->close();
    }
 }
 
 
 void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
-	recListIterType cacheIter = _caches[dbIdx].begin();
+    recListIterType cacheIter = _caches[dbIdx].begin();
     while (cacheIter != _caches[dbIdx].end())
     {
-    	const Record *cacheRec = cacheIter->value();
+        const Record *cacheRec = cacheIter->value();
         if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) {
             if (intersects(_currQueryRec, cacheRec)) {
                 retList.push_back(cacheRec);
@@ -104,151 +106,153 @@ void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) {
         }
         else {
             cacheIter = _caches[dbIdx].deleteCurrent();
-    		_dbFRMs[dbIdx]->deleteRecord(cacheRec);
+            _dbFRMs[dbIdx]->deleteRecord(cacheRec);
         }
     }
 }
 
 void NewChromSweep::clearCache(int dbIdx)
 {
-	//delete all objects pointed to by cache
-	recListType &cache = _caches[dbIdx];
-	for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) {
-		_dbFRMs[dbIdx]->deleteRecord(iter->value());
-	}
-	cache.clear();
+    //delete all objects pointed to by cache
+    recListType &cache = _caches[dbIdx];
+    for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) {
+        _dbFRMs[dbIdx]->deleteRecord(iter->value());
+    }
+    cache.clear();
 }
 
 void NewChromSweep::masterScan(RecordKeyVector &retList) {
-	for (int i=0; i < _numDBs; i++) {
-		if (dbFinished(i) || chromChange(i, retList)) {
-			continue;
-		} else {
-
-			// scan the database cache for hits
-			scanCache(i, retList);
-			//skip if we hit the end of the DB
-			// advance the db until we are ahead of the query. update hits and cache as necessary
-			while (_currDbRecs[i] != NULL &&
-					_currQueryRec->sameChrom(_currDbRecs[i]) &&
-					!(_currDbRecs[i]->after(_currQueryRec))) {
-				if (intersects(_currQueryRec, _currDbRecs[i])) {
-					retList.push_back(_currDbRecs[i]);
-				}
-				if (_currQueryRec->after(_currDbRecs[i])) {
-					_dbFRMs[i]->deleteRecord(_currDbRecs[i]);
-					_currDbRecs[i] = NULL;
-				} else {
-					_caches[i].push_back(_currDbRecs[i]);
-					_currDbRecs[i] = NULL;
-				}
-				nextRecord(false, i);
-			}
-		}
-	}
+    for (int i=0; i < _numDBs; i++) {
+        if (dbFinished(i) || chromChange(i, retList)) {
+            continue;
+        } else {
+
+            // scan the database cache for hits
+            scanCache(i, retList);
+            //skip if we hit the end of the DB
+            // advance the db until we are ahead of the query. update hits and cache as necessary
+            while (_currDbRecs[i] != NULL &&
+                    _currQueryRec->sameChrom(_currDbRecs[i]) &&
+                    !(_currDbRecs[i]->after(_currQueryRec))) {
+                if (intersects(_currQueryRec, _currDbRecs[i])) {
+                    retList.push_back(_currDbRecs[i]);
+                }
+                if (_currQueryRec->after(_currDbRecs[i])) {
+                    _dbFRMs[i]->deleteRecord(_currDbRecs[i]);
+                    _currDbRecs[i] = NULL;
+                } else {
+                    _caches[i].push_back(_currDbRecs[i]);
+                    _currDbRecs[i] = NULL;
+                }
+                nextRecord(false, i);
+            }
+        }
+    }
 }
 
 bool NewChromSweep::chromChange(int dbIdx, RecordKeyVector &retList)
 {
     // the files are on the same chrom
-	if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
-		return false;
-	}
-	// the query is ahead of the database. fast-forward the database to catch-up.
-	if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
-
-		while (_currDbRecs[dbIdx] != NULL &&
-				_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
-			_dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
-			nextRecord(false, dbIdx);
-		}
-		clearCache(dbIdx);
+    if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) {
+        return false;
+    }
+    // the query is ahead of the database. fast-forward the database to catch-up.
+    if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
+
+        while (_currDbRecs[dbIdx] != NULL &&
+                _currQueryRec->chromAfter(_currDbRecs[dbIdx])) {
+            _dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]);
+            nextRecord(false, dbIdx);
+        }
+        clearCache(dbIdx);
         return false;
     }
     // the database is ahead of the query.
     else {
         // 1. scan the cache for remaining hits on the query's current chrom.
-		scanCache(dbIdx, retList);
+        scanCache(dbIdx, retList);
 
         return true;
     }
 
-	//control can't reach here, but compiler still wants a return statement.
-	return true;
+    //control can't reach here, but compiler still wants a return statement.
+    return true;
 }
 
 
 bool NewChromSweep::next(RecordKeyVector &retList) {
-	retList.clearVector();
-	if (_currQueryRec != NULL) {
-		_queryFRM->deleteRecord(_currQueryRec);
-	}
+    retList.clearVector();
+    if (_currQueryRec != NULL) {
+        _queryFRM->deleteRecord(_currQueryRec);
+    }
 
-	if (!nextRecord(true)) return false; // query EOF hit
+    if (!nextRecord(true)) return false; // query EOF hit
 
 
-	if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) {
-		return false;
-	}
-	_currChromName = _currQueryRec->getChrName();
+    if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) {
+        return false;
+    }
+    _currChromName = _currQueryRec->getChrName();
 
-	masterScan(retList);
+    masterScan(retList);
 
-	if (_context->getSortOutput()) {
-		retList.sortVector();
-	}
+    if (_context->getSortOutput()) {
+        retList.sortVector();
+    }
 
-	retList.setKey(_currQueryRec);
-	return true;
+    retList.setKey(_currQueryRec);
+    return true;
 }
 
 bool NewChromSweep::nextRecord(bool query, int dbIdx) {
-	if (query) {
-		_currQueryRec = _queryFRM->getNextRecord();
-		if (_currQueryRec != NULL) {
-			_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
-			return true;
-		}
-		return false;
-	} else { //database
-		Record *rec = _dbFRMs[dbIdx]->getNextRecord();
-		_currDbRecs[dbIdx] = rec;
-		if (rec != NULL) {
-			_databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
-			return true;
-		}
-		return false;
-	}
+    if (query) {
+        _currQueryRec = _queryFRM->getNextRecord();
+        if (_currQueryRec != NULL) {
+            _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
+            _queryTotalRecords++;
+            return true;
+        }
+        return false;
+    } else { //database
+        Record *rec = _dbFRMs[dbIdx]->getNextRecord();
+        _currDbRecs[dbIdx] = rec;
+        if (rec != NULL) {
+            _databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
+            _databaseTotalRecords++;
+            return true;
+        }
+        return false;
+    }
 }
 
 bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const
 {
-	return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
-			_context->getOverlapFraction(), _context->getReciprocal());
+    return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(),
+            _context->getOverlapFraction(), _context->getReciprocal());
 }
 
 
 bool NewChromSweep::allCachesEmpty() {
-	for (int i=0; i < _numDBs; i++) {
-		if (!_caches[i].empty()) {
-			return false;
-		}
-	}
-	return true;
+    for (int i=0; i < _numDBs; i++) {
+        if (!_caches[i].empty()) {
+            return false;
+        }
+    }
+    return true;
 }
 
 bool NewChromSweep::allCurrDBrecsNull() {
-	for (int i=0; i < _numDBs; i++) {
-		if (_currDbRecs[i] != NULL) {
-			return false;
-		}
-	}
-	return true;
+    for (int i=0; i < _numDBs; i++) {
+        if (_currDbRecs[i] != NULL) {
+            return false;
+        }
+    }
+    return true;
 }
 
 bool NewChromSweep::dbFinished(int dbIdx) {
-	if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) {
-		return true;
-	}
-	return false;
+    if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) {
+        return true;
+    }
+    return false;
 }
diff --git a/src/utils/NewChromsweep/NewChromsweep.h b/src/utils/NewChromsweep/NewChromsweep.h
index 1dfe5b7b..aca2bb1d 100644
--- a/src/utils/NewChromsweep/NewChromsweep.h
+++ b/src/utils/NewChromsweep/NewChromsweep.h
@@ -49,6 +49,9 @@ public:
     unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; }
     unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; }
 
+    unsigned long getQueryTotalRecords() { return _queryTotalRecords; }
+    unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; }
+
 private:
     ContextIntersect *_context;
     FileRecordMgr *_queryFRM;
@@ -63,6 +66,9 @@ private:
 
     unsigned long _databaseRecordsTotalLength;
 
+    unsigned long _queryTotalRecords;
+    unsigned long _databaseTotalRecords;
+
     bool _wasInitialized;
 
     // a cache of still active features from the database file
diff --git a/test/fisher/README.md b/test/fisher/README.md
new file mode 100644
index 00000000..d0179ca9
--- /dev/null
+++ b/test/fisher/README.md
@@ -0,0 +1,20 @@
+Fisher Testing
+==============
+
+Fisher is now based on the count of interval overlaps, subject to `-f`.
+We can compare the output of fisher on simulated data by running `python sim.py`
+which will show the output from `bedtools fisher` and then running `bash shuf.sh`
+which will repeatedly run
+
+```Shell
+
+bedtools intersect -wo -a taa.bed -b <(bedtools shuffle -allowBeyondChromEnd   -i tbb.bed -g tgg.genome) | wc -l 
+```
+
+and then report the proportion of times that number is >= the observed intersection.
+
+In `sim.py` changing the lenght of the intervals (via `maxA`, `maxB`) has the greatest effect on the
+correspondence of the simulated p-value from `shuf.sh` and the one from `fisher`. The right-tailed p-value
+from `fisher` should correspond well to the value from the simulation.
+
+
diff --git a/test/fisher/shuf.sh b/test/fisher/shuf.sh
new file mode 100644
index 00000000..ec0755aa
--- /dev/null
+++ b/test/fisher/shuf.sh
@@ -0,0 +1,4 @@
+obs=$(bedtools intersect -wo -a taa.bed -b tbb.bed | wc -l)
+
+seq 1000 | xargs -P 7 -n 1 bash -c "bedtools intersect -wo -a taa.bed -b <(bedtools shuffle -allowBeyondChromEnd   -i tbb.bed -g tgg.genome) | wc -l " | awk -vobs=$obs '{s += ($1 > obs)}END{print s / NR}'
+
diff --git a/test/fisher/sim.py b/test/fisher/sim.py
new file mode 100644
index 00000000..3379507e
--- /dev/null
+++ b/test/fisher/sim.py
@@ -0,0 +1,27 @@
+import sys
+from random import randint
+from subprocess import check_output
+
+genome_size = 25000000
+nA = 210
+nB = 3390
+
+minA, maxA = (20, 2000)
+minB, maxB = (20, 1250)
+
+for f, imin, imax, n in (
+        ('taa.bed', minA, maxA, nA),
+        ('tbb.bed', minB, maxB, nB)):
+    with open(f, 'w') as fh:
+        vals = []
+        for i in range(n):
+            s = randint(0, genome_size - imax)
+            e = randint(s + imin, min(genome_size, s + imax))
+            vals.append((s, e))
+        for s, e in sorted(vals):
+            fh.write("chr1\t%i\t%i\n" % (s, e))
+        fh.flush()
+
+print >> open('tgg.genome', 'w'), ("chr1\t%i" % genome_size)
+
+print check_output("../../bin/bedtools fisher -a taa.bed -b tbb.bed -g tgg.genome", shell=True)
-- 
GitLab