Merge pull request #94 from brentp/stats

Fisher's Exact Test

Merge pull request #94 from brentp/stats
Fisher's Exact Test
1b311e84 · Aaron Quinlan · fef96d48 · 762deee0 · 1b311e84 · 1b311e84
Commit 1b311e84 authored 10 years ago by Aaron Quinlan
--- a/Makefile
+++ b/Makefile
@@ -42,6 +42,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
 		  $(SRC_DIR)/getOverlap \
 		  $(SRC_DIR)/groupBy \
 		  $(SRC_DIR)/intersectFile \
+		  $(SRC_DIR)/fisher \
 		  $(SRC_DIR)/jaccard \
 		  $(SRC_DIR)/linksBed \
 		  $(SRC_DIR)/maskFastaFromBed \
@@ -212,4 +213,4 @@ autoversion:
 		echo "#define VERSION_GIT_H" >> $(VERSION_FILE) ; \
 		echo "#define VERSION_GIT \"$${DETECTED_VERSION}\"" >> $(VERSION_FILE) ; \
 		echo "#endif /* VERSION_GIT_H */" >> $(VERSION_FILE) ; \
-	fi )
\ No newline at end of file
+	fi )
--- a/docs/content/bedtools-suite.rst
+++ b/docs/content/bedtools-suite.rst
@@ -28,6 +28,7 @@ The full list of `bedtools` sub-commands.
   tools/coverage   
   tools/expand     
   tools/flank      
+   tools/fisher
   tools/genomecov  
   tools/getfasta    
   tools/groupby    
@@ -56,4 +57,4 @@ The full list of `bedtools` sub-commands.
   tools/window  
   
   
-  
\ No newline at end of file
+  
--- a/docs/content/tools/fisher.rst
+++ b/docs/content/tools/fisher.rst
+.. _fisher:
+
+########
+*fisher*
+########
+
+Perform fisher's exact test on the non/overlap between 2 files.
+
+|
+
+Traditionally, in order to test whether 2 sets of intervals are related
+spatially, we resort to shuffling the genome and checking the simulated
+(shuffled) versus the observed. We can do the same analytically for many
+scenarios using 
+`Fisher's Exact Test`_ .
+
+
+
+Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
+
+.. code-block:: bash
+
+  $ cat a.bed
+  chr1  10  20
+  chr1  30  40
+
+  $ cat b.bed
+  chr1  15   25
+
+And a genome of 500 bases:
+
+.. code-block:: bash
+
+    $ echo -e "chr1\t500" > t.genome
+
+We may wish to know **if the amount of overlap between the 2 sets of intervals is
+more than we would expect given their coverage and the size of the genome**. We
+can do this with ``fisher`` as:
+
+.. code-block:: bash
+
+    $ bedtools fisher -a a.bed -b b.bed -g t.genome
+    # Contingency Table
+    #_________________________________________
+    #           | not in -b    | in -b        |
+    # not in -a | 475          | 15           |
+    #     in -a | 5            | 5            |
+    #_________________________________________
+    # p-values for fisher's exact test
+    left    right   two-tail    ratio
+    1.00000 0.00001 0.00001 31.667
+
+
+Where we can see the constructed contingency table and the pvalues for left, right
+and two-tail tests.
+From here, we can say that given **500 bases** of genome, it is unlikely that a region of
+20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions
+were randomly distributed. *Consult your statistician for a more precise definition*.
+
+The above was highly significant, but if our genome were only **50 bases**:
+
+.. code-block:: bash
+
+    $ echo -e "chr1\t50" > t.genome
+    $ bedtools fisher -a a.bed -b b.bed -g t.genome
+    # Contingency Table
+    #_________________________________________
+    #           | not in -b    | in -b        |
+    # not in -a | 25           | 15           |
+    #     in -a | 5            | 5            |
+    #_________________________________________
+    # p-values for fisher's exact test
+    left    right   two-tail    ratio
+    0.86011 0.35497 0.49401 1.667
+
+
+We can see that neither tail is significant. Intuitively, this makes sense; 
+if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals
+within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap.
+
+
+.. note::
+
+    The ``fisher`` tool requires that your data is pre-sorted by chromosome and
+    then by start position (e.g., ``sort -k1,1 -k2,2n in.bed > in.sorted.bed``
+    for BED files).
+
+    This uses Heng Li's implementation of Fisher's exact test in kfunc.c.
+
+.. seealso::
+
+    :doc:`../tools/jaccard`
+    :doc:`../tools/reldist`
+    :doc:`../tools/intersect`
+    
+
+===============================
+Usage and option summary
+===============================
+**Usage**:
+::
+
+  bedtools fisher [OPTIONS] -a <BED/GFF/VCF> -b <BED/GFF/VCF> -g <genome>
+
+
+===========================    =========================================================================================================================================================
+Option                         Description
+===========================    =========================================================================================================================================================
+**-a**                           BED/GFF/VCF file A. Each feature in A is compared to B in search of overlaps. Use "stdin" if passing A with a UNIX pipe.
+**-b**                           BED/GFF/VCF file B. Use "stdin" if passing B with a UNIX pipe.
+**-g**                           genome file listing chromosome size.
+**-f**                           Minimum overlap required as a fraction of A. Default is 1E-9 (i.e. 1bp).
+**-r**                           Require that the fraction of overlap be reciprocal for A and B. In other words, if -f is 0.90 and -r is used, this requires that B overlap at least 90% of A and that A also overlaps at least 90% of B.
+**-s**                         Force "strandedness". That is, only report hits in B that overlap A on the same strand. By default, overlaps are reported without respect to strand.
+**-S**                         Require different strandedness.  That is, only report hits in B that overlap A on the _opposite_ strand. By default, overlaps are reported without respect to strand.
+**-split**                     Treat "split" BAM (i.e., having an "N" CIGAR operation) or BED12 entries as distinct BED intervals.
+===========================    =========================================================================================================================================================
+
+.. _Fisher's Exact Test: http://en.wikipedia.org/wiki/Fisher's_exact_test
--- a/src/bedtools.cpp
+++ b/src/bedtools.cpp
@@ -54,6 +54,7 @@ int getoverlap_main(int argc, char* argv[]);//
 int groupby_main(int argc, char* argv[]);//
 int intersect_main(int argc, char* argv[]); //
 int jaccard_main(int argc, char* argv[]); //
+int fisher_main(int argc, char* argv[]); //
 int links_main(int argc, char* argv[]);//
 int maskfastafrombed_main(int argc, char* argv[]);//
 int map_main(int argc, char* argv[]); //
@@ -131,6 +132,7 @@ int main(int argc, char *argv[])
    // statistics tools
    else if (sub_cmd == "jaccard")     return jaccard_main(argc-1, argv+1);
    else if (sub_cmd == "reldist")     return reldist_main(argc-1, argv+1);
+    else if (sub_cmd == "fisher")     return fisher_main(argc-1, argv+1);

    // misc. tools
    else if (sub_cmd == "overlap")     return getoverlap_main(argc-1, argv+1);

--- a/src/fisher/Fisher.cpp
+++ b/src/fisher/Fisher.cpp
+
+#include "Fisher.h"
+#include "BlockMgr.h"
+#include "NewChromsweep.h"
+#include "kfunc.c"
+
+Fisher::Fisher(ContextFisher *context)
+: _context(context),
+  _intersectionVal(0),
+  _unionVal(0),
+  _numIntersections(0),
+  _queryLen(0),
+  _dbLen(0)
+{
+	_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
+}
+
+Fisher::~Fisher(void) {
+	delete _blockMgr;
+	_blockMgr = NULL;
+}
+
+bool Fisher::calculate() {
+
+	if (!getFisher()) {
+		return false;
+	}
+
+	// header
+	cout << "# Contingency Table" << endl;
+
+    // for fisher's exact test, we need the contingency table
+    // XXXXXXXX | not in A        | in A
+    // not in B | n11: in neither | n12: only in A
+    // in B     | n21: only in B  | n22: in A & B
+    //
+    double left, right, two;
+
+    long long genomeSize = _context->getGenomeFile()->getGenomeSize();
+    // bases covered by neither a nor b
+    long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal;
+    // bases covered only by -a
+    long long n12 = _queryLen - _intersectionVal;
+    // bases covered only by -b
+    long long n21 = _dbLen - _intersectionVal;
+    // bases covered by both
+    long long n22 = _intersectionVal;
+
+    printf("#_________________________________________\n");
+    printf("#           | %-12s | %-12s |\n", "not in -b", "in -b");
+    printf("# not in -a | %-12lld | %-12lld |\n", n11, n12);
+    printf("#     in -a | %-12lld | %-12lld |\n", n21, n22);
+    printf("#_________________________________________\n");
+
+    kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
+    double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22);
+
+    printf("# p-values for fisher's exact test\n");
+    printf("left\tright\ttwo-tail\tratio\n");
+    printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio);
+    
+    //kt_fisher_exact(50010000, 10000000, 15000000, 3000000, &left, &right, &two);
+
+	return true;
+}
+
+bool Fisher::getFisher() {
+	NewChromSweep sweep(_context);
+	if (!sweep.init()) {
+		return false;
+	}
+	RecordKeyList hitSet;
+	while (sweep.next(hitSet)) {
+		if (_context->getObeySplits()) {
+			RecordKeyList keySet(hitSet.getKey());
+			RecordKeyList resultSet(hitSet.getKey());
+			_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
+			_intersectionVal += getTotalIntersection(&resultSet);
+		} else {
+			_intersectionVal += getTotalIntersection(&hitSet);
+		}
+	}
+
+	sweep.closeOut();
+	_queryLen = sweep.getQueryTotalRecordLength();
+	_dbLen = sweep.getDatabaseTotalRecordLength();
+
+	_unionVal = _queryLen + _dbLen;
+	return true;
+}
+
+unsigned long Fisher::getTotalIntersection(RecordKeyList *recList)
+{
+	unsigned long intersection = 0;
+	const Record *key = recList->getKey();
+	int keyStart = key->getStartPos();
+	int keyEnd = key->getEndPos();
+
+	int hitIdx = 0;
+	for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) {
+		const Record *currRec = iter->value();
+		int maxStart = max(currRec->getStartPos(), keyStart);
+		int minEnd = min(currRec->getEndPos(), keyEnd);
+		if (_context->getObeySplits()) {
+			intersection += _blockMgr->getOverlapBases(hitIdx);
+			hitIdx++;
+		} else {
+			intersection += (unsigned long)(minEnd - maxStart);
+		}
+	}
+	_numIntersections += (int)recList->size();
+	return intersection;
+}
+
--- a/src/fisher/Fisher.h
+++ b/src/fisher/Fisher.h
+#ifndef FISHER_H
+#define FISHER_H
+
+#include "ContextFisher.h"
+
+class BlockMgr;
+
+class Fisher {
+
+public:
+
+    Fisher(ContextFisher *context);
+    ~Fisher();
+
+    bool calculate();
+
+private:
+    ContextFisher *_context;
+    BlockMgr *_blockMgr;
+    unsigned long _intersectionVal;
+    unsigned long _unionVal;
+    int _numIntersections;
+    unsigned long _queryLen;
+    unsigned long _dbLen;
+    bool getFisher();
+
+    unsigned long getTotalIntersection(RecordKeyList *hits);
+};
+
+#endif /* FISHER_H */
--- a/src/fisher/Makefile
+++ b/src/fisher/Makefile
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+
+INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
+			-I$(UTILITIES_DIR)/general/ \
+			-I$(UTILITIES_DIR)/fileType/ \
+			-I$(UTILITIES_DIR)/gzstream/ \
+           -I$(UTILITIES_DIR)/GenomeFile/ \
+           -I$(UTILITIES_DIR)/BamTools/include \
+           -I$(UTILITIES_DIR)/BamTools/src \
+           -I$(UTILITIES_DIR)/BlockedIntervals \
+           -I$(UTILITIES_DIR)/BamTools-Ancillary \
+           -I$(UTILITIES_DIR)/FileRecordTools/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
+           -I$(UTILITIES_DIR)/FileRecordTools/Records/ \
+           -I$(UTILITIES_DIR)/RecordOutputMgr/ \
+			-I$(UTILITIES_DIR)/KeyListOps/ \
+           -I$(UTILITIES_DIR)/NewChromsweep \
+           -I$(UTILITIES_DIR)/VectorOps \
+		   -I . \
+           -I$(UTILITIES_DIR)/BinTree \
+           -I$(UTILITIES_DIR)/version/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= fisherMain.cpp Fisher.cpp Fisher.h kfunc.c
+OBJECTS= fisherMain.o Fisher.o
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+PROGRAM= Fisher
+
+all: $(BUILT_OBJECTS)
+
+.PHONY: all
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
+	
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/fisherMain.o $(OBJ_DIR)/Fisher.o
+
+.PHONY: clean
--- a/src/fisher/fisherMain.cpp
+++ b/src/fisher/fisherMain.cpp
+#include "Fisher.h"
+#include "version.h"
+
+using namespace std;
+
+#define PROGRAM_NAME "bedtools fisher"
+
+void fisher_help(void);
+
+int fisher_main(int argc, char* argv[]) {
+
+    ContextFisher *context = new ContextFisher();
+    if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
+    	if (!context->getErrorMsg().empty()) {
+    		cerr << context->getErrorMsg() << endl;
+    	}
+    	fisher_help();
+    	delete context;
+    	return 0;
+    }
+	Fisher *fisher = new Fisher(context);
+
+	bool retVal = fisher->calculate();
+	delete fisher;
+	delete context;
+	return retVal ? 0 : 1;
+}
+
+
+void fisher_help(void) {
+
+    cerr << "\nTool:    bedtools fisher (aka fisher)" << endl;
+    cerr << "Version: " << VERSION << "\n";
+    cerr << "Summary: Calculate Fisher statistic b/w two feature files."
+         << endl
+         << "\t Fisher is the length of the intersection over the union."
+         << endl
+         << "\t Values range from 0 (no intersection) to 1 (self intersection)."
+         << endl << endl;
+
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl;
+
+    cerr << "Options: " << endl;
+
+
+    cerr << "\t-f\t"            << "Minimum overlap required as a fraction of A." << endl;
+    cerr                        << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
+    cerr                        << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
+
+    cerr << "\t-r\t"            << "Require that the fraction overlap be reciprocal for A and B." << endl;
+    cerr                        << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl;
+    cerr                        << "\t\t  that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl;
+
+    cerr << "\t-split\t"        << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl;
+
+    cerr << "\t-s\t"           << "Force strandedness.  That is, only merge features" << endl;
+    cerr                       << "\t\tthat are on the same strand." << endl;
+    cerr                       << "\t\t- By default, merging is done without respect to strand." << endl << endl;
+
+    cerr << "\t-S\t"		   << "Force merge for one specific strand only." << endl;
+    cerr << "\t\t"             << "Follow with + or - to force merge from only" << endl;
+    cerr << "\t\t"			   << "the forward or reverse strand, respectively." << endl;
+    cerr << "\t\t"			   << "- By default, merging is done without respect to strand." << endl << endl;
+
+    cerr << "Notes: " << endl;
+    cerr << "\t(1) Input files must be sorted by chrom, then start position."
+         << endl << endl;
+
+    // end the program here
+    exit(1);
+
+}
--- a/src/fisher/kfunc.c
+++ b/src/fisher/kfunc.c
+#include <math.h>
+#include <stdlib.h>
+
+/* Log gamma function
+ * \log{\Gamma(z)}
+ * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
+ */
+double kf_lgamma(double z)
+{
+	double x = 0;
+	x += 0.1659470187408462e-06 / (z+7);
+	x += 0.9934937113930748e-05 / (z+6);
+	x -= 0.1385710331296526     / (z+5);
+	x += 12.50734324009056      / (z+4);
+	x -= 176.6150291498386      / (z+3);
+	x += 771.3234287757674      / (z+2);
+	x -= 1259.139216722289      / (z+1);
+	x += 676.5203681218835      / z;
+	x += 0.9999999999995183;
+	return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
+}
+
+/* complementary error function
+ * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
+ * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
+ */
+double kf_erfc(double x)
+{
+	const double p0 = 220.2068679123761;
+	const double p1 = 221.2135961699311;
+	const double p2 = 112.0792914978709;
+	const double p3 = 33.912866078383;
+	const double p4 = 6.37396220353165;
+	const double p5 = .7003830644436881;
+	const double p6 = .03526249659989109;
+	const double q0 = 440.4137358247522;
+	const double q1 = 793.8265125199484;
+	const double q2 = 637.3336333788311;
+	const double q3 = 296.5642487796737;
+	const double q4 = 86.78073220294608;
+	const double q5 = 16.06417757920695;
+	const double q6 = 1.755667163182642;
+	const double q7 = .08838834764831844;
+	double expntl, z, p;
+	z = fabs(x) * M_SQRT2;
+	if (z > 37.) return x > 0.? 0. : 2.;
+	expntl = exp(z * z * - .5);
+	if (z < 10. / M_SQRT2) // for small z
+	    p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
+			/ (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
+	else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
+	return x > 0.? 2. * p : 2. * (1. - p);
+}
+
+/* The following computes regularized incomplete gamma functions.
+ * Formulas are taken from Wiki, with additional input from Numerical
+ * Recipes in C (for modified Lentz's algorithm) and AS245
+ * (http://lib.stat.cmu.edu/apstat/245).
+ *
+ * A good online calculator is available at:
+ *
+ *   http://www.danielsoper.com/statcalc/calc23.aspx
+ *
+ * It calculates upper incomplete gamma function, which equals
+ * kf_gammaq(s,z)*tgamma(s).
+ */
+
+#define KF_GAMMA_EPS 1e-14
+#define KF_TINY 1e-290
+
+// regularized lower incomplete gamma function, by series expansion
+static double _kf_gammap(double s, double z)
+{
+	double sum, x;
+	long long k;
+	for (k = 1, sum = x = 1.; k < 100; ++k) {
+		sum += (x *= z / (s + k));
+		if (x / sum < KF_GAMMA_EPS) break;
+	}
+	return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
+}
+// regularized upper incomplete gamma function, by continued fraction
+static double _kf_gammaq(double s, double z)
+{
+	long long j;
+	double C, D, f;
+	f = 1. + z - s; C = f; D = 0.;
+	// Modified Lentz's algorithm for computing continued fraction
+	// See Numerical Recipes in C, 2nd edition, section 5.2
+	for (j = 1; j < 100; ++j) {
+		double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
+		D = b + a * D;
+		if (D < KF_TINY) D = KF_TINY;
+		C = b + a / C;
+		if (C < KF_TINY) C = KF_TINY;
+		D = 1. / D;
+		d = C * D;
+		f *= d;
+		if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+	}
+	return exp(s * log(z) - z - kf_lgamma(s) - log(f));
+}
+
+double kf_gammap(double s, double z)
+{
+	return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
+}
+
+double kf_gammaq(double s, double z)
+{
+	return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
+}
+
+/* Regularized incomplete beta function. The method is taken from
+ * Numerical Recipe in C, 2nd edition, section 6.4. The following web
+ * page calculates the incomplete beta function, which equals
+ * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
+ *
+ *   http://www.danielsoper.com/statcalc/calc36.aspx
+ */
+static double kf_betai_aux(double a, double b, double x)
+{
+	double C, D, f;
+	long long j;
+	if (x == 0.) return 0.;
+	if (x == 1.) return 1.;
+	f = 1.; C = f; D = 0.;
+	// Modified Lentz's algorithm for computing continued fraction
+	for (j = 1; j < 200; ++j) {
+		double aa, d;
+		long long m = j>>1;
+		aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
+			: m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
+		D = 1. + aa * D;
+		if (D < KF_TINY) D = KF_TINY;
+		C = 1. + aa / C;
+		if (C < KF_TINY) C = KF_TINY;
+		D = 1. / D;
+		d = C * D;
+		f *= d;
+		if (fabs(d - 1.) < KF_GAMMA_EPS) break;
+	}
+	return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
+}
+double kf_betai(double a, double b, double x)
+{
+	return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
+}
+
+#ifdef KF_MAIN
+#include <stdio.h>
+long long main(long long argc, char *argv[])
+{
+	double x = 5.5, y = 3;
+	double a, b;
+	printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
+	printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
+	a = 2; b = 2; x = 0.5;
+	printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
+	return 0;
+}
+#endif
+
+
+// log\binom{n}{k}
+static double lbinom(long long n, long long k)
+{
+    if (k == 0 || n == k) return 0;
+    return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
+}
+
+// n11  n12  | n1_
+// n21  n22  | n2_
+//-----------+----
+// n_1  n_2  | n
+
+// hypergeometric distribution
+static double hypergeo(long long n11, long long n1_, long long n_1, long long n)
+{
+    return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));
+}
+
+typedef struct {
+    long long n11, n1_, n_1, n;
+    double p;
+} hgacc_t;
+
+// incremental version of hypergenometric distribution
+static double hypergeo_acc(long long n11, long long n1_, long long n_1, long long n, hgacc_t *aux)
+{
+    if (n1_ || n_1 || n) {
+        aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n;
+    } else { // then only n11 changed; the rest fixed
+        if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) {
+            if (n11 == aux->n11 + 1) { // incremental
+                aux->p *= (double)(aux->n1_ - aux->n11) / n11
+                    * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1);
+                aux->n11 = n11;
+                return aux->p;
+            }
+            if (n11 == aux->n11 - 1) { // incremental
+                aux->p *= (double)aux->n11 / (aux->n1_ - n11)
+                    * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11);
+                aux->n11 = n11;
+                return aux->p;
+            }
+        }
+        aux->n11 = n11;
+    }
+    aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n);
+    return aux->p;
+}
+
+double kt_fisher_exact(long long n11, long long n12, long long n21, long long n22, double *_left, double *_right, double *two)
+{
+    long long i, j, max, min;
+    double p, q, left, right;
+    hgacc_t aux;
+    long long n1_, n_1, n;
+
+    n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n
+    max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail
+    min = n1_ + n_1 - n;    // not sure why n11-n22 is used instead of min(n_1,n1_)
+    if (min < 0) min = 0; // min n11, for left tail
+    *two = *_left = *_right = 1.;
+    if (min == max) return 1.; // no need to do test
+    q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table
+    // left tail
+    p = hypergeo_acc(min, 0, 0, 0, &aux);
+    for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow
+        left += p, p = hypergeo_acc(i, 0, 0, 0, &aux);
+    --i;
+    if (p < 1.00000001 * q) left += p;
+    else --i;
+    // right tail
+    p = hypergeo_acc(max, 0, 0, 0, &aux);
+    for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow
+        right += p, p = hypergeo_acc(j, 0, 0, 0, &aux);
+    ++j;
+    if (p < 1.00000001 * q) right += p;
+    else ++j;
+    // two-tail
+    *two = left + right;
+    if (*two > 1.) *two = 1.;
+    // adjust left and right
+    if (abs(i - n11) < abs(j - n11)) right = 1. - left + q;
+    else left = 1.0 - right + q;
+    *_left = left; *_right = right;
+    return q;
+}
+
+
+
--- a/src/utils/Contexts/ContextFisher.cpp
+++ b/src/utils/Contexts/ContextFisher.cpp
+/*
+ * ContextFisher.cpp
+ *
+ */
+
+#include "ContextFisher.h"
+
+ContextFisher::ContextFisher() {
+	setSortedInput(true);
+	setUseMergedIntervals(true);
+
+}
+
+ContextFisher::~ContextFisher() {
+} 
+bool ContextFisher::parseCmdArgs(int argc, char **argv, int skipFirstArgs)
+{
+	_argc = argc;
+	_argv = argv;
+	_skipFirstArgs = skipFirstArgs;
+
+	if (_argc < 2) {
+		setShowHelp(true);
+		return false;
+	}
+
+	setProgram(_programNames[argv[0]]);
+
+	_argsProcessed.resize(_argc - _skipFirstArgs, false);
+
+	for (_i=_skipFirstArgs; _i < argc; _i++) {
+		if (isUsed(_i - _skipFirstArgs)) {
+			continue;
+		}
+		else if (strcmp(_argv[_i], "-s") == 0) {
+			if (!handle_s()) return false;
+		}
+		else if (strcmp(_argv[_i], "-S") == 0) {
+			if (!handle_S()) return false;
+		}
+        if (strcmp(_argv[_i], "-g") == 0) {
+              if (!handle_g()) return false;
+        }
+	}
+	return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs);
+}
+
+bool ContextFisher::isValidState()
+{
+	if (!ContextIntersect::isValidState()) {
+		return false;
+	}
+	// Tests for stranded merge
+	//
+	if (_desiredStrand != FileRecordMergeMgr::ANY_STRAND) { // requested stranded merge
+		for (int i=0; i < getNumInputFiles(); i++) {
+			// make sure file has strand.
+			if (!getFile(i)->recordsHaveStrand()) {
+				_errorMsg = "\n***** ERROR: stranded merge requested, but input file ";
+				_errorMsg  += getInputFileName(i);
+				_errorMsg  += " does not have strands. *****";
+				return false;
+			}
+			//make sure file is not VCF.
+			if (getFile(1)->getFileType() == FileRecordTypeChecker::VCF_FILE_TYPE) {
+				_errorMsg = "\n***** ERROR: stranded merge not supported for VCF file ";
+				_errorMsg += getInputFileName(i);
+				_errorMsg += ". *****";
+				return false;
+			}
+		}
+	}
+    if (_genomeFile == NULL){
+        _errorMsg = "\nERROR*****: specify -g genome file*****\n";
+        return false;
+    }
+	//column operations not allowed with BAM input
+	if (hasColumnOpsMethods() &&
+			getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) {
+		_errorMsg = "\n***** ERROR: stranded merge not supported for VCF files. *****";
+		return false;
+	}
+	return true;
+}
+
+bool ContextFisher::handle_s() {
+	setSameStrand(true);
+	_desiredStrand = FileRecordMergeMgr::SAME_STRAND_EITHER;
+	markUsed(_i - _skipFirstArgs);
+	return true;
+}
+
+bool ContextFisher::handle_S() {
+    if ((_i+1) < _argc) {
+    	bool validChar = false;
+    	if (_argv[_i+1][0] == '+') {
+			_desiredStrand = FileRecordMergeMgr::SAME_STRAND_FORWARD;
+			validChar = true;
+    	} else if (_argv[_i+1][0] == '-') {
+    		validChar = true;
+			_desiredStrand = FileRecordMergeMgr::SAME_STRAND_REVERSE;
+    	}
+    	if (validChar) {
+			markUsed(_i - _skipFirstArgs);
+			_i++;
+			markUsed(_i - _skipFirstArgs);
+			setSameStrand(true);
+			return true;
+    	}
+    }
+	_errorMsg = "\n***** ERROR: -S option must be followed by + or -. *****";
+	return false;
+}
+
+
--- a/src/utils/Contexts/ContextFisher.h
+++ b/src/utils/Contexts/ContextFisher.h
+/*
+ * ContextFisher.h
+ *
+ *  Created on: Apr 24, 2014
+ *      Author: nek3d
+ */
+
+#ifndef CONTEXTFISHER_H_
+#define CONTEXTFISHER_H_
+
+#include "ContextIntersect.h"
+#include "GenomeFile.h"
+
+class ContextFisher : public ContextIntersect {
+public:
+  ContextFisher();
+  ~ContextFisher();
+	virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs);
+	virtual bool isValidState();
+
+private:
+	bool handle_s();
+	bool handle_S();
+};
+
+#endif /* CONTEXTFISHER_H_ */
--- a/src/utils/Contexts/Makefile
+++ b/src/utils/Contexts/Makefile
@@ -20,8 +20,8 @@ INCLUDES =  -I$(UTILITIES_DIR)/general/ \
 # define our source and object files
 # ----------------------------------
 SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp \
-	ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp ContextJaccard.h ContextJaccard.cpp
-OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o ContextJaccard.o
+	ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp ContextJaccard.h ContextJaccard.cpp ContextFisher.h ContextFisher.cpp
+OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o ContextJaccard.o ContextFisher.o
 _EXT_OBJECTS=ParseTools.o QuickString.o
 EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
 BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
@@ -42,5 +42,6 @@ clean:
 	       $(OBJ_DIR)/ContextSample.o \
 	       $(OBJ_DIR)/ContextMerge.o \
 	       $(OBJ_DIR)/ContextJaccard.o \
+	       $(OBJ_DIR)/ContextFisher.o \

-.PHONY: clean
\ No newline at end of file
+.PHONY: clean