Commit 762deee0 authored by Brent Pedersen's avatar Brent Pedersen
Browse files

Fisher's Exact Test

parent b877b350
......@@ -42,6 +42,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/getOverlap \
$(SRC_DIR)/groupBy \
$(SRC_DIR)/intersectFile \
$(SRC_DIR)/fisher \
$(SRC_DIR)/jaccard \
$(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \
......@@ -212,4 +213,4 @@ autoversion:
echo "#define VERSION_GIT_H" >> $(VERSION_FILE) ; \
echo "#define VERSION_GIT \"$${DETECTED_VERSION}\"" >> $(VERSION_FILE) ; \
echo "#endif /* VERSION_GIT_H */" >> $(VERSION_FILE) ; \
fi )
\ No newline at end of file
fi )
......@@ -28,6 +28,7 @@ The full list of `bedtools` sub-commands.
tools/coverage
tools/expand
tools/flank
tools/fisher
tools/genomecov
tools/getfasta
tools/groupby
......@@ -56,4 +57,4 @@ The full list of `bedtools` sub-commands.
tools/window
\ No newline at end of file
.. _fisher:
########
*fisher*
########
Perform fisher's exact test on the non/overlap between 2 files.
|
Traditionally, in order to test whether 2 sets of intervals are related
spatially, we resort to shuffling the genome and checking the simulated
(shuffled) versus the observed. We can do the same analytically for many
scenarios using
`Fisher's Exact Test`_ .
Given a pair of input files `-a` and `-b` in the usual BedTools parlance:
.. code-block:: bash
$ cat a.bed
chr1 10 20
chr1 30 40
$ cat b.bed
chr1 15 25
And a genome of 500 bases:
.. code-block:: bash
$ echo -e "chr1\t500" > t.genome
We may wish to know **if the amount of overlap between the 2 sets of intervals is
more than we would expect given their coverage and the size of the genome**. We
can do this with ``fisher`` as:
.. code-block:: bash
$ bedtools fisher -a a.bed -b b.bed -g t.genome
# Contingency Table
#_________________________________________
# | not in -b | in -b |
# not in -a | 475 | 15 |
# in -a | 5 | 5 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1.00000 0.00001 0.00001 31.667
Where we can see the constructed contingency table and the pvalues for left, right
and two-tail tests.
From here, we can say that given **500 bases** of genome, it is unlikely that a region of
20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions
were randomly distributed. *Consult your statistician for a more precise definition*.
The above was highly significant, but if our genome were only **50 bases**:
.. code-block:: bash
$ echo -e "chr1\t50" > t.genome
$ bedtools fisher -a a.bed -b b.bed -g t.genome
# Contingency Table
#_________________________________________
# | not in -b | in -b |
# not in -a | 25 | 15 |
# in -a | 5 | 5 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
0.86011 0.35497 0.49401 1.667
We can see that neither tail is significant. Intuitively, this makes sense;
if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals
within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap.
.. note::
The ``fisher`` tool requires that your data is pre-sorted by chromosome and
then by start position (e.g., ``sort -k1,1 -k2,2n in.bed > in.sorted.bed``
for BED files).
This uses Heng Li's implementation of Fisher's exact test in kfunc.c.
.. seealso::
:doc:`../tools/jaccard`
:doc:`../tools/reldist`
:doc:`../tools/intersect`
===============================
Usage and option summary
===============================
**Usage**:
::
bedtools fisher [OPTIONS] -a <BED/GFF/VCF> -b <BED/GFF/VCF> -g <genome>
=========================== =========================================================================================================================================================
Option Description
=========================== =========================================================================================================================================================
**-a** BED/GFF/VCF file A. Each feature in A is compared to B in search of overlaps. Use "stdin" if passing A with a UNIX pipe.
**-b** BED/GFF/VCF file B. Use "stdin" if passing B with a UNIX pipe.
**-g** genome file listing chromosome size.
**-f** Minimum overlap required as a fraction of A. Default is 1E-9 (i.e. 1bp).
**-r** Require that the fraction of overlap be reciprocal for A and B. In other words, if -f is 0.90 and -r is used, this requires that B overlap at least 90% of A and that A also overlaps at least 90% of B.
**-s** Force "strandedness". That is, only report hits in B that overlap A on the same strand. By default, overlaps are reported without respect to strand.
**-S** Require different strandedness. That is, only report hits in B that overlap A on the _opposite_ strand. By default, overlaps are reported without respect to strand.
**-split** Treat "split" BAM (i.e., having an "N" CIGAR operation) or BED12 entries as distinct BED intervals.
=========================== =========================================================================================================================================================
.. _Fisher's Exact Test: http://en.wikipedia.org/wiki/Fisher's_exact_test
......@@ -54,6 +54,7 @@ int getoverlap_main(int argc, char* argv[]);//
int groupby_main(int argc, char* argv[]);//
int intersect_main(int argc, char* argv[]); //
int jaccard_main(int argc, char* argv[]); //
int fisher_main(int argc, char* argv[]); //
int links_main(int argc, char* argv[]);//
int maskfastafrombed_main(int argc, char* argv[]);//
int map_main(int argc, char* argv[]); //
......@@ -131,6 +132,7 @@ int main(int argc, char *argv[])
// statistics tools
else if (sub_cmd == "jaccard") return jaccard_main(argc-1, argv+1);
else if (sub_cmd == "reldist") return reldist_main(argc-1, argv+1);
else if (sub_cmd == "fisher") return fisher_main(argc-1, argv+1);
// misc. tools
else if (sub_cmd == "overlap") return getoverlap_main(argc-1, argv+1);
......
#include "Fisher.h"
#include "BlockMgr.h"
#include "NewChromsweep.h"
#include "kfunc.c"
Fisher::Fisher(ContextFisher *context)
: _context(context),
_intersectionVal(0),
_unionVal(0),
_numIntersections(0),
_queryLen(0),
_dbLen(0)
{
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
}
Fisher::~Fisher(void) {
delete _blockMgr;
_blockMgr = NULL;
}
bool Fisher::calculate() {
if (!getFisher()) {
return false;
}
// header
cout << "# Contingency Table" << endl;
// for fisher's exact test, we need the contingency table
// XXXXXXXX | not in A | in A
// not in B | n11: in neither | n12: only in A
// in B | n21: only in B | n22: in A & B
//
double left, right, two;
long long genomeSize = _context->getGenomeFile()->getGenomeSize();
// bases covered by neither a nor b
long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal;
// bases covered only by -a
long long n12 = _queryLen - _intersectionVal;
// bases covered only by -b
long long n21 = _dbLen - _intersectionVal;
// bases covered by both
long long n22 = _intersectionVal;
printf("#_________________________________________\n");
printf("# | %-12s | %-12s |\n", "not in -b", "in -b");
printf("# not in -a | %-12lld | %-12lld |\n", n11, n12);
printf("# in -a | %-12lld | %-12lld |\n", n21, n22);
printf("#_________________________________________\n");
kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two);
double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22);
printf("# p-values for fisher's exact test\n");
printf("left\tright\ttwo-tail\tratio\n");
printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio);
//kt_fisher_exact(50010000, 10000000, 15000000, 3000000, &left, &right, &two);
return true;
}
bool Fisher::getFisher() {
NewChromSweep sweep(_context);
if (!sweep.init()) {
return false;
}
RecordKeyList hitSet;
while (sweep.next(hitSet)) {
if (_context->getObeySplits()) {
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
_intersectionVal += getTotalIntersection(&resultSet);
} else {
_intersectionVal += getTotalIntersection(&hitSet);
}
}
sweep.closeOut();
_queryLen = sweep.getQueryTotalRecordLength();
_dbLen = sweep.getDatabaseTotalRecordLength();
_unionVal = _queryLen + _dbLen;
return true;
}
unsigned long Fisher::getTotalIntersection(RecordKeyList *recList)
{
unsigned long intersection = 0;
const Record *key = recList->getKey();
int keyStart = key->getStartPos();
int keyEnd = key->getEndPos();
int hitIdx = 0;
for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) {
const Record *currRec = iter->value();
int maxStart = max(currRec->getStartPos(), keyStart);
int minEnd = min(currRec->getEndPos(), keyEnd);
if (_context->getObeySplits()) {
intersection += _blockMgr->getOverlapBases(hitIdx);
hitIdx++;
} else {
intersection += (unsigned long)(minEnd - maxStart);
}
}
_numIntersections += (int)recList->size();
return intersection;
}
#ifndef FISHER_H
#define FISHER_H
#include "ContextFisher.h"
class BlockMgr;
class Fisher {
public:
Fisher(ContextFisher *context);
~Fisher();
bool calculate();
private:
ContextFisher *_context;
BlockMgr *_blockMgr;
unsigned long _intersectionVal;
unsigned long _unionVal;
int _numIntersections;
unsigned long _queryLen;
unsigned long _dbLen;
bool getFisher();
unsigned long getTotalIntersection(RecordKeyList *hits);
};
#endif /* FISHER_H */
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I . \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= fisherMain.cpp Fisher.cpp Fisher.h kfunc.c
OBJECTS= fisherMain.o Fisher.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= Fisher
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/fisherMain.o $(OBJ_DIR)/Fisher.o
.PHONY: clean
#include "Fisher.h"
#include "version.h"
using namespace std;
#define PROGRAM_NAME "bedtools fisher"
void fisher_help(void);
int fisher_main(int argc, char* argv[]) {
ContextFisher *context = new ContextFisher();
if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
if (!context->getErrorMsg().empty()) {
cerr << context->getErrorMsg() << endl;
}
fisher_help();
delete context;
return 0;
}
Fisher *fisher = new Fisher(context);
bool retVal = fisher->calculate();
delete fisher;
delete context;
return retVal ? 0 : 1;
}
void fisher_help(void) {
cerr << "\nTool: bedtools fisher (aka fisher)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Calculate Fisher statistic b/w two feature files."
<< endl
<< "\t Fisher is the length of the intersection over the union."
<< endl
<< "\t Values range from 0 (no intersection) to 1 (self intersection)."
<< endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl;
cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl;
cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl;
cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl;
cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl;
cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl;
cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl;
cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl;
cerr << "\t\tthat are on the same strand." << endl;
cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-S\t" << "Force merge for one specific strand only." << endl;
cerr << "\t\t" << "Follow with + or - to force merge from only" << endl;
cerr << "\t\t" << "the forward or reverse strand, respectively." << endl;
cerr << "\t\t" << "- By default, merging is done without respect to strand." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Input files must be sorted by chrom, then start position."
<< endl << endl;
// end the program here
exit(1);
}
#include <math.h>
#include <stdlib.h>
/* Log gamma function
* \log{\Gamma(z)}
* AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
*/
double kf_lgamma(double z)
{
double x = 0;
x += 0.1659470187408462e-06 / (z+7);
x += 0.9934937113930748e-05 / (z+6);
x -= 0.1385710331296526 / (z+5);
x += 12.50734324009056 / (z+4);
x -= 176.6150291498386 / (z+3);
x += 771.3234287757674 / (z+2);
x -= 1259.139216722289 / (z+1);
x += 676.5203681218835 / z;
x += 0.9999999999995183;
return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
}
/* complementary error function
* \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
* AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
*/
double kf_erfc(double x)
{
const double p0 = 220.2068679123761;
const double p1 = 221.2135961699311;
const double p2 = 112.0792914978709;
const double p3 = 33.912866078383;
const double p4 = 6.37396220353165;
const double p5 = .7003830644436881;
const double p6 = .03526249659989109;
const double q0 = 440.4137358247522;
const double q1 = 793.8265125199484;
const double q2 = 637.3336333788311;
const double q3 = 296.5642487796737;
const double q4 = 86.78073220294608;
const double q5 = 16.06417757920695;
const double q6 = 1.755667163182642;
const double q7 = .08838834764831844;
double expntl, z, p;
z = fabs(x) * M_SQRT2;
if (z > 37.) return x > 0.? 0. : 2.;
expntl = exp(z * z * - .5);
if (z < 10. / M_SQRT2) // for small z
p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0)
/ (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0);
else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65)))));
return x > 0.? 2. * p : 2. * (1. - p);
}
/* The following computes regularized incomplete gamma functions.
* Formulas are taken from Wiki, with additional input from Numerical
* Recipes in C (for modified Lentz's algorithm) and AS245
* (http://lib.stat.cmu.edu/apstat/245).
*
* A good online calculator is available at:
*
* http://www.danielsoper.com/statcalc/calc23.aspx
*
* It calculates upper incomplete gamma function, which equals
* kf_gammaq(s,z)*tgamma(s).
*/
#define KF_GAMMA_EPS 1e-14
#define KF_TINY 1e-290
// regularized lower incomplete gamma function, by series expansion
static double _kf_gammap(double s, double z)
{
double sum, x;
long long k;
for (k = 1, sum = x = 1.; k < 100; ++k) {
sum += (x *= z / (s + k));
if (x / sum < KF_GAMMA_EPS) break;
}
return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum));
}
// regularized upper incomplete gamma function, by continued fraction
static double _kf_gammaq(double s, double z)
{
long long j;
double C, D, f;
f = 1. + z - s; C = f; D = 0.;
// Modified Lentz's algorithm for computing continued fraction
// See Numerical Recipes in C, 2nd edition, section 5.2
for (j = 1; j < 100; ++j) {
double a = j * (s - j), b = (j<<1) + 1 + z - s, d;
D = b + a * D;
if (D < KF_TINY) D = KF_TINY;
C = b + a / C;
if (C < KF_TINY) C = KF_TINY;
D = 1. / D;
d = C * D;
f *= d;
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
}
return exp(s * log(z) - z - kf_lgamma(s) - log(f));
}
double kf_gammap(double s, double z)
{
return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z);
}
double kf_gammaq(double s, double z)
{
return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z);
}
/* Regularized incomplete beta function. The method is taken from
* Numerical Recipe in C, 2nd edition, section 6.4. The following web
* page calculates the incomplete beta function, which equals
* kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
*
* http://www.danielsoper.com/statcalc/calc36.aspx
*/
static double kf_betai_aux(double a, double b, double x)
{
double C, D, f;
long long j;
if (x == 0.) return 0.;
if (x == 1.) return 1.;
f = 1.; C = f; D = 0.;
// Modified Lentz's algorithm for computing continued fraction
for (j = 1; j < 200; ++j) {
double aa, d;
long long m = j>>1;
aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1))
: m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
D = 1. + aa * D;
if (D < KF_TINY) D = KF_TINY;
C = 1. + aa / C;
if (C < KF_TINY) C = KF_TINY;
D = 1. / D;
d = C * D;
f *= d;
if (fabs(d - 1.) < KF_GAMMA_EPS) break;
}
return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f;
}
double kf_betai(double a, double b, double x)
{
return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x);
}
#ifdef KF_MAIN
#include <stdio.h>
long long main(long long argc, char *argv[])
{
double x = 5.5, y = 3;
double a, b;
printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x));
printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y));
a = 2; b = 2; x = 0.5;
printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b)));
return 0;
}
#endif
// log\binom{n}{k}
static double lbinom(long long n, long long k)
{
if (k == 0 || n == k) return 0;
return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1);
}
// n11 n12 | n1_
// n21 n22 | n2_
//-----------+----
// n_1 n_2 | n
// hypergeometric distribution
static double hypergeo(long long n11, long long n1_, long long n_1, long long n)
{
return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1));