From 762deee01a588d596d766469c47a2cc76c5c6b07 Mon Sep 17 00:00:00 2001 From: Brent Pedersen <bpederse@gmail.com> Date: Thu, 3 Jul 2014 13:27:07 -0600 Subject: [PATCH] Fisher's Exact Test --- Makefile | 3 +- docs/content/bedtools-suite.rst | 3 +- docs/content/tools/fisher.rst | 119 +++++++++++++ src/bedtools.cpp | 2 + src/fisher/Fisher.cpp | 114 ++++++++++++ src/fisher/Fisher.h | 30 ++++ src/fisher/Makefile | 49 ++++++ src/fisher/fisherMain.cpp | 72 ++++++++ src/fisher/kfunc.c | 253 +++++++++++++++++++++++++++ src/utils/Contexts/ContextFisher.cpp | 115 ++++++++++++ src/utils/Contexts/ContextFisher.h | 26 +++ src/utils/Contexts/Makefile | 7 +- 12 files changed, 788 insertions(+), 5 deletions(-) create mode 100644 docs/content/tools/fisher.rst create mode 100644 src/fisher/Fisher.cpp create mode 100644 src/fisher/Fisher.h create mode 100644 src/fisher/Makefile create mode 100644 src/fisher/fisherMain.cpp create mode 100644 src/fisher/kfunc.c create mode 100644 src/utils/Contexts/ContextFisher.cpp create mode 100644 src/utils/Contexts/ContextFisher.h diff --git a/Makefile b/Makefile index bbbc4379..420b05b8 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/getOverlap \ $(SRC_DIR)/groupBy \ $(SRC_DIR)/intersectFile \ + $(SRC_DIR)/fisher \ $(SRC_DIR)/jaccard \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ @@ -212,4 +213,4 @@ autoversion: echo "#define VERSION_GIT_H" >> $(VERSION_FILE) ; \ echo "#define VERSION_GIT \"$${DETECTED_VERSION}\"" >> $(VERSION_FILE) ; \ echo "#endif /* VERSION_GIT_H */" >> $(VERSION_FILE) ; \ - fi ) \ No newline at end of file + fi ) diff --git a/docs/content/bedtools-suite.rst b/docs/content/bedtools-suite.rst index 936d8ad9..c8f8aafb 100755 --- a/docs/content/bedtools-suite.rst +++ b/docs/content/bedtools-suite.rst @@ -28,6 +28,7 @@ The full list of `bedtools` sub-commands. tools/coverage tools/expand tools/flank + tools/fisher tools/genomecov tools/getfasta tools/groupby @@ -56,4 +57,4 @@ The full list of `bedtools` sub-commands. tools/window - \ No newline at end of file + diff --git a/docs/content/tools/fisher.rst b/docs/content/tools/fisher.rst new file mode 100644 index 00000000..efc6814a --- /dev/null +++ b/docs/content/tools/fisher.rst @@ -0,0 +1,119 @@ +.. _fisher: + +######## +*fisher* +######## + +Perform fisher's exact test on the non/overlap between 2 files. + +| + +Traditionally, in order to test whether 2 sets of intervals are related +spatially, we resort to shuffling the genome and checking the simulated +(shuffled) versus the observed. We can do the same analytically for many +scenarios using +`Fisher's Exact Test`_ . + + + +Given a pair of input files `-a` and `-b` in the usual BedTools parlance: + +.. code-block:: bash + + $ cat a.bed + chr1 10 20 + chr1 30 40 + + $ cat b.bed + chr1 15 25 + +And a genome of 500 bases: + +.. code-block:: bash + + $ echo -e "chr1\t500" > t.genome + +We may wish to know **if the amount of overlap between the 2 sets of intervals is +more than we would expect given their coverage and the size of the genome**. We +can do this with ``fisher`` as: + +.. code-block:: bash + + $ bedtools fisher -a a.bed -b b.bed -g t.genome + # Contingency Table + #_________________________________________ + # | not in -b | in -b | + # not in -a | 475 | 15 | + # in -a | 5 | 5 | + #_________________________________________ + # p-values for fisher's exact test + left right two-tail ratio + 1.00000 0.00001 0.00001 31.667 + + +Where we can see the constructed contingency table and the pvalues for left, right +and two-tail tests. +From here, we can say that given **500 bases** of genome, it is unlikely that a region of +20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions +were randomly distributed. *Consult your statistician for a more precise definition*. + +The above was highly significant, but if our genome were only **50 bases**: + +.. code-block:: bash + + $ echo -e "chr1\t50" > t.genome + $ bedtools fisher -a a.bed -b b.bed -g t.genome + # Contingency Table + #_________________________________________ + # | not in -b | in -b | + # not in -a | 25 | 15 | + # in -a | 5 | 5 | + #_________________________________________ + # p-values for fisher's exact test + left right two-tail ratio + 0.86011 0.35497 0.49401 1.667 + + +We can see that neither tail is significant. Intuitively, this makes sense; +if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals +within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap. + + +.. note:: + + The ``fisher`` tool requires that your data is pre-sorted by chromosome and + then by start position (e.g., ``sort -k1,1 -k2,2n in.bed > in.sorted.bed`` + for BED files). + + This uses Heng Li's implementation of Fisher's exact test in kfunc.c. + +.. seealso:: + + :doc:`../tools/jaccard` + :doc:`../tools/reldist` + :doc:`../tools/intersect` + + +=============================== +Usage and option summary +=============================== +**Usage**: +:: + + bedtools fisher [OPTIONS] -a <BED/GFF/VCF> -b <BED/GFF/VCF> -g <genome> + + +=========================== ========================================================================================================================================================= +Option Description +=========================== ========================================================================================================================================================= +**-a** BED/GFF/VCF file A. Each feature in A is compared to B in search of overlaps. Use "stdin" if passing A with a UNIX pipe. +**-b** BED/GFF/VCF file B. Use "stdin" if passing B with a UNIX pipe. +**-g** genome file listing chromosome size. +**-f** Minimum overlap required as a fraction of A. Default is 1E-9 (i.e. 1bp). +**-r** Require that the fraction of overlap be reciprocal for A and B. In other words, if -f is 0.90 and -r is used, this requires that B overlap at least 90% of A and that A also overlaps at least 90% of B. +**-s** Force "strandedness". That is, only report hits in B that overlap A on the same strand. By default, overlaps are reported without respect to strand. +**-S** Require different strandedness. That is, only report hits in B that overlap A on the _opposite_ strand. By default, overlaps are reported without respect to strand. +**-split** Treat "split" BAM (i.e., having an "N" CIGAR operation) or BED12 entries as distinct BED intervals. +=========================== ========================================================================================================================================================= + +.. _Fisher's Exact Test: http://en.wikipedia.org/wiki/Fisher's_exact_test diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 70c3e660..3185ec14 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -54,6 +54,7 @@ int getoverlap_main(int argc, char* argv[]);// int groupby_main(int argc, char* argv[]);// int intersect_main(int argc, char* argv[]); // int jaccard_main(int argc, char* argv[]); // +int fisher_main(int argc, char* argv[]); // int links_main(int argc, char* argv[]);// int maskfastafrombed_main(int argc, char* argv[]);// int map_main(int argc, char* argv[]); // @@ -131,6 +132,7 @@ int main(int argc, char *argv[]) // statistics tools else if (sub_cmd == "jaccard") return jaccard_main(argc-1, argv+1); else if (sub_cmd == "reldist") return reldist_main(argc-1, argv+1); + else if (sub_cmd == "fisher") return fisher_main(argc-1, argv+1); // misc. tools else if (sub_cmd == "overlap") return getoverlap_main(argc-1, argv+1); diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp new file mode 100644 index 00000000..cb0cbcac --- /dev/null +++ b/src/fisher/Fisher.cpp @@ -0,0 +1,114 @@ + +#include "Fisher.h" +#include "BlockMgr.h" +#include "NewChromsweep.h" +#include "kfunc.c" + +Fisher::Fisher(ContextFisher *context) +: _context(context), + _intersectionVal(0), + _unionVal(0), + _numIntersections(0), + _queryLen(0), + _dbLen(0) +{ + _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal()); +} + +Fisher::~Fisher(void) { + delete _blockMgr; + _blockMgr = NULL; +} + +bool Fisher::calculate() { + + if (!getFisher()) { + return false; + } + + // header + cout << "# Contingency Table" << endl; + + // for fisher's exact test, we need the contingency table + // XXXXXXXX | not in A | in A + // not in B | n11: in neither | n12: only in A + // in B | n21: only in B | n22: in A & B + // + double left, right, two; + + long long genomeSize = _context->getGenomeFile()->getGenomeSize(); + // bases covered by neither a nor b + long long n11 = genomeSize - _queryLen - _dbLen + _intersectionVal; + // bases covered only by -a + long long n12 = _queryLen - _intersectionVal; + // bases covered only by -b + long long n21 = _dbLen - _intersectionVal; + // bases covered by both + long long n22 = _intersectionVal; + + printf("#_________________________________________\n"); + printf("# | %-12s | %-12s |\n", "not in -b", "in -b"); + printf("# not in -a | %-12lld | %-12lld |\n", n11, n12); + printf("# in -a | %-12lld | %-12lld |\n", n21, n22); + printf("#_________________________________________\n"); + + kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two); + double ratio = ((double)n11 / (double)n12) / ((double)n21 / (double)n22); + + printf("# p-values for fisher's exact test\n"); + printf("left\tright\ttwo-tail\tratio\n"); + printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio); + + //kt_fisher_exact(50010000, 10000000, 15000000, 3000000, &left, &right, &two); + + return true; +} + +bool Fisher::getFisher() { + NewChromSweep sweep(_context); + if (!sweep.init()) { + return false; + } + RecordKeyList hitSet; + while (sweep.next(hitSet)) { + if (_context->getObeySplits()) { + RecordKeyList keySet(hitSet.getKey()); + RecordKeyList resultSet(hitSet.getKey()); + _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); + _intersectionVal += getTotalIntersection(&resultSet); + } else { + _intersectionVal += getTotalIntersection(&hitSet); + } + } + + sweep.closeOut(); + _queryLen = sweep.getQueryTotalRecordLength(); + _dbLen = sweep.getDatabaseTotalRecordLength(); + + _unionVal = _queryLen + _dbLen; + return true; +} + +unsigned long Fisher::getTotalIntersection(RecordKeyList *recList) +{ + unsigned long intersection = 0; + const Record *key = recList->getKey(); + int keyStart = key->getStartPos(); + int keyEnd = key->getEndPos(); + + int hitIdx = 0; + for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) { + const Record *currRec = iter->value(); + int maxStart = max(currRec->getStartPos(), keyStart); + int minEnd = min(currRec->getEndPos(), keyEnd); + if (_context->getObeySplits()) { + intersection += _blockMgr->getOverlapBases(hitIdx); + hitIdx++; + } else { + intersection += (unsigned long)(minEnd - maxStart); + } + } + _numIntersections += (int)recList->size(); + return intersection; +} + diff --git a/src/fisher/Fisher.h b/src/fisher/Fisher.h new file mode 100644 index 00000000..c875273a --- /dev/null +++ b/src/fisher/Fisher.h @@ -0,0 +1,30 @@ +#ifndef FISHER_H +#define FISHER_H + +#include "ContextFisher.h" + +class BlockMgr; + +class Fisher { + +public: + + Fisher(ContextFisher *context); + ~Fisher(); + + bool calculate(); + +private: + ContextFisher *_context; + BlockMgr *_blockMgr; + unsigned long _intersectionVal; + unsigned long _unionVal; + int _numIntersections; + unsigned long _queryLen; + unsigned long _dbLen; + bool getFisher(); + + unsigned long getTotalIntersection(RecordKeyList *hits); +}; + +#endif /* FISHER_H */ diff --git a/src/fisher/Makefile b/src/fisher/Makefile new file mode 100644 index 00000000..b94a5097 --- /dev/null +++ b/src/fisher/Makefile @@ -0,0 +1,49 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- + +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/RecordOutputMgr/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ + -I$(UTILITIES_DIR)/NewChromsweep \ + -I$(UTILITIES_DIR)/VectorOps \ + -I . \ + -I$(UTILITIES_DIR)/BinTree \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= fisherMain.cpp Fisher.cpp Fisher.h kfunc.c +OBJECTS= fisherMain.o Fisher.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= Fisher + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/fisherMain.o $(OBJ_DIR)/Fisher.o + +.PHONY: clean diff --git a/src/fisher/fisherMain.cpp b/src/fisher/fisherMain.cpp new file mode 100644 index 00000000..755dc97a --- /dev/null +++ b/src/fisher/fisherMain.cpp @@ -0,0 +1,72 @@ +#include "Fisher.h" +#include "version.h" + +using namespace std; + +#define PROGRAM_NAME "bedtools fisher" + +void fisher_help(void); + +int fisher_main(int argc, char* argv[]) { + + ContextFisher *context = new ContextFisher(); + if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { + if (!context->getErrorMsg().empty()) { + cerr << context->getErrorMsg() << endl; + } + fisher_help(); + delete context; + return 0; + } + Fisher *fisher = new Fisher(context); + + bool retVal = fisher->calculate(); + delete fisher; + delete context; + return retVal ? 0 : 1; +} + + +void fisher_help(void) { + + cerr << "\nTool: bedtools fisher (aka fisher)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Calculate Fisher statistic b/w two feature files." + << endl + << "\t Fisher is the length of the intersection over the union." + << endl + << "\t Values range from 0 (no intersection) to 1 (self intersection)." + << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl; + + cerr << "Options: " << endl; + + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; + cerr << "\t\tthat are on the same strand." << endl; + cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; + + cerr << "\t-S\t" << "Force merge for one specific strand only." << endl; + cerr << "\t\t" << "Follow with + or - to force merge from only" << endl; + cerr << "\t\t" << "the forward or reverse strand, respectively." << endl; + cerr << "\t\t" << "- By default, merging is done without respect to strand." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) Input files must be sorted by chrom, then start position." + << endl << endl; + + // end the program here + exit(1); + +} diff --git a/src/fisher/kfunc.c b/src/fisher/kfunc.c new file mode 100644 index 00000000..e735e636 --- /dev/null +++ b/src/fisher/kfunc.c @@ -0,0 +1,253 @@ +#include <math.h> +#include <stdlib.h> + +/* Log gamma function + * \log{\Gamma(z)} + * AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245 + */ +double kf_lgamma(double z) +{ + double x = 0; + x += 0.1659470187408462e-06 / (z+7); + x += 0.9934937113930748e-05 / (z+6); + x -= 0.1385710331296526 / (z+5); + x += 12.50734324009056 / (z+4); + x -= 176.6150291498386 / (z+3); + x += 771.3234287757674 / (z+2); + x -= 1259.139216722289 / (z+1); + x += 676.5203681218835 / z; + x += 0.9999999999995183; + return log(x) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5); +} + +/* complementary error function + * \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt + * AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66 + */ +double kf_erfc(double x) +{ + const double p0 = 220.2068679123761; + const double p1 = 221.2135961699311; + const double p2 = 112.0792914978709; + const double p3 = 33.912866078383; + const double p4 = 6.37396220353165; + const double p5 = .7003830644436881; + const double p6 = .03526249659989109; + const double q0 = 440.4137358247522; + const double q1 = 793.8265125199484; + const double q2 = 637.3336333788311; + const double q3 = 296.5642487796737; + const double q4 = 86.78073220294608; + const double q5 = 16.06417757920695; + const double q6 = 1.755667163182642; + const double q7 = .08838834764831844; + double expntl, z, p; + z = fabs(x) * M_SQRT2; + if (z > 37.) return x > 0.? 0. : 2.; + expntl = exp(z * z * - .5); + if (z < 10. / M_SQRT2) // for small z + p = expntl * ((((((p6 * z + p5) * z + p4) * z + p3) * z + p2) * z + p1) * z + p0) + / (((((((q7 * z + q6) * z + q5) * z + q4) * z + q3) * z + q2) * z + q1) * z + q0); + else p = expntl / 2.506628274631001 / (z + 1. / (z + 2. / (z + 3. / (z + 4. / (z + .65))))); + return x > 0.? 2. * p : 2. * (1. - p); +} + +/* The following computes regularized incomplete gamma functions. + * Formulas are taken from Wiki, with additional input from Numerical + * Recipes in C (for modified Lentz's algorithm) and AS245 + * (http://lib.stat.cmu.edu/apstat/245). + * + * A good online calculator is available at: + * + * http://www.danielsoper.com/statcalc/calc23.aspx + * + * It calculates upper incomplete gamma function, which equals + * kf_gammaq(s,z)*tgamma(s). + */ + +#define KF_GAMMA_EPS 1e-14 +#define KF_TINY 1e-290 + +// regularized lower incomplete gamma function, by series expansion +static double _kf_gammap(double s, double z) +{ + double sum, x; + long long k; + for (k = 1, sum = x = 1.; k < 100; ++k) { + sum += (x *= z / (s + k)); + if (x / sum < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(sum)); +} +// regularized upper incomplete gamma function, by continued fraction +static double _kf_gammaq(double s, double z) +{ + long long j; + double C, D, f; + f = 1. + z - s; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + // See Numerical Recipes in C, 2nd edition, section 5.2 + for (j = 1; j < 100; ++j) { + double a = j * (s - j), b = (j<<1) + 1 + z - s, d; + D = b + a * D; + if (D < KF_TINY) D = KF_TINY; + C = b + a / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(s * log(z) - z - kf_lgamma(s) - log(f)); +} + +double kf_gammap(double s, double z) +{ + return z <= 1. || z < s? _kf_gammap(s, z) : 1. - _kf_gammaq(s, z); +} + +double kf_gammaq(double s, double z) +{ + return z <= 1. || z < s? 1. - _kf_gammap(s, z) : _kf_gammaq(s, z); +} + +/* Regularized incomplete beta function. The method is taken from + * Numerical Recipe in C, 2nd edition, section 6.4. The following web + * page calculates the incomplete beta function, which equals + * kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b): + * + * http://www.danielsoper.com/statcalc/calc36.aspx + */ +static double kf_betai_aux(double a, double b, double x) +{ + double C, D, f; + long long j; + if (x == 0.) return 0.; + if (x == 1.) return 1.; + f = 1.; C = f; D = 0.; + // Modified Lentz's algorithm for computing continued fraction + for (j = 1; j < 200; ++j) { + double aa, d; + long long m = j>>1; + aa = (j&1)? -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1)) + : m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m)); + D = 1. + aa * D; + if (D < KF_TINY) D = KF_TINY; + C = 1. + aa / C; + if (C < KF_TINY) C = KF_TINY; + D = 1. / D; + d = C * D; + f *= d; + if (fabs(d - 1.) < KF_GAMMA_EPS) break; + } + return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / f; +} +double kf_betai(double a, double b, double x) +{ + return x < (a + 1.) / (a + b + 2.)? kf_betai_aux(a, b, x) : 1. - kf_betai_aux(b, a, 1. - x); +} + +#ifdef KF_MAIN +#include <stdio.h> +long long main(long long argc, char *argv[]) +{ + double x = 5.5, y = 3; + double a, b; + printf("erfc(%lg): %lg, %lg\n", x, erfc(x), kf_erfc(x)); + printf("upper-gamma(%lg,%lg): %lg\n", x, y, kf_gammaq(y, x)*tgamma(y)); + a = 2; b = 2; x = 0.5; + printf("incomplete-beta(%lg,%lg,%lg): %lg\n", a, b, x, kf_betai(a, b, x) / exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b))); + return 0; +} +#endif + + +// log\binom{n}{k} +static double lbinom(long long n, long long k) +{ + if (k == 0 || n == k) return 0; + return lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1); +} + +// n11 n12 | n1_ +// n21 n22 | n2_ +//-----------+---- +// n_1 n_2 | n + +// hypergeometric distribution +static double hypergeo(long long n11, long long n1_, long long n_1, long long n) +{ + return exp(lbinom(n1_, n11) + lbinom(n-n1_, n_1-n11) - lbinom(n, n_1)); +} + +typedef struct { + long long n11, n1_, n_1, n; + double p; +} hgacc_t; + +// incremental version of hypergenometric distribution +static double hypergeo_acc(long long n11, long long n1_, long long n_1, long long n, hgacc_t *aux) +{ + if (n1_ || n_1 || n) { + aux->n11 = n11; aux->n1_ = n1_; aux->n_1 = n_1; aux->n = n; + } else { // then only n11 changed; the rest fixed + if (n11%11 && n11 + aux->n - aux->n1_ - aux->n_1) { + if (n11 == aux->n11 + 1) { // incremental + aux->p *= (double)(aux->n1_ - aux->n11) / n11 + * (aux->n_1 - aux->n11) / (n11 + aux->n - aux->n1_ - aux->n_1); + aux->n11 = n11; + return aux->p; + } + if (n11 == aux->n11 - 1) { // incremental + aux->p *= (double)aux->n11 / (aux->n1_ - n11) + * (aux->n11 + aux->n - aux->n1_ - aux->n_1) / (aux->n_1 - n11); + aux->n11 = n11; + return aux->p; + } + } + aux->n11 = n11; + } + aux->p = hypergeo(aux->n11, aux->n1_, aux->n_1, aux->n); + return aux->p; +} + +double kt_fisher_exact(long long n11, long long n12, long long n21, long long n22, double *_left, double *_right, double *two) +{ + long long i, j, max, min; + double p, q, left, right; + hgacc_t aux; + long long n1_, n_1, n; + + n1_ = n11 + n12; n_1 = n11 + n21; n = n11 + n12 + n21 + n22; // calculate n1_, n_1 and n + max = (n_1 < n1_) ? n_1 : n1_; // max n11, for right tail + min = n1_ + n_1 - n; // not sure why n11-n22 is used instead of min(n_1,n1_) + if (min < 0) min = 0; // min n11, for left tail + *two = *_left = *_right = 1.; + if (min == max) return 1.; // no need to do test + q = hypergeo_acc(n11, n1_, n_1, n, &aux); // the probability of the current table + // left tail + p = hypergeo_acc(min, 0, 0, 0, &aux); + for (left = 0., i = min + 1; p < 0.99999999 * q && i<=max; ++i) // loop until underflow + left += p, p = hypergeo_acc(i, 0, 0, 0, &aux); + --i; + if (p < 1.00000001 * q) left += p; + else --i; + // right tail + p = hypergeo_acc(max, 0, 0, 0, &aux); + for (right = 0., j = max - 1; p < 0.99999999 * q && j>=0; --j) // loop until underflow + right += p, p = hypergeo_acc(j, 0, 0, 0, &aux); + ++j; + if (p < 1.00000001 * q) right += p; + else ++j; + // two-tail + *two = left + right; + if (*two > 1.) *two = 1.; + // adjust left and right + if (abs(i - n11) < abs(j - n11)) right = 1. - left + q; + else left = 1.0 - right + q; + *_left = left; *_right = right; + return q; +} + + + diff --git a/src/utils/Contexts/ContextFisher.cpp b/src/utils/Contexts/ContextFisher.cpp new file mode 100644 index 00000000..90f07282 --- /dev/null +++ b/src/utils/Contexts/ContextFisher.cpp @@ -0,0 +1,115 @@ +/* + * ContextFisher.cpp + * + */ + +#include "ContextFisher.h" + +ContextFisher::ContextFisher() { + setSortedInput(true); + setUseMergedIntervals(true); + +} + +ContextFisher::~ContextFisher() { +} +bool ContextFisher::parseCmdArgs(int argc, char **argv, int skipFirstArgs) +{ + _argc = argc; + _argv = argv; + _skipFirstArgs = skipFirstArgs; + + if (_argc < 2) { + setShowHelp(true); + return false; + } + + setProgram(_programNames[argv[0]]); + + _argsProcessed.resize(_argc - _skipFirstArgs, false); + + for (_i=_skipFirstArgs; _i < argc; _i++) { + if (isUsed(_i - _skipFirstArgs)) { + continue; + } + else if (strcmp(_argv[_i], "-s") == 0) { + if (!handle_s()) return false; + } + else if (strcmp(_argv[_i], "-S") == 0) { + if (!handle_S()) return false; + } + if (strcmp(_argv[_i], "-g") == 0) { + if (!handle_g()) return false; + } + } + return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); +} + +bool ContextFisher::isValidState() +{ + if (!ContextIntersect::isValidState()) { + return false; + } + // Tests for stranded merge + // + if (_desiredStrand != FileRecordMergeMgr::ANY_STRAND) { // requested stranded merge + for (int i=0; i < getNumInputFiles(); i++) { + // make sure file has strand. + if (!getFile(i)->recordsHaveStrand()) { + _errorMsg = "\n***** ERROR: stranded merge requested, but input file "; + _errorMsg += getInputFileName(i); + _errorMsg += " does not have strands. *****"; + return false; + } + //make sure file is not VCF. + if (getFile(1)->getFileType() == FileRecordTypeChecker::VCF_FILE_TYPE) { + _errorMsg = "\n***** ERROR: stranded merge not supported for VCF file "; + _errorMsg += getInputFileName(i); + _errorMsg += ". *****"; + return false; + } + } + } + if (_genomeFile == NULL){ + _errorMsg = "\nERROR*****: specify -g genome file*****\n"; + return false; + } + //column operations not allowed with BAM input + if (hasColumnOpsMethods() && + getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + _errorMsg = "\n***** ERROR: stranded merge not supported for VCF files. *****"; + return false; + } + return true; +} + +bool ContextFisher::handle_s() { + setSameStrand(true); + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_EITHER; + markUsed(_i - _skipFirstArgs); + return true; +} + +bool ContextFisher::handle_S() { + if ((_i+1) < _argc) { + bool validChar = false; + if (_argv[_i+1][0] == '+') { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_FORWARD; + validChar = true; + } else if (_argv[_i+1][0] == '-') { + validChar = true; + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_REVERSE; + } + if (validChar) { + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + setSameStrand(true); + return true; + } + } + _errorMsg = "\n***** ERROR: -S option must be followed by + or -. *****"; + return false; +} + + diff --git a/src/utils/Contexts/ContextFisher.h b/src/utils/Contexts/ContextFisher.h new file mode 100644 index 00000000..49e7dcb2 --- /dev/null +++ b/src/utils/Contexts/ContextFisher.h @@ -0,0 +1,26 @@ +/* + * ContextFisher.h + * + * Created on: Apr 24, 2014 + * Author: nek3d + */ + +#ifndef CONTEXTFISHER_H_ +#define CONTEXTFISHER_H_ + +#include "ContextIntersect.h" +#include "GenomeFile.h" + +class ContextFisher : public ContextIntersect { +public: + ContextFisher(); + ~ContextFisher(); + virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); + virtual bool isValidState(); + +private: + bool handle_s(); + bool handle_S(); +}; + +#endif /* CONTEXTFISHER_H_ */ diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index 6cec043a..cbdae07e 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -20,8 +20,8 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ # define our source and object files # ---------------------------------- SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp \ - ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp ContextJaccard.h ContextJaccard.cpp -OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o ContextJaccard.o + ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp ContextJaccard.h ContextJaccard.cpp ContextFisher.h ContextFisher.cpp +OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o ContextJaccard.o ContextFisher.o _EXT_OBJECTS=ParseTools.o QuickString.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -42,5 +42,6 @@ clean: $(OBJ_DIR)/ContextSample.o \ $(OBJ_DIR)/ContextMerge.o \ $(OBJ_DIR)/ContextJaccard.o \ + $(OBJ_DIR)/ContextFisher.o \ -.PHONY: clean \ No newline at end of file +.PHONY: clean -- GitLab