From d6544e0e9dceab78257bc547f5dd57c6344dfb2c Mon Sep 17 00:00:00 2001 From: Brent Pedersen <bpederse@gmail.com> Date: Mon, 24 Nov 2014 16:56:57 -0700 Subject: [PATCH] change fisher to use numbers of intervals rather than bases of overlap --- docs/content/tools/fisher.rst | 49 ++-- src/fisher/Fisher.cpp | 48 +++- src/fisher/Fisher.h | 4 + src/fisher/fisherMain.cpp | 6 +- src/utils/Contexts/ContextFisher.cpp | 2 +- src/utils/NewChromsweep/NewChromsweep.cpp | 262 +++++++++++----------- src/utils/NewChromsweep/NewChromsweep.h | 6 + test/fisher/README.md | 20 ++ test/fisher/shuf.sh | 4 + test/fisher/sim.py | 27 +++ 10 files changed, 264 insertions(+), 164 deletions(-) create mode 100644 test/fisher/README.md create mode 100644 test/fisher/shuf.sh create mode 100644 test/fisher/sim.py diff --git a/docs/content/tools/fisher.rst b/docs/content/tools/fisher.rst index aa771c07..d027033a 100644 --- a/docs/content/tools/fisher.rst +++ b/docs/content/tools/fisher.rst @@ -15,7 +15,6 @@ scenarios using `Fisher's Exact Test`_ . - Given a pair of input files `-a` and `-b` in the usual BedTools parlance: .. code-block:: bash @@ -23,9 +22,11 @@ Given a pair of input files `-a` and `-b` in the usual BedTools parlance: $ cat a.bed chr1 10 20 chr1 30 40 + chr1 51 52 $ cat b.bed chr1 15 25 + chr1 51 52 And a genome of 500 bases: @@ -40,43 +41,55 @@ can do this with ``fisher`` as: .. code-block:: bash $ bedtools fisher -a a.bed -b b.bed -g t.genome - # Contingency Table + # Number of query intervals: 3 + # Number of db intervals: 2 + # Number of overlaps: 2 + # Number of possible intervals (estimated): 37 + # Contingency Table Of Counts + # phyper(2 - 1, 3, 37 - 3, 2, lower.tail=F) #_________________________________________ - # | not in -b | in -b | - # not in -a | 475 | 5 | - # in -a | 15 | 5 | + # | in -b | not in -b | + # in -a | 2 | 1 | + # not in -a | 0 | 34 | #_________________________________________ # p-values for fisher's exact test left right two-tail ratio - 1 1.3466e-05 1.3466e-05 31.667 + 1 0.0045045 0.0045045 inf + + Where we can see the constructed contingency table and the pvalues for left, right and two-tail tests. -From here, we can say that given **500 bases** of genome, it is unlikely that a region of -20 bases total from `-a` and 10 bases total from `-b` would share 5 bases if the regions -were randomly distributed. *Consult your statistician for a more precise definition*. +From here, we can say that given **500 bases** of genome, it is unlikely that we +would see **as many** overlaps as we do if the intervals from `a` and `b` were not +related. -The above was highly significant, but if our genome were only **50 bases**: +The above had a fairly low p-value (0.0045), but if our genome were only **60 bases**: .. code-block:: bash - $ echo -e "chr1\t50" > t.genome + $ echo -e "chr1\t60" > t.genome $ bedtools fisher -a a.bed -b b.bed -g t.genome - # Contingency Table + # Number of query intervals: 3 + # Number of db intervals: 2 + # Number of overlaps: 2 + # Number of possible intervals (estimated): 4 + # Contingency Table Of Counts + # phyper(2 - 1, 3, 4 - 3, 2, lower.tail=F) #_________________________________________ - # | not in -b | in -b | - # not in -a | 25 | 15 | - # in -a | 5 | 5 | + # | in -b | not in -b | + # in -a | 2 | 1 | + # not in -a | 0 | 1 | #_________________________________________ # p-values for fisher's exact test left right two-tail ratio - 0.86011 0.35497 0.49401 1.667 + 1 0.5 1 inf We can see that neither tail is significant. Intuitively, this makes sense; -if we randomly place 20 (from `-a`), and 10 (from `-b`) bases of intervals -within 50 bases, it doesn't seem unlikely that we'd see 5 bases of overlap. +if we randomly place 3 intervals (from `-a`), and 2 (from `-b`) intervals from +within 60 bases, it doesn't seem unlikely that we'd see 2 overlaps. .. note:: diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp index 9eca0771..4cbfa7af 100644 --- a/src/fisher/Fisher.cpp +++ b/src/fisher/Fisher.cpp @@ -2,6 +2,7 @@ #include "BlockMgr.h" #include "NewChromsweep.h" #include "kfunc.c" +#include <numeric> // std::accumulate Fisher::Fisher(ContextFisher *context) : _context(context), @@ -9,7 +10,11 @@ Fisher::Fisher(ContextFisher *context) _unionVal(0), _numIntersections(0), _queryLen(0), - _dbLen(0) + _dbLen(0), + _queryCounts(0), + _dbCounts(0), + _overlapCounts(0) + { _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal()); _haveExclude = false; @@ -34,7 +39,6 @@ bool Fisher::calculate() { } // header - cout << "# Contingency Table" << endl; double left, right, two; @@ -42,15 +46,30 @@ bool Fisher::calculate() { if(_haveExclude){ genomeSize -= exclude->getTotalFlattenedLength(); } - // bases covered by both - long long n22 = genomeSize - _queryLen - _dbLen + _intersectionVal; - // bases covered only by -a - long long n21 = _dbLen - _intersectionVal; - // bases covered only by -b - long long n12 = _queryLen - _intersectionVal; - // bases covered by neither a nor b - long long n11 = _intersectionVal; - + // bases covered by neither + long long n22_full_bases = genomeSize; + //long long n22_bases = genomeSize - _queryLen - _dbLen + _intersectionVal; + long double dMean = _dbLen / (long double)_dbCounts; + long double qMean = _queryLen / (long double)_queryCounts; + + // mean interval size + long double bMean = (_dbLen + _queryLen) / (long double)(_dbCounts + _queryCounts); + bMean = 1 + (qMean + dMean); // / 2.0; + + long long n11 = (long)_overlapCounts; + // this could be < 0 because multiple overlaps + long long n12 = (long)max(0L, (long)_queryCounts - (long)_overlapCounts); + long long n21 = (long)(_dbCounts - _overlapCounts); + long long n22_full = n22_full_bases / bMean; + long long n22 = max(0L, (long)(n22_full - n12 - n21 - n11)); + + printf("# Number of query intervals: %lu\n", _queryCounts); + printf("# Number of db intervals: %lu\n", _dbCounts); + printf("# Number of overlaps: %lu\n", _overlapCounts); + printf("# Number of possible intervals (estimated): %lld\n", n22_full); + + cout << "# Contingency Table Of Counts" << endl; + printf("# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)\n", n11, _queryCounts, n22_full, _queryCounts, _dbCounts); printf("#_________________________________________\n"); printf("# | %-12s | %-12s |\n", " in -b", "not in -b"); printf("# in -a | %-12lld | %-12lld |\n", n11, n12); @@ -89,6 +108,9 @@ bool Fisher::getFisher() { _queryLen = sweep.getQueryTotalRecordLength(); _dbLen = sweep.getDatabaseTotalRecordLength(); + _queryCounts = sweep.getQueryTotalRecords(); + _dbCounts = sweep.getDatabaseTotalRecords(); + _unionVal = _queryLen + _dbLen; return true; } @@ -100,10 +122,14 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList) int keyStart = key->getStartPos(); int keyEnd = key->getEndPos(); + _overlapCounts += recList.size(); + _qsizes.push_back((keyEnd - keyStart)); + int hitIdx = 0; for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { int maxStart = max((*iter)->getStartPos(), keyStart); int minEnd = min((*iter)->getEndPos(), keyEnd); + _qsizes.push_back((int)(minEnd - maxStart)); if (_context->getObeySplits()) { intersection += _blockMgr->getOverlapBases(hitIdx); hitIdx++; diff --git a/src/fisher/Fisher.h b/src/fisher/Fisher.h index e10bc8fb..c9d1850e 100644 --- a/src/fisher/Fisher.h +++ b/src/fisher/Fisher.h @@ -26,9 +26,13 @@ private: int _numIntersections; unsigned long _queryLen; unsigned long _dbLen; + unsigned long _queryCounts; + unsigned long _dbCounts; + unsigned long _overlapCounts; bool getFisher(); BedFile *exclude; + vector<int> _qsizes; unsigned long getTotalIntersection(RecordKeyVector &hits); }; diff --git a/src/fisher/fisherMain.cpp b/src/fisher/fisherMain.cpp index 755dc97a..d35b2868 100644 --- a/src/fisher/fisherMain.cpp +++ b/src/fisher/fisherMain.cpp @@ -32,13 +32,9 @@ void fisher_help(void) { cerr << "\nTool: bedtools fisher (aka fisher)" << endl; cerr << "Version: " << VERSION << "\n"; cerr << "Summary: Calculate Fisher statistic b/w two feature files." - << endl - << "\t Fisher is the length of the intersection over the union." - << endl - << "\t Values range from 0 (no intersection) to 1 (self intersection)." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>" << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -c <possible counts>" << endl << endl; cerr << "Options: " << endl; diff --git a/src/utils/Contexts/ContextFisher.cpp b/src/utils/Contexts/ContextFisher.cpp index 9452c069..fd00f982 100644 --- a/src/utils/Contexts/ContextFisher.cpp +++ b/src/utils/Contexts/ContextFisher.cpp @@ -7,7 +7,7 @@ ContextFisher::ContextFisher() { setSortedInput(true); - setUseMergedIntervals(true); + setUseMergedIntervals(false); } ContextFisher::~ContextFisher() { diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp index f7561e8c..82a9860d 100644 --- a/src/utils/NewChromsweep/NewChromsweep.cpp +++ b/src/utils/NewChromsweep/NewChromsweep.cpp @@ -16,34 +16,36 @@ NewChromSweep::NewChromSweep(ContextIntersect *context, bool useMergedIntervals) -: _context(context), - _queryFRM(NULL), - _numDBs(_context->getNumDatabaseFiles()), - _useMergedIntervals(false), - _queryRecordsTotalLength(0), - _databaseRecordsTotalLength(0), - _wasInitialized(false), - _currQueryRec(NULL), - _runToQueryEnd(false) +: _context(context), + _queryFRM(NULL), + _numDBs(_context->getNumDatabaseFiles()), + _useMergedIntervals(false), + _queryRecordsTotalLength(0), + _databaseRecordsTotalLength(0), + _queryTotalRecords(0), + _databaseTotalRecords(0), + _wasInitialized(false), + _currQueryRec(NULL), + _runToQueryEnd(false) { } bool NewChromSweep::init() { - //Create new FileRecordMgrs for the input files. - //Open them, and get the first record from each. - //otherwise, return true. + //Create new FileRecordMgrs for the input files. + //Open them, and get the first record from each. + //otherwise, return true. _queryFRM = _context->getFile(_context->getQueryFileIdx()); _dbFRMs.resize(_numDBs, NULL); for (int i=0; i < _numDBs; i++) { - _dbFRMs[i] = _context->getDatabaseFile(i); + _dbFRMs[i] = _context->getDatabaseFile(i); } _currDbRecs.resize(_numDBs, NULL); for (int i=0; i < _numDBs; i++) { - nextRecord(false, i); + nextRecord(false, i); } _caches.resize(_numDBs); @@ -52,50 +54,50 @@ bool NewChromSweep::init() { //end of the query file is hit as well. if (_context->getNoHit() || _context->getWriteCount() || _context->getWriteOverlap() || _context->getWriteAllOverlap() || _context->getLeftJoin()) { - _runToQueryEnd = true; + _runToQueryEnd = true; } _wasInitialized = true; return true; } void NewChromSweep::closeOut() { - while (!_queryFRM->eof()) { - nextRecord(true); - } + while (!_queryFRM->eof()) { + nextRecord(true); + } for (int i=0; i < _numDBs; i++) { - while (!_dbFRMs[i]->eof()) { - nextRecord(false, i); - } + while (!_dbFRMs[i]->eof()) { + nextRecord(false, i); + } } } NewChromSweep::~NewChromSweep(void) { - if (!_wasInitialized) { - return; - } - _queryFRM->deleteRecord(_currQueryRec); - _currQueryRec = NULL; + if (!_wasInitialized) { + return; + } + _queryFRM->deleteRecord(_currQueryRec); + _currQueryRec = NULL; for (int i=0; i < _numDBs; i++) { - _dbFRMs[i]->deleteRecord(_currDbRecs[i]); - _currDbRecs[i] = NULL; + _dbFRMs[i]->deleteRecord(_currDbRecs[i]); + _currDbRecs[i] = NULL; } - _queryFRM->close(); + _queryFRM->close(); for (int i=0; i < _numDBs; i++) { - _dbFRMs[i]->close(); + _dbFRMs[i]->close(); } } void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) { - recListIterType cacheIter = _caches[dbIdx].begin(); + recListIterType cacheIter = _caches[dbIdx].begin(); while (cacheIter != _caches[dbIdx].end()) { - const Record *cacheRec = cacheIter->value(); + const Record *cacheRec = cacheIter->value(); if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) { if (intersects(_currQueryRec, cacheRec)) { retList.push_back(cacheRec); @@ -104,151 +106,153 @@ void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) { } else { cacheIter = _caches[dbIdx].deleteCurrent(); - _dbFRMs[dbIdx]->deleteRecord(cacheRec); + _dbFRMs[dbIdx]->deleteRecord(cacheRec); } } } void NewChromSweep::clearCache(int dbIdx) { - //delete all objects pointed to by cache - recListType &cache = _caches[dbIdx]; - for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) { - _dbFRMs[dbIdx]->deleteRecord(iter->value()); - } - cache.clear(); + //delete all objects pointed to by cache + recListType &cache = _caches[dbIdx]; + for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) { + _dbFRMs[dbIdx]->deleteRecord(iter->value()); + } + cache.clear(); } void NewChromSweep::masterScan(RecordKeyVector &retList) { - for (int i=0; i < _numDBs; i++) { - if (dbFinished(i) || chromChange(i, retList)) { - continue; - } else { - - // scan the database cache for hits - scanCache(i, retList); - //skip if we hit the end of the DB - // advance the db until we are ahead of the query. update hits and cache as necessary - while (_currDbRecs[i] != NULL && - _currQueryRec->sameChrom(_currDbRecs[i]) && - !(_currDbRecs[i]->after(_currQueryRec))) { - if (intersects(_currQueryRec, _currDbRecs[i])) { - retList.push_back(_currDbRecs[i]); - } - if (_currQueryRec->after(_currDbRecs[i])) { - _dbFRMs[i]->deleteRecord(_currDbRecs[i]); - _currDbRecs[i] = NULL; - } else { - _caches[i].push_back(_currDbRecs[i]); - _currDbRecs[i] = NULL; - } - nextRecord(false, i); - } - } - } + for (int i=0; i < _numDBs; i++) { + if (dbFinished(i) || chromChange(i, retList)) { + continue; + } else { + + // scan the database cache for hits + scanCache(i, retList); + //skip if we hit the end of the DB + // advance the db until we are ahead of the query. update hits and cache as necessary + while (_currDbRecs[i] != NULL && + _currQueryRec->sameChrom(_currDbRecs[i]) && + !(_currDbRecs[i]->after(_currQueryRec))) { + if (intersects(_currQueryRec, _currDbRecs[i])) { + retList.push_back(_currDbRecs[i]); + } + if (_currQueryRec->after(_currDbRecs[i])) { + _dbFRMs[i]->deleteRecord(_currDbRecs[i]); + _currDbRecs[i] = NULL; + } else { + _caches[i].push_back(_currDbRecs[i]); + _currDbRecs[i] = NULL; + } + nextRecord(false, i); + } + } + } } bool NewChromSweep::chromChange(int dbIdx, RecordKeyVector &retList) { // the files are on the same chrom - if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) { - return false; - } - // the query is ahead of the database. fast-forward the database to catch-up. - if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) { - - while (_currDbRecs[dbIdx] != NULL && - _currQueryRec->chromAfter(_currDbRecs[dbIdx])) { - _dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]); - nextRecord(false, dbIdx); - } - clearCache(dbIdx); + if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) { + return false; + } + // the query is ahead of the database. fast-forward the database to catch-up. + if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) { + + while (_currDbRecs[dbIdx] != NULL && + _currQueryRec->chromAfter(_currDbRecs[dbIdx])) { + _dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]); + nextRecord(false, dbIdx); + } + clearCache(dbIdx); return false; } // the database is ahead of the query. else { // 1. scan the cache for remaining hits on the query's current chrom. - scanCache(dbIdx, retList); + scanCache(dbIdx, retList); return true; } - //control can't reach here, but compiler still wants a return statement. - return true; + //control can't reach here, but compiler still wants a return statement. + return true; } bool NewChromSweep::next(RecordKeyVector &retList) { - retList.clearVector(); - if (_currQueryRec != NULL) { - _queryFRM->deleteRecord(_currQueryRec); - } + retList.clearVector(); + if (_currQueryRec != NULL) { + _queryFRM->deleteRecord(_currQueryRec); + } - if (!nextRecord(true)) return false; // query EOF hit + if (!nextRecord(true)) return false; // query EOF hit - if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) { - return false; - } - _currChromName = _currQueryRec->getChrName(); + if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) { + return false; + } + _currChromName = _currQueryRec->getChrName(); - masterScan(retList); + masterScan(retList); - if (_context->getSortOutput()) { - retList.sortVector(); - } + if (_context->getSortOutput()) { + retList.sortVector(); + } - retList.setKey(_currQueryRec); - return true; + retList.setKey(_currQueryRec); + return true; } bool NewChromSweep::nextRecord(bool query, int dbIdx) { - if (query) { - _currQueryRec = _queryFRM->getNextRecord(); - if (_currQueryRec != NULL) { - _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos()); - return true; - } - return false; - } else { //database - Record *rec = _dbFRMs[dbIdx]->getNextRecord(); - _currDbRecs[dbIdx] = rec; - if (rec != NULL) { - _databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos()); - return true; - } - return false; - } + if (query) { + _currQueryRec = _queryFRM->getNextRecord(); + if (_currQueryRec != NULL) { + _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos()); + _queryTotalRecords++; + return true; + } + return false; + } else { //database + Record *rec = _dbFRMs[dbIdx]->getNextRecord(); + _currDbRecs[dbIdx] = rec; + if (rec != NULL) { + _databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos()); + _databaseTotalRecords++; + return true; + } + return false; + } } bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const { - return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(), - _context->getOverlapFraction(), _context->getReciprocal()); + return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(), + _context->getOverlapFraction(), _context->getReciprocal()); } bool NewChromSweep::allCachesEmpty() { - for (int i=0; i < _numDBs; i++) { - if (!_caches[i].empty()) { - return false; - } - } - return true; + for (int i=0; i < _numDBs; i++) { + if (!_caches[i].empty()) { + return false; + } + } + return true; } bool NewChromSweep::allCurrDBrecsNull() { - for (int i=0; i < _numDBs; i++) { - if (_currDbRecs[i] != NULL) { - return false; - } - } - return true; + for (int i=0; i < _numDBs; i++) { + if (_currDbRecs[i] != NULL) { + return false; + } + } + return true; } bool NewChromSweep::dbFinished(int dbIdx) { - if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) { - return true; - } - return false; + if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) { + return true; + } + return false; } diff --git a/src/utils/NewChromsweep/NewChromsweep.h b/src/utils/NewChromsweep/NewChromsweep.h index 1dfe5b7b..aca2bb1d 100644 --- a/src/utils/NewChromsweep/NewChromsweep.h +++ b/src/utils/NewChromsweep/NewChromsweep.h @@ -49,6 +49,9 @@ public: unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; } unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; } + unsigned long getQueryTotalRecords() { return _queryTotalRecords; } + unsigned long getDatabaseTotalRecords() { return _databaseTotalRecords; } + private: ContextIntersect *_context; FileRecordMgr *_queryFRM; @@ -63,6 +66,9 @@ private: unsigned long _databaseRecordsTotalLength; + unsigned long _queryTotalRecords; + unsigned long _databaseTotalRecords; + bool _wasInitialized; // a cache of still active features from the database file diff --git a/test/fisher/README.md b/test/fisher/README.md new file mode 100644 index 00000000..d0179ca9 --- /dev/null +++ b/test/fisher/README.md @@ -0,0 +1,20 @@ +Fisher Testing +============== + +Fisher is now based on the count of interval overlaps, subject to `-f`. +We can compare the output of fisher on simulated data by running `python sim.py` +which will show the output from `bedtools fisher` and then running `bash shuf.sh` +which will repeatedly run + +```Shell + +bedtools intersect -wo -a taa.bed -b <(bedtools shuffle -allowBeyondChromEnd -i tbb.bed -g tgg.genome) | wc -l +``` + +and then report the proportion of times that number is >= the observed intersection. + +In `sim.py` changing the lenght of the intervals (via `maxA`, `maxB`) has the greatest effect on the +correspondence of the simulated p-value from `shuf.sh` and the one from `fisher`. The right-tailed p-value +from `fisher` should correspond well to the value from the simulation. + + diff --git a/test/fisher/shuf.sh b/test/fisher/shuf.sh new file mode 100644 index 00000000..ec0755aa --- /dev/null +++ b/test/fisher/shuf.sh @@ -0,0 +1,4 @@ +obs=$(bedtools intersect -wo -a taa.bed -b tbb.bed | wc -l) + +seq 1000 | xargs -P 7 -n 1 bash -c "bedtools intersect -wo -a taa.bed -b <(bedtools shuffle -allowBeyondChromEnd -i tbb.bed -g tgg.genome) | wc -l " | awk -vobs=$obs '{s += ($1 > obs)}END{print s / NR}' + diff --git a/test/fisher/sim.py b/test/fisher/sim.py new file mode 100644 index 00000000..3379507e --- /dev/null +++ b/test/fisher/sim.py @@ -0,0 +1,27 @@ +import sys +from random import randint +from subprocess import check_output + +genome_size = 25000000 +nA = 210 +nB = 3390 + +minA, maxA = (20, 2000) +minB, maxB = (20, 1250) + +for f, imin, imax, n in ( + ('taa.bed', minA, maxA, nA), + ('tbb.bed', minB, maxB, nB)): + with open(f, 'w') as fh: + vals = [] + for i in range(n): + s = randint(0, genome_size - imax) + e = randint(s + imin, min(genome_size, s + imax)) + vals.append((s, e)) + for s, e in sorted(vals): + fh.write("chr1\t%i\t%i\n" % (s, e)) + fh.flush() + +print >> open('tgg.genome', 'w'), ("chr1\t%i" % genome_size) + +print check_output("../../bin/bedtools fisher -a taa.bed -b tbb.bed -g tgg.genome", shell=True) -- GitLab