diff --git a/.gitignore b/.gitignore index d831ebb0a5aa8522d6d869482511aaf7f19d0c66..d62c880bdc7fb71e7c9499e9cbbfef8f78e2604a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,4 @@ docs/_build/ src/utils/version/version_git.h .project .cproject +nbproject diff --git a/docs/content/tools/fisher.rst b/docs/content/tools/fisher.rst index 239328dd9112e051569dd6cfac74f4f24453088f..aa771c07821cad0144acaa06628d4a7211d1843f 100644 --- a/docs/content/tools/fisher.rst +++ b/docs/content/tools/fisher.rst @@ -48,7 +48,7 @@ can do this with ``fisher`` as: #_________________________________________ # p-values for fisher's exact test left right two-tail ratio - 1.00000 0.00001 0.00001 31.667 + 1 1.3466e-05 1.3466e-05 31.667 Where we can see the constructed contingency table and the pvalues for left, right diff --git a/src/bamToFastq/bamToFastq.cpp b/src/bamToFastq/bamToFastq.cpp index 8b80e7f729c1cedb719da3649da0a917aad2577b..245554c73aa8110cb19cdb566a4c5dba66cc50a1 100644 --- a/src/bamToFastq/bamToFastq.cpp +++ b/src/bamToFastq/bamToFastq.cpp @@ -79,20 +79,26 @@ void BamToFastq::PairedFastq() { reader.Open(_bamFile); // rip through the BAM file and convert each mapped entry to BEDPE BamAlignment bam1, bam2; - while (reader.GetNextAlignment(bam1)) { + bool shouldConsumeReads = true; + while (true) { - reader.GetNextAlignment(bam2); + if (shouldConsumeReads) { + if (!reader.GetNextAlignment(bam1) || !reader.GetNextAlignment(bam2)) break; + } else { + shouldConsumeReads = true; + } if (bam1.Name != bam2.Name) { while (bam1.Name != bam2.Name) { if (bam1.IsPaired()) { cerr << "*****WARNING: Query " << bam1.Name - << " is marked as paired, but it's mate does not occur" + << " is marked as paired, but its mate does not occur" << " next to it in your BAM file. Skipping. " << endl; } bam1 = bam2; - reader.GetNextAlignment(bam2); + if (!reader.GetNextAlignment(bam2)) break; + shouldConsumeReads = false; } } else if (bam1.IsPaired() && bam2.IsPaired()) { diff --git a/src/fisher/Fisher.cpp b/src/fisher/Fisher.cpp index 874da58b193c129dac4f4789898f488e36ef1867..8463e2f97b58bd3a2ec8a9a056d2753e62be736c 100644 --- a/src/fisher/Fisher.cpp +++ b/src/fisher/Fisher.cpp @@ -11,27 +11,27 @@ Fisher::Fisher(ContextFisher *context) _queryLen(0), _dbLen(0) { - _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal()); + _blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal()); } Fisher::~Fisher(void) { - delete _blockMgr; - _blockMgr = NULL; + delete _blockMgr; + _blockMgr = NULL; } bool Fisher::calculate() { - if (!getFisher()) { - return false; - } + if (!getFisher()) { + return false; + } - // header - cout << "# Contingency Table" << endl; + // header + cout << "# Contingency Table" << endl; // for fisher's exact test, we need the contingency table - // XXXXXXXX | not in A | in A + // XXXXXXXX | not in A | in A // not in B | n11: in neither | n12: only in A - // in B | n21: only in B | n22: in A & B + // in B | n21: only in B | n22: in A & B // double left, right, two; @@ -46,9 +46,9 @@ bool Fisher::calculate() { long long n22 = _intersectionVal; printf("#_________________________________________\n"); - printf("# | %-12s | %-12s |\n", "not in -b", "in -b"); + printf("# | %-12s | %-12s |\n", "not in -b", "in -b"); printf("# not in -a | %-12lld | %-12lld |\n", n11, n12); - printf("# in -a | %-12lld | %-12lld |\n", n21, n22); + printf("# in -a | %-12lld | %-12lld |\n", n21, n22); printf("#_________________________________________\n"); kt_fisher_exact(n11, n12, n21, n22, &left, &right, &two); @@ -56,57 +56,55 @@ bool Fisher::calculate() { printf("# p-values for fisher's exact test\n"); printf("left\tright\ttwo-tail\tratio\n"); - printf("%.5f\t%.5f\t%.5f\t%.3f\n", left, right, two, ratio); + printf("%.5g\t%.5g\t%.5g\t%.3f\n", left, right, two, ratio); - //kt_fisher_exact(50010000, 10000000, 15000000, 3000000, &left, &right, &two); - - return true; + return true; } bool Fisher::getFisher() { - NewChromSweep sweep(_context); - if (!sweep.init()) { - return false; - } - RecordKeyVector hitSet; - while (sweep.next(hitSet)) { - if (_context->getObeySplits()) { - RecordKeyVector keySet(hitSet.getKey()); - RecordKeyVector resultSet(hitSet.getKey()); - _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); - _intersectionVal += getTotalIntersection(resultSet); - } else { - _intersectionVal += getTotalIntersection(hitSet); - } - } - - sweep.closeOut(); - _queryLen = sweep.getQueryTotalRecordLength(); - _dbLen = sweep.getDatabaseTotalRecordLength(); - - _unionVal = _queryLen + _dbLen; - return true; + NewChromSweep sweep(_context); + if (!sweep.init()) { + return false; + } + RecordKeyVector hitSet; + while (sweep.next(hitSet)) { + if (_context->getObeySplits()) { + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet(hitSet.getKey()); + _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); + _intersectionVal += getTotalIntersection(resultSet); + } else { + _intersectionVal += getTotalIntersection(hitSet); + } + } + + sweep.closeOut(); + _queryLen = sweep.getQueryTotalRecordLength(); + _dbLen = sweep.getDatabaseTotalRecordLength(); + + _unionVal = _queryLen + _dbLen; + return true; } unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList) { - unsigned long intersection = 0; - const Record *key = recList.getKey(); - int keyStart = key->getStartPos(); - int keyEnd = key->getEndPos(); - - int hitIdx = 0; - for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { - int maxStart = max((*iter)->getStartPos(), keyStart); - int minEnd = min((*iter)->getEndPos(), keyEnd); - if (_context->getObeySplits()) { - intersection += _blockMgr->getOverlapBases(hitIdx); - hitIdx++; - } else { - intersection += (unsigned long)(minEnd - maxStart); - } - } - _numIntersections += (int)recList.size(); - return intersection; + unsigned long intersection = 0; + const Record *key = recList.getKey(); + int keyStart = key->getStartPos(); + int keyEnd = key->getEndPos(); + + int hitIdx = 0; + for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + int maxStart = max((*iter)->getStartPos(), keyStart); + int minEnd = min((*iter)->getEndPos(), keyEnd); + if (_context->getObeySplits()) { + intersection += _blockMgr->getOverlapBases(hitIdx); + hitIdx++; + } else { + intersection += (unsigned long)(minEnd - maxStart); + } + } + _numIntersections += (int)recList.size(); + return intersection; } diff --git a/test/bamtofastq/golden.fq b/test/bamtofastq/golden.fq new file mode 100644 index 0000000000000000000000000000000000000000..78819d813d217b9ea7eeb96707fd494967211bb4 --- /dev/null +++ b/test/bamtofastq/golden.fq @@ -0,0 +1,8 @@ +@paired-1/1 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ++ +DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD +@paired-2/1 +TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT ++ +FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF diff --git a/test/bamtofastq/golden.fq2 b/test/bamtofastq/golden.fq2 new file mode 100644 index 0000000000000000000000000000000000000000..4d32c42e6fd11b6a91c6cb82c206ae192c363a25 --- /dev/null +++ b/test/bamtofastq/golden.fq2 @@ -0,0 +1,8 @@ +@paired-1/2 +GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG ++ +EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE +@paired-2/2 +CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC ++ +HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH diff --git a/test/bamtofastq/test-bamtofastq.sh b/test/bamtofastq/test-bamtofastq.sh new file mode 100755 index 0000000000000000000000000000000000000000..5cd672fc0d29a066d03d6b010cdc5562da184c24 --- /dev/null +++ b/test/bamtofastq/test-bamtofastq.sh @@ -0,0 +1,20 @@ +BT=${BT-../../bin/bedtools} + +check() +{ + if diff $1 $2; then + echo ok + else + echo fail + fi +} + +samtools view -Sb test.sam > test.bam 2> /dev/null + +$BT bamtofastq -i test.bam -fq test.fq -fq2 test.fq2 2> /dev/null + +check test.fq golden.fq +check test.fq2 golden.fq2 + +rm test.bam test.fq test.fq2 + diff --git a/test/bamtofastq/test.sam b/test/bamtofastq/test.sam new file mode 100644 index 0000000000000000000000000000000000000000..3d64014030c354d1155ab2d47f48ca89199444d2 --- /dev/null +++ b/test/bamtofastq/test.sam @@ -0,0 +1,7 @@ +@HD VN:1.0 GO:none SO:coordinate +@SQ SN:chr1 LN:249250621 +paired-1 1 chr1 1 40 30M * 0 0 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA DDDDDDDDDDDDDDDDDDDDDDDDDDDDDD MD:Z:50 +paired-1 1 chr1 1 100 30M * 0 0 GGGGGGGGGGGGGGGGGGGGGGGGGGGGGG EEEEEEEEEEEEEEEEEEEEEEEEEEEEEE MD:Z:50 +unpaired 1 chr1 1 100 30M * 0 0 GAAGGCCACCGCCGCGGTTATTTTCCTTCA CCCDDB?=FJIIJIGFJIJHIJJJJJJJJI MD:Z:50 +paired-2 1 chr1 1 40 30M * 0 0 TTTTTTTTTTTTTTTTTTTTTTTTTTTTTT FFFFFFFFFFFFFFFFFFFFFFFFFFFFFF MD:Z:50 +paired-2 1 chr1 1 100 30M * 0 0 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCC HHHHHHHHHHHHHHHHHHHHHHHHHHHHHH MD:Z:50