Commit 9f2d8490 authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Unit tests for coverge, corrections to -split behavior for jaccard and fisher, minor code clean up

parent 43006c92
/*****************************************************************************
coverageBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "coverageBed.h"
// build
BedCoverage::BedCoverage(string &bedAFile, string &bedBFile,
bool sameStrand, bool diffStrand,
bool writeHistogram, bool bamInput,
bool obeySplits, bool eachBase,
bool countsOnly)
{
_bedAFile = bedAFile;
_bedBFile = bedBFile;
_bedA = new BedFile(bedAFile);
_bedB = new BedFile(bedBFile);
_sameStrand = sameStrand;
_diffStrand = diffStrand;
_obeySplits = obeySplits;
_eachBase = eachBase;
_writeHistogram = writeHistogram;
_bamInput = bamInput;
_countsOnly = countsOnly;
if (_bamInput == false)
CollectCoverageBed();
else
CollectCoverageBam(_bedA->bedFile);
}
// destroy
BedCoverage::~BedCoverage(void) {
delete _bedA;
delete _bedB;
}
void BedCoverage::CollectCoverageBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB->loadBedCovFileIntoMap();
BED a;
_bedA->Open();
// process each entry in A
while (_bedA->GetNextBed(a)) {
if (_bedA->_status == BED_VALID) {
// process the BED entry as a single block
if (_obeySplits == false)
_bedB->countHits(a, _sameStrand,
_diffStrand, _countsOnly);
// split the BED into discrete blocksand process
// each independently.
else {
bedVector bedBlocks;
GetBedBlocks(a, bedBlocks);
// use countSplitHits to avoid over-counting
// each split chunk as distinct read coverage.
_bedB->countSplitHits(bedBlocks, _sameStrand,
_diffStrand, _countsOnly);
}
}
}
_bedA->Close();
// report the coverage (summary or histogram) for BED B.
if (_countsOnly == true)
ReportCounts();
else
ReportCoverage();
}
void BedCoverage::CollectCoverageBam(string bamFile) {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB->loadBedCovFileIntoMap();
// open the BAM file
BamReader reader;
if (!reader.Open(bamFile)) {
cerr << "Failed to open BAM file " << bamFile << endl;
exit(1);
}
// get header & reference information
string header = reader.GetHeaderText();
RefVector refs = reader.GetReferenceData();
// convert each aligned BAM entry to BED
// and compute coverage on B
BamAlignment bam;
while (reader.GetNextAlignment(bam)) {
if (bam.IsMapped()) {
// treat the BAM alignment as a single "block"
if (_obeySplits == false) {
// construct a new BED entry from the current
// BAM alignment.
BED a;
try
{
a.chrom = refs.at(bam.RefID).RefName;
a.start = bam.Position;
a.end = bam.GetEndPosition(false, false);
a.strand = "+";
if (bam.IsReverseStrand()) a.strand = "-";
_bedB->countHits(a, _sameStrand,
_diffStrand, _countsOnly);
}
catch (out_of_range& oor)
{
cerr << bam.Name
<< " is said to be mapped (0x4 == false), "
<< "yet the chrom is missing. Skipping."
<< endl;
}
}
// split the BAM alignment into discrete blocks and
// look for overlaps only within each block.
else {
// vec to store the discrete BED "blocks" from a
bedVector bedBlocks;
// since we are counting coverage, we do want
// to split blocks when a deletion (D) CIGAR op
// is encountered (hence the true for the last parm)
try
{
string chrom = refs.at(bam.RefID).RefName;
GetBamBlocks(bam, chrom, bedBlocks, false, true);
// use countSplitHits to avoid over-counting
// each split chunk as distinct read coverage.
_bedB->countSplitHits(bedBlocks, _sameStrand,
_diffStrand, _countsOnly);
}
catch (out_of_range& oor)
{
cerr << bam.Name
<< " is said to be mapped (0x4 == false), "
<< "yet the chrom is missing. Skipping."
<< endl;
}
}
}
}
// report the coverage (summary or histogram) for BED B.
if (_countsOnly == true)
ReportCounts();
else
ReportCoverage();
// close the BAM file
reader.Close();
}
void BedCoverage::ReportCounts() {
// process each chromosome
masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
for (; chromItr != chromEnd; ++chromItr)
{
// for each chrom, process each bin
binsToBedCovs::const_iterator binItr = chromItr->second.begin();
binsToBedCovs::const_iterator binEnd = chromItr->second.end();
for (; binItr != binEnd; ++binItr)
{
// for each chrom & bin, compute and report
// the observed coverage for each feature
vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
for (; bedItr != bedEnd; ++bedItr)
{
_bedB->reportBedTab(*bedItr);
printf("%d\n", bedItr->count);
}
}
}
}
void BedCoverage::ReportCoverage() {
map<unsigned long, unsigned long> allDepthHist;
unsigned long totalLength = 0;
// process each chromosome
masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
for (; chromItr != chromEnd; ++chromItr)
{
// for each chrom, process each bin
binsToBedCovs::const_iterator binItr = chromItr->second.begin();
binsToBedCovs::const_iterator binEnd = chromItr->second.end();
for (; binItr != binEnd; ++binItr)
{
// for each chrom & bin, compute and report
// the observed coverage for each feature
vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
for (; bedItr != bedEnd; ++bedItr)
{
int zeroDepthCount = 0; // number of bases with zero depth
int depth = 0; // tracks the depth at the current base
// the start is either the first base in the feature OR
// the leftmost position of an overlapping feature.
// e.g. (s = start):
// A ----------
// B s ------------
int start = min(bedItr->minOverlapStart, bedItr->start);
// track the number of bases in the feature covered by
// 0, 1, 2, ... n features in A
map<unsigned int, unsigned int> depthHist;
map<unsigned int, DEPTH>::const_iterator depthItr;
// compute the coverage observed at each base in
// the feature marching from start to end.
for (CHRPOS pos = start+1; pos <= bedItr->end; pos++)
{
// map pointer grabbing the starts and
// ends observed at this position
depthItr = bedItr->depthMap.find(pos);
// increment coverage if starts observed at this position.
if (depthItr != bedItr->depthMap.end())
depth += depthItr->second.starts;
// update coverage assuming the current position is
// within the current B feature
if ((pos > bedItr->start) && (pos <= bedItr->end)) {
if (depth == 0) zeroDepthCount++;
// update our histograms, assuming we are not
// reporting "per-base" coverage.
if (_eachBase == false) {
depthHist[depth]++;
allDepthHist[depth]++;
}
else if ((_eachBase == true) &&
(bedItr->zeroLength == false))
{
_bedB->reportBedTab(*bedItr);
printf("%d\t%d\n", pos-bedItr->start, depth);
}
}
// decrement coverage if ends observed at this position.
if (depthItr != bedItr->depthMap.end())
depth = depth - depthItr->second.ends;
}
// handle the special case where the user wants "per-base" depth
// but the current feature is length = 0.
if ((_eachBase == true) && (bedItr->zeroLength == true)) {
_bedB->reportBedTab(*bedItr);
printf("1\t%d\n",depth);
}
// Summarize the coverage for the current interval,
// assuming the user has not requested "per-base" coverage.
else if (_eachBase == false)
{
CHRPOS length = bedItr->end - bedItr->start;
if (bedItr->zeroLength == true) {
length = 0;
}
totalLength += length;
int nonZeroBases = (length - zeroDepthCount);
float fractCovered = 0.0;
if (bedItr->zeroLength == false) {
fractCovered = (float) nonZeroBases / length;
}
// print a summary of the coverage
if (_writeHistogram == false) {
_bedB->reportBedTab(*bedItr);
printf("%d\t%d\t%d\t%0.7f\n",
bedItr->count, nonZeroBases,
length, fractCovered);
}
// HISTOGRAM
// report the number of bases with coverage == x
else {
// produce a histogram when not a zero length feature.
if (bedItr->zeroLength == false) {
map<unsigned int, unsigned int>::const_iterator
histItr = depthHist.begin();
map<unsigned int, unsigned int>::const_iterator
histEnd = depthHist.end();
for (; histItr != histEnd; ++histItr)
{
float fractAtThisDepth = (float)
histItr->second / length;
_bedB->reportBedTab(*bedItr);
printf("%d\t%d\t%d\t%0.7f\n",
histItr->first, histItr->second,
length, fractAtThisDepth);
}
}
// special case when it is a zero length feauture.
else {
_bedB->reportBedTab(*bedItr);
printf("%d\t%d\t%d\t%0.7f\n",
bedItr->count, 0, 0, 1.0000000);
}
}
}
}
}
}
// report a histogram of coverage among _all_
// features in B.
if (_writeHistogram == true) {
map<unsigned long, unsigned long>::const_iterator
histItr = allDepthHist.begin();
map<unsigned long, unsigned long>::const_iterator
histEnd = allDepthHist.end();
for (; histItr != histEnd; ++histItr) {
float fractAtThisDepth = (float) histItr->second / totalLength;
printf("all\t%lu\t%lu\t%lu\t%0.7f\n",
histItr->first, histItr->second,
totalLength, fractAtThisDepth);
}
}
}
/*****************************************************************************
coverageBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef COVERAGEBED_H
#define COVERAGEBED_H
#include "bedFile.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
#include "BlockedIntervals.h"
using namespace BamTools;
#include <vector>
#include <algorithm>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <stdlib.h>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedCoverage {
public:
// constructor
BedCoverage(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, bool writeHistogram,
bool bamInput, bool obeySplits, bool eachBase, bool countsOnly);
// destructor
~BedCoverage(void);
private:
// input files.
string _bedAFile;
string _bedBFile;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
// do we care about same or opposite strandedness when counting coverage?
bool _sameStrand;
bool _diffStrand;
// should we write a histogram for each feature in B?
bool _writeHistogram;
// are we dealing with BAM input for "A"?
bool _bamInput;
// should we split BED/BAM into discrete blocks?
bool _obeySplits;
// should discrete coverage be reported for each base in each feature?
bool _eachBase;
// should we just count overlaps and not try to describe the breadth?
bool _countsOnly;
// private function for reporting coverage information
void ReportCoverage();
// private function for reporting overlap counts
void ReportCounts();
void CollectCoverageBed();
void CollectCoverageBam(string bamFile);
};
#endif /* COVERAGEBED_H */
......@@ -13,6 +13,7 @@ ContextCoverage::ContextCoverage()
_showHist(false),
_coverageType(DEFAULT)
{
setExplicitBedOutput(true); //do not allow BAM output
}
......
......@@ -271,3 +271,23 @@ void BamRecord::buildCigarStr() {
}
}
int BamRecord::getLength(bool obeySplits) const {
//only bed12 and BAM need to check splits
if (!obeySplits) {
return _endPos - _startPos;
} else {
int length = 0;
//parse the Cigar Data.
const vector<BamTools::CigarOp> &cigarData = _bamAlignment.CigarData;
for (int i=0; i < (int)cigarData.size(); i++) {
char op = cigarData[i].Type;
if (op == 'M' || op == 'N' || op == 'I') {
length += (int)cigarData[i].Length;
}
}
return length;
}
}
......@@ -59,6 +59,9 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 12; }
static bool isNumericField(int fieldNum);
int getLength(bool obeySplits) const;
protected:
BamTools::BamAlignment _bamAlignment;
int _bamChromId; //different from chromId, because BAM file may be in different order
......
......@@ -172,3 +172,23 @@ bool Bed12Interval::isNumericField(int fieldNum) {
}
}
int Bed12Interval::getLength(bool obeySplits) const {
//only bed12 and BAM need to check splits
if (!obeySplits || _blockCount <=0) {
return _endPos - _startPos;
} else {
int length = 0;
//parse the blockSizes string.
char numBuf[16];
const char *startPtr = _blockSizes.c_str();
const char *endPtr = startPtr;
for (int i=0; i < _blockCount; i++) {
memset(numBuf, 0, 16);
endPtr = strchr(endPtr, ',');
memcpy(numBuf, startPtr, endPtr - startPtr);
length += str2chrPos(numBuf);
startPtr = ++endPtr;
}
return length;
}
}
......@@ -55,6 +55,7 @@ public:
virtual const QuickString &getField(int fieldNum) const;
virtual int getNumFields() const { return 12; }
static bool isNumericField(int fieldNum);
int getLength(bool obeySplits) const;
protected:
......
......@@ -35,7 +35,7 @@ void BlockMgr::getBlocks(RecordKeyVector &keyList, bool &mustDelete)
switch (keyList.getKey()->getType()) {
case FileRecordTypeChecker::BED12_RECORD_TYPE:
getBlocksFromBed12(keyList, mustDelete);
break; //not necessary after return, but here in case code is later modified.
break;
case FileRecordTypeChecker::BAM_RECORD_TYPE:
getBlocksFromBam(keyList, mustDelete);
......@@ -174,7 +174,7 @@ int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hit
RecordKeyVector hitBlocks(*hitListIter);
bool deleteHitBlocks = false;
getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record.
int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total lentgh of the bocks for the hitRecord.
int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total length of the bocks for the hitRecord.
int totalHitOverlap = 0;
bool hitHasOverlap = false;
......
......@@ -25,7 +25,6 @@ public:
BlockMgr(float overlapFraction = 1E-9, bool hasReciprocal = false);
~BlockMgr();
// Return value is the number of blocks this main record has been split into.
void getBlocks(RecordKeyVector &keyList, bool &mustDelete);
void deleteBlocks(RecordKeyVector &keyList);
......
......@@ -271,3 +271,8 @@ void Record::print(FILE *fp, bool newline) const {
fprintf(fp, "%s", buf.c_str());
if(newline) fprintf(fp, "\n");
}
int Record::getLength(bool obeySplits) const {
//only bed12 and BAM need to check splits
return _endPos - _startPos;
}
......@@ -138,6 +138,7 @@ public:
bool hasChrInChromName() const;
bool hasLeadingZeroInChromName(bool chrKnown = false) const;
virtual int getLength(bool obeySplits) const;
protected:
......
......@@ -84,10 +84,12 @@ void RecordKeyVector::setKey(elemType key) {
}
void RecordKeyVector::setVector(vecType *vec) {
_currPos = 0;
_recVec = vec;
}
void RecordKeyVector::clearVector() {
_currPos = 0;
_recVec->clear();
}
......
......@@ -261,7 +261,7 @@ bool NewChromSweep::nextRecord(bool query, int dbIdx) {
if (query) {
_currQueryRec = _queryFRM->getNextRecord();
if (_currQueryRec != NULL) {
_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos());
_queryRecordsTotalLength += (unsigned long)(_currQueryRec->getLength(_context->getObeySplits()));
_queryTotalRecords++;
return true;
}
......@@ -270,7 +270,7 @@ bool NewChromSweep::nextRecord(bool query, int dbIdx) {
Record *rec = _dbFRMs[dbIdx]->getNextRecord();
_currDbRecs[dbIdx] = rec;
if (rec != NULL) {
_databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos());
_databaseRecordsTotalLength += (unsigned long)(rec->getLength(_context->getObeySplits()));
_databaseTotalRecords++;
return true;
}
......
......@@ -37,11 +37,6 @@ int str2chrPos(const QuickString &str);
template<class T>
void int2str(int number, T& buffer, bool appendToBuf = false)
{
int maxVal = (1 << 31) -1;
if (((int)(abs(number))) > maxVal) {
fprintf(stderr, "ERROR: number out of bounds.\n");
exit(1);
}
register int useNum = number;
if (useNum == 0) {
if (appendToBuf) {
......
......@@ -24,12 +24,440 @@ samtools view -Sb sam-w-del.sam > sam-w-del.bam 2>/dev/null
##################################################################
# Test three blocks without -split
##################################################################
# echo " coverage.t1...\c"
# echo \
# "chr1 0 50 1" > exp
# $BT coverage -abam three_blocks.bam -b three_blocks_nomatch.bed > obs
# check obs exp
# rm obs exp