Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
9f2d8490
Commit
9f2d8490
authored
May 15, 2015
by
Neil Kindlon
Browse files
Unit tests for coverge, corrections to -split behavior for jaccard and fisher, minor code clean up
parent
43006c92
Changes
16
Hide whitespace changes
Inline
Side-by-side
src/coverageFile/coverageBed.cpp
deleted
100644 → 0
View file @
43006c92
/*****************************************************************************
coverageBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include
"lineFileUtilities.h"
#include
"coverageBed.h"
// build
BedCoverage
::
BedCoverage
(
string
&
bedAFile
,
string
&
bedBFile
,
bool
sameStrand
,
bool
diffStrand
,
bool
writeHistogram
,
bool
bamInput
,
bool
obeySplits
,
bool
eachBase
,
bool
countsOnly
)
{
_bedAFile
=
bedAFile
;
_bedBFile
=
bedBFile
;
_bedA
=
new
BedFile
(
bedAFile
);
_bedB
=
new
BedFile
(
bedBFile
);
_sameStrand
=
sameStrand
;
_diffStrand
=
diffStrand
;
_obeySplits
=
obeySplits
;
_eachBase
=
eachBase
;
_writeHistogram
=
writeHistogram
;
_bamInput
=
bamInput
;
_countsOnly
=
countsOnly
;
if
(
_bamInput
==
false
)
CollectCoverageBed
();
else
CollectCoverageBam
(
_bedA
->
bedFile
);
}
// destroy
BedCoverage
::~
BedCoverage
(
void
)
{
delete
_bedA
;
delete
_bedB
;
}
void
BedCoverage
::
CollectCoverageBed
()
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB
->
loadBedCovFileIntoMap
();
BED
a
;
_bedA
->
Open
();
// process each entry in A
while
(
_bedA
->
GetNextBed
(
a
))
{
if
(
_bedA
->
_status
==
BED_VALID
)
{
// process the BED entry as a single block
if
(
_obeySplits
==
false
)
_bedB
->
countHits
(
a
,
_sameStrand
,
_diffStrand
,
_countsOnly
);
// split the BED into discrete blocksand process
// each independently.
else
{
bedVector
bedBlocks
;
GetBedBlocks
(
a
,
bedBlocks
);
// use countSplitHits to avoid over-counting
// each split chunk as distinct read coverage.
_bedB
->
countSplitHits
(
bedBlocks
,
_sameStrand
,
_diffStrand
,
_countsOnly
);
}
}
}
_bedA
->
Close
();
// report the coverage (summary or histogram) for BED B.
if
(
_countsOnly
==
true
)
ReportCounts
();
else
ReportCoverage
();
}
void
BedCoverage
::
CollectCoverageBam
(
string
bamFile
)
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB
->
loadBedCovFileIntoMap
();
// open the BAM file
BamReader
reader
;
if
(
!
reader
.
Open
(
bamFile
))
{
cerr
<<
"Failed to open BAM file "
<<
bamFile
<<
endl
;
exit
(
1
);
}
// get header & reference information
string
header
=
reader
.
GetHeaderText
();
RefVector
refs
=
reader
.
GetReferenceData
();
// convert each aligned BAM entry to BED
// and compute coverage on B
BamAlignment
bam
;
while
(
reader
.
GetNextAlignment
(
bam
))
{
if
(
bam
.
IsMapped
())
{
// treat the BAM alignment as a single "block"
if
(
_obeySplits
==
false
)
{
// construct a new BED entry from the current
// BAM alignment.
BED
a
;
try
{
a
.
chrom
=
refs
.
at
(
bam
.
RefID
).
RefName
;
a
.
start
=
bam
.
Position
;
a
.
end
=
bam
.
GetEndPosition
(
false
,
false
);
a
.
strand
=
"+"
;
if
(
bam
.
IsReverseStrand
())
a
.
strand
=
"-"
;
_bedB
->
countHits
(
a
,
_sameStrand
,
_diffStrand
,
_countsOnly
);
}
catch
(
out_of_range
&
oor
)
{
cerr
<<
bam
.
Name
<<
" is said to be mapped (0x4 == false), "
<<
"yet the chrom is missing. Skipping."
<<
endl
;
}
}
// split the BAM alignment into discrete blocks and
// look for overlaps only within each block.
else
{
// vec to store the discrete BED "blocks" from a
bedVector
bedBlocks
;
// since we are counting coverage, we do want
// to split blocks when a deletion (D) CIGAR op
// is encountered (hence the true for the last parm)
try
{
string
chrom
=
refs
.
at
(
bam
.
RefID
).
RefName
;
GetBamBlocks
(
bam
,
chrom
,
bedBlocks
,
false
,
true
);
// use countSplitHits to avoid over-counting
// each split chunk as distinct read coverage.
_bedB
->
countSplitHits
(
bedBlocks
,
_sameStrand
,
_diffStrand
,
_countsOnly
);
}
catch
(
out_of_range
&
oor
)
{
cerr
<<
bam
.
Name
<<
" is said to be mapped (0x4 == false), "
<<
"yet the chrom is missing. Skipping."
<<
endl
;
}
}
}
}
// report the coverage (summary or histogram) for BED B.
if
(
_countsOnly
==
true
)
ReportCounts
();
else
ReportCoverage
();
// close the BAM file
reader
.
Close
();
}
void
BedCoverage
::
ReportCounts
()
{
// process each chromosome
masterBedCovMap
::
const_iterator
chromItr
=
_bedB
->
bedCovMap
.
begin
();
masterBedCovMap
::
const_iterator
chromEnd
=
_bedB
->
bedCovMap
.
end
();
for
(;
chromItr
!=
chromEnd
;
++
chromItr
)
{
// for each chrom, process each bin
binsToBedCovs
::
const_iterator
binItr
=
chromItr
->
second
.
begin
();
binsToBedCovs
::
const_iterator
binEnd
=
chromItr
->
second
.
end
();
for
(;
binItr
!=
binEnd
;
++
binItr
)
{
// for each chrom & bin, compute and report
// the observed coverage for each feature
vector
<
BEDCOV
>::
const_iterator
bedItr
=
binItr
->
second
.
begin
();
vector
<
BEDCOV
>::
const_iterator
bedEnd
=
binItr
->
second
.
end
();
for
(;
bedItr
!=
bedEnd
;
++
bedItr
)
{
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"%d
\n
"
,
bedItr
->
count
);
}
}
}
}
void
BedCoverage
::
ReportCoverage
()
{
map
<
unsigned
long
,
unsigned
long
>
allDepthHist
;
unsigned
long
totalLength
=
0
;
// process each chromosome
masterBedCovMap
::
const_iterator
chromItr
=
_bedB
->
bedCovMap
.
begin
();
masterBedCovMap
::
const_iterator
chromEnd
=
_bedB
->
bedCovMap
.
end
();
for
(;
chromItr
!=
chromEnd
;
++
chromItr
)
{
// for each chrom, process each bin
binsToBedCovs
::
const_iterator
binItr
=
chromItr
->
second
.
begin
();
binsToBedCovs
::
const_iterator
binEnd
=
chromItr
->
second
.
end
();
for
(;
binItr
!=
binEnd
;
++
binItr
)
{
// for each chrom & bin, compute and report
// the observed coverage for each feature
vector
<
BEDCOV
>::
const_iterator
bedItr
=
binItr
->
second
.
begin
();
vector
<
BEDCOV
>::
const_iterator
bedEnd
=
binItr
->
second
.
end
();
for
(;
bedItr
!=
bedEnd
;
++
bedItr
)
{
int
zeroDepthCount
=
0
;
// number of bases with zero depth
int
depth
=
0
;
// tracks the depth at the current base
// the start is either the first base in the feature OR
// the leftmost position of an overlapping feature.
// e.g. (s = start):
// A ----------
// B s ------------
int
start
=
min
(
bedItr
->
minOverlapStart
,
bedItr
->
start
);
// track the number of bases in the feature covered by
// 0, 1, 2, ... n features in A
map
<
unsigned
int
,
unsigned
int
>
depthHist
;
map
<
unsigned
int
,
DEPTH
>::
const_iterator
depthItr
;
// compute the coverage observed at each base in
// the feature marching from start to end.
for
(
CHRPOS
pos
=
start
+
1
;
pos
<=
bedItr
->
end
;
pos
++
)
{
// map pointer grabbing the starts and
// ends observed at this position
depthItr
=
bedItr
->
depthMap
.
find
(
pos
);
// increment coverage if starts observed at this position.
if
(
depthItr
!=
bedItr
->
depthMap
.
end
())
depth
+=
depthItr
->
second
.
starts
;
// update coverage assuming the current position is
// within the current B feature
if
((
pos
>
bedItr
->
start
)
&&
(
pos
<=
bedItr
->
end
))
{
if
(
depth
==
0
)
zeroDepthCount
++
;
// update our histograms, assuming we are not
// reporting "per-base" coverage.
if
(
_eachBase
==
false
)
{
depthHist
[
depth
]
++
;
allDepthHist
[
depth
]
++
;
}
else
if
((
_eachBase
==
true
)
&&
(
bedItr
->
zeroLength
==
false
))
{
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"%d
\t
%d
\n
"
,
pos
-
bedItr
->
start
,
depth
);
}
}
// decrement coverage if ends observed at this position.
if
(
depthItr
!=
bedItr
->
depthMap
.
end
())
depth
=
depth
-
depthItr
->
second
.
ends
;
}
// handle the special case where the user wants "per-base" depth
// but the current feature is length = 0.
if
((
_eachBase
==
true
)
&&
(
bedItr
->
zeroLength
==
true
))
{
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"1
\t
%d
\n
"
,
depth
);
}
// Summarize the coverage for the current interval,
// assuming the user has not requested "per-base" coverage.
else
if
(
_eachBase
==
false
)
{
CHRPOS
length
=
bedItr
->
end
-
bedItr
->
start
;
if
(
bedItr
->
zeroLength
==
true
)
{
length
=
0
;
}
totalLength
+=
length
;
int
nonZeroBases
=
(
length
-
zeroDepthCount
);
float
fractCovered
=
0.0
;
if
(
bedItr
->
zeroLength
==
false
)
{
fractCovered
=
(
float
)
nonZeroBases
/
length
;
}
// print a summary of the coverage
if
(
_writeHistogram
==
false
)
{
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"%d
\t
%d
\t
%d
\t
%0.7f
\n
"
,
bedItr
->
count
,
nonZeroBases
,
length
,
fractCovered
);
}
// HISTOGRAM
// report the number of bases with coverage == x
else
{
// produce a histogram when not a zero length feature.
if
(
bedItr
->
zeroLength
==
false
)
{
map
<
unsigned
int
,
unsigned
int
>::
const_iterator
histItr
=
depthHist
.
begin
();
map
<
unsigned
int
,
unsigned
int
>::
const_iterator
histEnd
=
depthHist
.
end
();
for
(;
histItr
!=
histEnd
;
++
histItr
)
{
float
fractAtThisDepth
=
(
float
)
histItr
->
second
/
length
;
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"%d
\t
%d
\t
%d
\t
%0.7f
\n
"
,
histItr
->
first
,
histItr
->
second
,
length
,
fractAtThisDepth
);
}
}
// special case when it is a zero length feauture.
else
{
_bedB
->
reportBedTab
(
*
bedItr
);
printf
(
"%d
\t
%d
\t
%d
\t
%0.7f
\n
"
,
bedItr
->
count
,
0
,
0
,
1.0000000
);
}
}
}
}
}
}
// report a histogram of coverage among _all_
// features in B.
if
(
_writeHistogram
==
true
)
{
map
<
unsigned
long
,
unsigned
long
>::
const_iterator
histItr
=
allDepthHist
.
begin
();
map
<
unsigned
long
,
unsigned
long
>::
const_iterator
histEnd
=
allDepthHist
.
end
();
for
(;
histItr
!=
histEnd
;
++
histItr
)
{
float
fractAtThisDepth
=
(
float
)
histItr
->
second
/
totalLength
;
printf
(
"all
\t
%lu
\t
%lu
\t
%lu
\t
%0.7f
\n
"
,
histItr
->
first
,
histItr
->
second
,
totalLength
,
fractAtThisDepth
);
}
}
}
src/coverageFile/coverageBed.h
deleted
100644 → 0
View file @
43006c92
/*****************************************************************************
coverageBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef COVERAGEBED_H
#define COVERAGEBED_H
#include
"bedFile.h"
#include
"api/BamReader.h"
#include
"api/BamAux.h"
#include
"BlockedIntervals.h"
using
namespace
BamTools
;
#include
<vector>
#include
<algorithm>
#include
<iostream>
#include
<iomanip>
#include
<fstream>
#include
<stdlib.h>
using
namespace
std
;
//************************************************
// Class methods and elements
//************************************************
class
BedCoverage
{
public:
// constructor
BedCoverage
(
string
&
bedAFile
,
string
&
bedBFile
,
bool
sameStrand
,
bool
diffStrand
,
bool
writeHistogram
,
bool
bamInput
,
bool
obeySplits
,
bool
eachBase
,
bool
countsOnly
);
// destructor
~
BedCoverage
(
void
);
private:
// input files.
string
_bedAFile
;
string
_bedBFile
;
// instance of a bed file class.
BedFile
*
_bedA
,
*
_bedB
;
// do we care about same or opposite strandedness when counting coverage?
bool
_sameStrand
;
bool
_diffStrand
;
// should we write a histogram for each feature in B?
bool
_writeHistogram
;
// are we dealing with BAM input for "A"?
bool
_bamInput
;
// should we split BED/BAM into discrete blocks?
bool
_obeySplits
;
// should discrete coverage be reported for each base in each feature?
bool
_eachBase
;
// should we just count overlaps and not try to describe the breadth?
bool
_countsOnly
;
// private function for reporting coverage information
void
ReportCoverage
();
// private function for reporting overlap counts
void
ReportCounts
();
void
CollectCoverageBed
();
void
CollectCoverageBam
(
string
bamFile
);
};
#endif
/* COVERAGEBED_H */
src/utils/Contexts/ContextCoverage.cpp
View file @
9f2d8490
...
...
@@ -13,6 +13,7 @@ ContextCoverage::ContextCoverage()
_showHist
(
false
),
_coverageType
(
DEFAULT
)
{
setExplicitBedOutput
(
true
);
//do not allow BAM output
}
...
...
src/utils/FileRecordTools/Records/BamRecord.cpp
View file @
9f2d8490
...
...
@@ -271,3 +271,23 @@ void BamRecord::buildCigarStr() {
}
}
int
BamRecord
::
getLength
(
bool
obeySplits
)
const
{
//only bed12 and BAM need to check splits
if
(
!
obeySplits
)
{
return
_endPos
-
_startPos
;
}
else
{
int
length
=
0
;
//parse the Cigar Data.
const
vector
<
BamTools
::
CigarOp
>
&
cigarData
=
_bamAlignment
.
CigarData
;
for
(
int
i
=
0
;
i
<
(
int
)
cigarData
.
size
();
i
++
)
{
char
op
=
cigarData
[
i
].
Type
;
if
(
op
==
'M'
||
op
==
'N'
||
op
==
'I'
)
{
length
+=
(
int
)
cigarData
[
i
].
Length
;
}
}
return
length
;
}
}
src/utils/FileRecordTools/Records/BamRecord.h
View file @
9f2d8490
...
...
@@ -59,6 +59,9 @@ public:
virtual
const
QuickString
&
getField
(
int
fieldNum
)
const
;
virtual
int
getNumFields
()
const
{
return
12
;
}
static
bool
isNumericField
(
int
fieldNum
);
int
getLength
(
bool
obeySplits
)
const
;
protected:
BamTools
::
BamAlignment
_bamAlignment
;
int
_bamChromId
;
//different from chromId, because BAM file may be in different order
...
...
src/utils/FileRecordTools/Records/Bed12Interval.cpp
View file @
9f2d8490
...
...
@@ -172,3 +172,23 @@ bool Bed12Interval::isNumericField(int fieldNum) {
}
}
int
Bed12Interval
::
getLength
(
bool
obeySplits
)
const
{
//only bed12 and BAM need to check splits
if
(
!
obeySplits
||
_blockCount
<=
0
)
{
return
_endPos
-
_startPos
;
}
else
{
int
length
=
0
;
//parse the blockSizes string.
char
numBuf
[
16
];
const
char
*
startPtr
=
_blockSizes
.
c_str
();
const
char
*
endPtr
=
startPtr
;
for
(
int
i
=
0
;
i
<
_blockCount
;
i
++
)
{
memset
(
numBuf
,
0
,
16
);
endPtr
=
strchr
(
endPtr
,
','
);
memcpy
(
numBuf
,
startPtr
,
endPtr
-
startPtr
);
length
+=
str2chrPos
(
numBuf
);
startPtr
=
++
endPtr
;
}
return
length
;
}
}
src/utils/FileRecordTools/Records/Bed12Interval.h
View file @
9f2d8490
...
...
@@ -55,6 +55,7 @@ public:
virtual
const
QuickString
&
getField
(
int
fieldNum
)
const
;
virtual
int
getNumFields
()
const
{
return
12
;
}
static
bool
isNumericField
(
int
fieldNum
);
int
getLength
(
bool
obeySplits
)
const
;
protected:
...
...
src/utils/FileRecordTools/Records/BlockMgr.cpp
View file @
9f2d8490
...
...
@@ -35,7 +35,7 @@ void BlockMgr::getBlocks(RecordKeyVector &keyList, bool &mustDelete)
switch
(
keyList
.
getKey
()
->
getType
())
{
case
FileRecordTypeChecker
::
BED12_RECORD_TYPE
:
getBlocksFromBed12
(
keyList
,
mustDelete
);
break
;
//not necessary after return, but here in case code is later modified.
break
;
case
FileRecordTypeChecker
::
BAM_RECORD_TYPE
:
getBlocksFromBam
(
keyList
,
mustDelete
);
...
...
@@ -174,7 +174,7 @@ int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hit
RecordKeyVector
hitBlocks
(
*
hitListIter
);
bool
deleteHitBlocks
=
false
;
getBlocks
(
hitBlocks
,
deleteHitBlocks
);
//get all blocks for the hit record.
int
hitBlockSumLength
=
getTotalBlockLength
(
hitBlocks
);
//get total len
t
gh of the bocks for the hitRecord.
int
hitBlockSumLength
=
getTotalBlockLength
(
hitBlocks
);
//get total leng
t
h of the bocks for the hitRecord.
int
totalHitOverlap
=
0
;
bool
hitHasOverlap
=
false
;
...
...
src/utils/FileRecordTools/Records/BlockMgr.h
View file @
9f2d8490
...
...
@@ -25,7 +25,6 @@ public:
BlockMgr
(
float
overlapFraction
=
1E-9
,
bool
hasReciprocal
=
false
);
~
BlockMgr
();
// Return value is the number of blocks this main record has been split into.
void
getBlocks
(
RecordKeyVector
&
keyList
,
bool
&
mustDelete
);
void
deleteBlocks
(
RecordKeyVector
&
keyList
);
...
...
src/utils/FileRecordTools/Records/Record.cpp
View file @
9f2d8490
...
...
@@ -271,3 +271,8 @@ void Record::print(FILE *fp, bool newline) const {
fprintf
(
fp
,
"%s"
,
buf
.
c_str
());
if
(
newline
)
fprintf
(
fp
,
"
\n
"
);
}
int
Record
::
getLength
(
bool
obeySplits
)
const
{
//only bed12 and BAM need to check splits
return
_endPos
-
_startPos
;
}
src/utils/FileRecordTools/Records/Record.h
View file @
9f2d8490
...
...
@@ -138,6 +138,7 @@ public:
bool
hasChrInChromName
()
const
;
bool
hasLeadingZeroInChromName
(
bool
chrKnown
=
false
)
const
;
virtual
int
getLength
(
bool
obeySplits
)
const
;
protected:
...
...
src/utils/FileRecordTools/Records/RecordKeyVector.cpp
View file @
9f2d8490
...
...
@@ -84,10 +84,12 @@ void RecordKeyVector::setKey(elemType key) {
}
void
RecordKeyVector
::
setVector
(
vecType
*
vec
)
{
_currPos
=
0
;
_recVec
=
vec
;
}
void
RecordKeyVector
::
clearVector
()
{
_currPos
=
0
;
_recVec
->
clear
();
}
...
...
src/utils/NewChromsweep/NewChromsweep.cpp
View file @
9f2d8490
...
...
@@ -261,7 +261,7 @@ bool NewChromSweep::nextRecord(bool query, int dbIdx) {
if
(
query
)
{
_currQueryRec
=
_queryFRM
->
getNextRecord
();
if
(
_currQueryRec
!=
NULL
)
{
_queryRecordsTotalLength
+=
(
unsigned
long
)(
_currQueryRec
->
get
EndPos
()
-
_currQueryRec
->
getStartPo
s
());
_queryRecordsTotalLength
+=
(
unsigned
long
)(
_currQueryRec
->
get
Length
(
_context
->
getObeySplit
s
())
)
;
_queryTotalRecords
++
;
return
true
;
}
...
...
@@ -270,7 +270,7 @@ bool NewChromSweep::nextRecord(bool query, int dbIdx) {
Record
*
rec
=
_dbFRMs
[
dbIdx
]
->
getNextRecord
();
_currDbRecs
[
dbIdx
]
=
rec
;
if
(
rec
!=
NULL
)
{
_databaseRecordsTotalLength
+=
(
unsigned
long
)(
rec
->
get
EndPos
()
-
rec
->
getStartPo
s
());
_databaseRecordsTotalLength
+=
(
unsigned
long
)(
rec
->
get
Length
(
_context
->
getObeySplit
s
())
)
;
_databaseTotalRecords
++
;
return
true
;
}
...
...
src/utils/general/ParseTools.h
View file @
9f2d8490
...
...
@@ -37,11 +37,6 @@ int str2chrPos(const QuickString &str);
template
<
class
T
>
void
int2str
(
int
number
,
T
&
buffer
,
bool
appendToBuf
=
false
)
{
int
maxVal
=
(
1
<<
31
)
-
1
;
if
(((
int
)(
abs
(
number
)))
>
maxVal
)
{
fprintf
(
stderr
,
"ERROR: number out of bounds.
\n
"
);
exit
(
1
);
}
register
int
useNum
=
number
;
if
(
useNum
==
0
)
{
if
(
appendToBuf
)
{
...
...
test/coverage/test-coverage.sh
View file @
9f2d8490
...
...
@@ -24,12 +24,440 @@ samtools view -Sb sam-w-del.sam > sam-w-del.bam 2>/dev/null
##################################################################
# Test three blocks without -split
##################################################################
#
echo " coverage.t1...\c"
#
echo \
#
"chr1 0 50
1
" > exp
#
$BT coverage -abam three_blocks.bam -b three_blocks_nomatch.bed > obs