Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
3010bf53
Commit
3010bf53
authored
May 26, 2015
by
Neil Kindlon
Browse files
Addressed BinTree memory blow-up; -m in fisher (w/ unit tests)
parent
60022b90
Changes
7
Hide whitespace changes
Inline
Side-by-side
src/utils/BinTree/BinTree.cpp
View file @
3010bf53
...
...
@@ -5,66 +5,20 @@
BinTree
::
BinTree
(
ContextIntersect
*
context
)
:
_context
(
context
),
_binOffsetsExtended
(
NULL
),
_showBinMetrics
(
false
),
_maxBinNumFound
(
0
)
{
_binOffsetsExtended
=
new
u
in
t32_t
[
NUM_BIN_LEVELS
];
memset
(
_binOffsetsExtended
,
0
,
NUM_BIN_LEVELS
*
sizeof
(
u
in
t32_t
));
_binOffsetsExtended
=
new
b
in
NumType
[
NUM_BIN_LEVELS
];
memset
(
_binOffsetsExtended
,
0
,
NUM_BIN_LEVELS
*
sizeof
(
b
in
NumType
));
//start at idx 1, because the memset above already initialized
//the first idx to zero, which is what we want.
for
(
u
in
t32_t
i
=
1
;
i
<
NUM_BIN_LEVELS
;
i
++
)
{
for
(
b
in
NumType
i
=
1
;
i
<
NUM_BIN_LEVELS
;
i
++
)
{
_binOffsetsExtended
[
i
]
=
_binOffsetsExtended
[
i
-
1
]
+
(
1
<<
((
NUM_BIN_LEVELS
-
i
-
1
)
*
3
));
}
}
BinTree
::~
BinTree
()
{
//TBD: pass all elements in tree back to FRM for proper cleanup/deletion
for
(
mainMapType
::
iterator
mainIter
=
_mainMap
.
begin
();
mainIter
!=
_mainMap
.
end
();
mainIter
++
)
{
allBinsType
bins
=
mainIter
->
second
;
if
(
bins
==
NULL
)
{
fprintf
(
stderr
,
"ERROR: In BinTree destructor: found chromosome with NULL bin array.
\n
"
);
continue
;
}
if
(
!
_showBinMetrics
)
{
//don't clean up bins when simply reporting metrics.
for
(
uint32_t
i
=
0
;
i
<
NUM_BINS
;
i
++
)
{
binType
bin
=
bins
[
i
];
if
(
bin
==
NULL
)
{
continue
;
}
for
(
innerListIterType
listIter
=
bin
->
begin
();
listIter
!=
bin
->
end
();
listIter
=
bin
->
next
())
{
const
Record
*
record
=
listIter
->
value
();
_context
->
getFile
(
record
->
getFileIdx
())
->
deleteRecord
(
record
);
}
delete
bin
;
bin
=
NULL
;
}
}
delete
[]
bins
;
bins
=
NULL
;
}
delete
[]
_binOffsetsExtended
;
if
(
_showBinMetrics
)
{
map
<
int
,
int
>
hitsHistogram
;
FILE
*
fp
=
fopen
(
"BinsHitFile.txt"
,
"w"
);
fprintf
(
fp
,
"The largest bin was %u
\n
"
,
_maxBinNumFound
);
fprintf
(
fp
,
"There were %d different bins hit by database.
\n
"
,
(
int
)
_binsHit
.
size
());
for
(
map
<
uint32_t
,
int
>::
iterator
binIter
=
_binsHit
.
begin
();
binIter
!=
_binsHit
.
end
();
binIter
++
)
{
uint32_t
binNum
=
binIter
->
first
;
int
numHits
=
binIter
->
second
;
fprintf
(
fp
,
"%u
\t
%d
\n
"
,
binNum
,
numHits
);
hitsHistogram
[
numHits
]
++
;
}
fclose
(
fp
);
fp
=
fopen
(
"BinHitsHistogram.txt"
,
"w"
);
fprintf
(
fp
,
"NumHits
\t
NumBins
\n
"
);
for
(
map
<
int
,
int
>::
iterator
histIter
=
hitsHistogram
.
begin
();
histIter
!=
hitsHistogram
.
end
();
histIter
++
)
{
fprintf
(
fp
,
"%d
\t
%d
\n
"
,
histIter
->
first
,
histIter
->
second
);
}
fclose
(
fp
);
}
}
void
BinTree
::
loadDB
()
...
...
@@ -93,9 +47,6 @@ void BinTree::loadDB()
void
BinTree
::
getHits
(
Record
*
record
,
RecordKeyVector
&
hitSet
)
{
if
(
_showBinMetrics
)
{
return
;
//don't care about query entries just yet.
}
if
(
record
->
isUnmapped
())
{
return
;
}
...
...
@@ -106,11 +57,11 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
return
;
}
u
in
t32_t
startBin
=
0
;
u
in
t32_t
endBin
=
0
;
b
in
NumType
startBin
=
0
;
b
in
NumType
endBin
=
0
;
u
in
t32_t
startPos
=
record
->
getStartPos
();
u
in
t32_t
endPos
=
record
->
getEndPos
();
b
in
NumType
startPos
=
record
->
getStartPos
();
b
in
NumType
endPos
=
record
->
getEndPos
();
startBin
=
(
startPos
>>
_binFirstShift
);
endBin
=
((
endPos
-
1
)
>>
_binFirstShift
);
...
...
@@ -124,25 +75,19 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
hits if it meets all of the user's requests, which include:
(a) overlap fractio, (b) strandedness, (c) reciprocal overlap
*/
for
(
u
in
t32_t
i
=
0
;
i
<
NUM_BIN_LEVELS
;
i
++
)
{
u
in
t32_t
offset
=
_binOffsetsExtended
[
i
];
for
(
u
in
t32_t
j
=
(
startBin
+
offset
);
j
<=
(
endBin
+
offset
);
j
++
)
{
for
(
b
in
NumType
i
=
0
;
i
<
NUM_BIN_LEVELS
;
i
++
)
{
b
in
NumType
offset
=
_binOffsetsExtended
[
i
];
for
(
b
in
NumType
j
=
(
startBin
+
offset
);
j
<=
(
endBin
+
offset
);
j
++
)
{
// move to the next bin if this one is empty
const
binType
&
bin
=
bins
[
j
];
if
(
bin
==
NULL
)
{
//no list of records in this bin.
continue
;
}
if
(
bin
->
empty
())
{
//bin has list, but it's empty.
//Actually, this should never happen, so throw an error.
fprintf
(
stderr
,
"ERROR: found empty list for bin %u of chromosome %s
\n
"
,
j
,
chr
.
c_str
());
allBinsType
::
const_iterator
allBinsIter
=
bins
.
find
(
j
);
if
(
allBinsIter
==
bins
.
end
())
{
continue
;
}
for
(
innerListIterType
listIter
=
bin
->
begin
();
listIter
!=
bin
->
end
();
listIter
=
bin
->
next
())
{
const
Record
*
dbRec
=
listIter
->
value
();
const
binType
&
bin
=
allBinsIter
->
second
;
for
(
binType
::
const_iterator
iter
=
bin
.
begin
();
iter
!=
bin
.
end
();
iter
++
)
{
const
Record
*
dbRec
=
*
iter
;
if
(
record
->
intersects
(
dbRec
,
_context
->
getSameStrand
(),
_context
->
getDiffStrand
(),
_context
->
getOverlapFraction
(),
_context
->
getReciprocal
()))
{
hitSet
.
push_back
(
dbRec
);
...
...
@@ -159,55 +104,31 @@ void BinTree::getHits(Record *record, RecordKeyVector &hitSet)
bool
BinTree
::
addRecordToTree
(
const
Record
*
record
)
{
// Get chr, bin.
allocate all bins and single bins as needed.
// Get chr, bin.
const
QuickString
&
chr
=
record
->
getChrName
();
uint32_t
startPos
=
(
uint32_t
)(
record
->
getStartPos
());
uint32_t
endPos
=
(
uint32_t
)(
record
->
getEndPos
());
//is this chr currently in the main map?
allBinsType
bins
=
NULL
;
mainMapType
::
iterator
mainIter
=
_mainMap
.
find
(
chr
);
if
(
mainIter
==
_mainMap
.
end
())
{
//this is a new chr NOT currently in the map.
bins
=
new
binType
[
NUM_BINS
];
memset
(
bins
,
0
,
NUM_BINS
*
sizeof
(
binType
));
_mainMap
[
chr
]
=
bins
;
}
else
{
bins
=
mainIter
->
second
;
}
uint32_t
binNum
=
getBin
(
startPos
,
endPos
);
if
(
_showBinMetrics
)
{
if
(
binNum
>
_maxBinNumFound
)
{
_maxBinNumFound
=
binNum
;
}
_binsHit
[
binNum
]
++
;
return
true
;
}
binNumType
startPos
=
(
binNumType
)(
record
->
getStartPos
());
binNumType
endPos
=
(
binNumType
)(
record
->
getEndPos
());
binNumType
binNum
=
getBin
(
startPos
,
endPos
);
if
(
binNum
<
0
||
binNum
>=
NUM_BINS
)
{
fprintf
(
stderr
,
"ERROR: Received illegal bin number %u from getBin call.
\n
"
,
binNum
);
return
false
;
}
binType
&
bin
=
bins
[
binNum
];
if
(
bin
==
NULL
)
{
bin
=
new
innerListType
;
}
bin
->
push_back
(
record
);
_mainMap
[
chr
][
binNum
].
push_back
(
record
);
return
true
;
}
uint32_t
BinTree
::
getBin
(
const
Record
*
record
)
const
{
return
getBin
((
uint32_t
)(
record
->
getStartPos
()),
(
uint32_t
)(
record
->
getEndPos
()));
BinTree
::
binNumType
BinTree
::
getBin
(
const
Record
*
record
)
const
{
return
getBin
((
binNumType
)(
record
->
getStartPos
()),
(
binNumType
)(
record
->
getEndPos
()));
}
u
in
t32_t
BinTree
::
getBin
(
u
in
t32_t
start
,
u
in
t32_t
end
)
const
{
B
in
Tree
::
binNumType
BinTree
::
getBin
(
b
in
NumType
start
,
b
in
NumType
end
)
const
{
--
end
;
start
>>=
_binFirstShift
;
end
>>=
_binFirstShift
;
for
(
u
in
t32_t
i
=
0
;
i
<
NUM_BIN_LEVELS
;
++
i
)
{
for
(
b
in
NumType
i
=
0
;
i
<
NUM_BIN_LEVELS
;
++
i
)
{
if
(
start
==
end
)
{
return
_binOffsetsExtended
[
i
]
+
start
;
}
...
...
src/utils/BinTree/BinTree.h
View file @
3010bf53
...
...
@@ -37,8 +37,9 @@ private:
//
// BIN HANDLING
//
static
const
uint32_t
NUM_BINS
=
37450
;
static
const
uint32_t
NUM_BIN_LEVELS
=
7
;
typedef
uint32_t
binNumType
;
static
const
binNumType
NUM_BINS
=
37450
;
static
const
binNumType
NUM_BIN_LEVELS
=
7
;
// bins range in size from 16kb to 512Mb
// Bin 0 spans 512Mbp, # Level 1
...
...
@@ -47,25 +48,30 @@ private:
// Bins 73-584 span 1Mbp # Level 4
// Bins 585-4680 span 128Kbp # Level 5
// Bins 4681-37449 span 16Kbp # Level 6
uint32_t
*
_binOffsetsExtended
;
static
const
uint32_t
_binFirstShift
=
14
;
/* How much to shift to get to finest bin. */
static
const
uint32_t
_binNextShift
=
3
;
/* How much to shift to get to next larger bin. */
typedef
RecordList
innerListType
;
typedef
const
RecordListNode
*
innerListIterType
;
typedef
innerListType
*
binType
;
typedef
binType
*
allBinsType
;
typedef
QuickString
mainKeyType
;
typedef
map
<
mainKeyType
,
allBinsType
>
mainMapType
;
binNumType
*
_binOffsetsExtended
;
static
const
binNumType
_binFirstShift
=
14
;
/* How much to shift to get to finest bin. */
static
const
binNumType
_binNextShift
=
3
;
/* How much to shift to get to next larger bin. */
// typedef RecordList innerListType;
// typedef const RecordListNode * innerListIterType;
// typedef innerListType * binType;
// typedef binType * allBinsType;
// typedef QuickString mainKeyType;
// typedef map<mainKeyType, allBinsType> mainMapType;
// mainMapType _mainMap;
typedef
vector
<
const
Record
*>
binType
;
typedef
map
<
binNumType
,
binType
>
allBinsType
;
//for each bin number, have a RecordList
typedef
map
<
QuickString
,
allBinsType
>
mainMapType
;
//for each chrom, a map of bin num to RecordLists.
mainMapType
_mainMap
;
bool
_showBinMetrics
;
u
in
t32_t
_maxBinNumFound
;
map
<
u
in
t32_t
,
int
>
_binsHit
;
b
in
NumType
_maxBinNumFound
;
map
<
b
in
NumType
,
int
>
_binsHit
;
bool
addRecordToTree
(
const
Record
*
);
u
in
t32_t
getBin
(
u
in
t32_t
start
,
u
in
t32_t
end
)
const
;
u
in
t32_t
getBin
(
const
Record
*
record
)
const
;
b
in
NumType
getBin
(
b
in
NumType
start
,
b
in
NumType
end
)
const
;
b
in
NumType
getBin
(
const
Record
*
record
)
const
;
};
...
...
src/utils/Contexts/ContextFisher.cpp
View file @
3010bf53
...
...
@@ -7,7 +7,8 @@
ContextFisher
::
ContextFisher
()
{
setSortedInput
(
true
);
setUseMergedIntervals
(
false
);
setUseMergedIntervals
(
false
);
//by default,
//intervals are merged in Jaccard, but unmerged in fisher.
}
ContextFisher
::~
ContextFisher
()
{
...
...
@@ -21,45 +22,23 @@ bool ContextFisher::parseCmdArgs(int argc, char **argv, int skipFirstArgs)
else
if
(
strcmp
(
_argv
[
_i
],
"-exclude"
)
==
0
)
{
if
(
!
handle_exclude
())
return
false
;
}
else
if
(
strcmp
(
_argv
[
_i
],
"-m"
)
==
0
)
{
if
(
!
handle_m
())
return
false
;
}
}
return
ContextIntersect
::
parseCmdArgs
(
argc
,
argv
,
_skipFirstArgs
);
}
bool
ContextFisher
::
isValidState
()
{
if
(
!
Context
Intersect
::
isValidState
())
{
if
(
!
Context
Jaccard
::
isValidState
())
{
return
false
;
}
// Tests for stranded merge
//
if
(
_desiredStrand
!=
FileRecordMergeMgr
::
ANY_STRAND
)
{
// requested stranded merge
for
(
int
i
=
0
;
i
<
getNumInputFiles
();
i
++
)
{
// make sure file has strand.
if
(
!
getFile
(
i
)
->
recordsHaveStrand
())
{
_errorMsg
=
"
\n
***** ERROR: stranded merge requested, but input file "
;
_errorMsg
+=
getInputFileName
(
i
);
_errorMsg
+=
" does not have strands. *****"
;
return
false
;
}
//make sure file is not VCF.
if
(
getFile
(
1
)
->
getFileType
()
==
FileRecordTypeChecker
::
VCF_FILE_TYPE
)
{
_errorMsg
=
"
\n
***** ERROR: stranded merge not supported for VCF file "
;
_errorMsg
+=
getInputFileName
(
i
);
_errorMsg
+=
". *****"
;
return
false
;
}
}
}
if
(
_genomeFile
==
NULL
){
_errorMsg
=
"
\n
ERROR*****: specify -g genome file*****
\n
"
;
return
false
;
}
//column operations not allowed with BAM input
if
(
hasColumnOpsMethods
()
&&
getFile
(
0
)
->
getFileType
()
==
FileRecordTypeChecker
::
BAM_FILE_TYPE
)
{
_errorMsg
=
"
\n
***** ERROR: stranded merge not supported for VCF files. *****"
;
return
false
;
}
return
true
;
}
...
...
@@ -79,3 +58,11 @@ bool ContextFisher::handle_exclude()
return
true
;
}
bool
ContextFisher
::
handle_m
()
{
setUseMergedIntervals
(
true
);
markUsed
(
_i
-
_skipFirstArgs
);
_i
++
;
return
true
;
}
src/utils/Contexts/ContextFisher.h
View file @
3010bf53
...
...
@@ -24,6 +24,7 @@ public:
protected:
string
_excludeFile
;
bool
handle_exclude
();
bool
handle_m
();
};
...
...
src/utils/Contexts/ContextJaccard.cpp
View file @
3010bf53
...
...
@@ -58,12 +58,6 @@ bool ContextJaccard::isValidState()
}
}
}
//column operations not allowed with BAM input
if
(
hasColumnOpsMethods
()
&&
getFile
(
0
)
->
getFileType
()
==
FileRecordTypeChecker
::
BAM_FILE_TYPE
)
{
_errorMsg
=
"
\n
***** ERROR: stranded merge not supported for VCF files. *****"
;
return
false
;
}
return
true
;
}
...
...
test/fisher/a_merge.bed
0 → 100644
View file @
3010bf53
chr1 10 20
chr1 12 19
chr1 30 40
chr1 51 52
test/fisher/test-fisher.sh
View file @
3010bf53
...
...
@@ -51,3 +51,45 @@ left right two-tail ratio
$BT
fisher
-a
a.bed
-b
b.bed
-g
t.60.genome
>
obs
check obs exp
rm
obs exp
echo
" fisher.t3...
\c
"
echo
\
"# Number of query intervals: 4
# Number of db intervals: 2
# Number of overlaps: 3
# Number of possible intervals (estimated): 4
# phyper(3 - 1, 4, 4 - 4, 2, lower.tail=F)
# Contingency Table Of Counts
#_________________________________________
# | in -b | not in -b |
# in -a | 3 | 1 |
# not in -a | 0 | 0 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1 1 1 -nan"
>
exp
$BT
fisher
-a
a_merge.bed
-b
b.bed
-g
t.60.genome
>
obs
check obs exp
rm
obs exp
echo
" fisher.t4...
\c
"
echo
\
"# Number of query intervals: 3
# Number of db intervals: 2
# Number of overlaps: 2
# Number of possible intervals (estimated): 4
# phyper(2 - 1, 3, 4 - 3, 2, lower.tail=F)
# Contingency Table Of Counts
#_________________________________________
# | in -b | not in -b |
# in -a | 2 | 1 |
# not in -a | 0 | 1 |
#_________________________________________
# p-values for fisher's exact test
left right two-tail ratio
1 0.5 1 inf"
>
exp
$BT
fisher
-a
a_merge.bed
-b
b.bed
-g
t.60.genome
-m
>
obs
check obs exp
rm
obs exp
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment