Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
4c7213c5
Commit
4c7213c5
authored
Dec 05, 2014
by
Neil Kindlon
Browse files
Detect differently ordered files in chromsweep and closest. Merged Fisher changes.
parent
24803222
Changes
72
Expand all
Hide whitespace changes
Inline
Side-by-side
src/closestFile/closestFile.cpp
View file @
4c7213c5
...
...
@@ -36,6 +36,10 @@ bool ClosestFile::getClosest() {
}
}
if
(
!
_context
->
hasGenomeFile
())
{
sweep
.
closeOut
(
true
);
}
return
true
;
}
src/fisher/Fisher.cpp
View file @
4c7213c5
...
...
@@ -9,9 +9,21 @@ Fisher::Fisher(ContextFisher *context)
_unionVal
(
0
),
_numIntersections
(
0
),
_queryLen
(
0
),
_dbLen
(
0
)
_dbLen
(
0
),
_queryCounts
(
0
),
_dbCounts
(
0
),
_overlapCounts
(
0
)
{
_blockMgr
=
new
BlockMgr
(
_context
->
getOverlapFraction
(),
_context
->
getReciprocal
());
_haveExclude
=
false
;
if
(
!
(
_context
->
getExcludeFile
().
empty
())){
string
ex
=
_context
->
getExcludeFile
();
exclude
=
new
BedFile
(
ex
);
exclude
->
loadBedFileIntoMergedMap
();
_haveExclude
=
true
;
}
}
Fisher
::~
Fisher
(
void
)
{
...
...
@@ -26,29 +38,41 @@ bool Fisher::calculate() {
}
// header
cout
<<
"# Contingency Table"
<<
endl
;
// for fisher's exact test, we need the contingency table
// XXXXXXXX | not in A | in A
// not in B | n11: in neither | n12: only in A
// in B | n21: only in B | n22: in A & B
//
double
left
,
right
,
two
;
long
long
genomeSize
=
_context
->
getGenomeFile
()
->
getGenomeSize
();
// bases covered by neither a nor b
long
long
n11
=
genomeSize
-
_queryLen
-
_dbLen
+
_intersectionVal
;
// bases covered only by -b
long
long
n12
=
_dbLen
-
_intersectionVal
;
// bases covered only by -a
long
long
n21
=
_queryLen
-
_intersectionVal
;
// bases covered by both
long
long
n22
=
_intersectionVal
;
if
(
_haveExclude
){
genomeSize
-=
exclude
->
getTotalFlattenedLength
();
}
// bases covered by neither
long
long
n22_full_bases
=
genomeSize
;
//long long n22_bases = genomeSize - _queryLen - _dbLen + _intersectionVal;
long
double
dMean
=
1.0
+
_dbLen
/
(
long
double
)
_dbCounts
;
long
double
qMean
=
1.0
+
_queryLen
/
(
long
double
)
_queryCounts
;
// heursitic, but seems to work quite well -- better than doing more intuitive sum then divide.
long
double
bMean
=
(
qMean
+
dMean
);
//bMean = (_unionVal + 2.0 * _intersectionVal) / (long double)(_dbCounts + _queryCounts);
long
long
n11
=
(
long
)
_overlapCounts
;
// this could be < 0 because multiple overlaps
long
long
n12
=
(
long
)
max
(
0L
,
(
long
)
_queryCounts
-
(
long
)
_overlapCounts
);
long
long
n21
=
max
(
0L
,
(
long
)(
_dbCounts
-
_overlapCounts
));
long
long
n22_full
=
max
(
n21
+
n21
+
n11
,
(
long
long
)(
n22_full_bases
/
bMean
));
long
long
n22
=
max
(
0L
,
(
long
)(
n22_full
-
n12
-
n21
-
n11
));
printf
(
"# Number of query intervals: %lu
\n
"
,
_queryCounts
);
printf
(
"# Number of db intervals: %lu
\n
"
,
_dbCounts
);
printf
(
"# Number of overlaps: %lu
\n
"
,
_overlapCounts
);
printf
(
"# Number of possible intervals (estimated): %lld
\n
"
,
n22_full
);
printf
(
"# phyper(%lld - 1, %lu, %lld - %lu, %lu, lower.tail=F)
\n
"
,
n11
,
_queryCounts
,
n22_full
,
_queryCounts
,
_dbCounts
);
cout
<<
"# Contingency Table Of Counts"
<<
endl
;
printf
(
"#_________________________________________
\n
"
);
printf
(
"# | %-12s | %-12s |
\n
"
,
"
not
in -b"
,
"in -b"
);
printf
(
"#
not
in -a | %-12lld | %-12lld |
\n
"
,
n11
,
n12
);
printf
(
"#
in -a | %-12lld | %-12lld |
\n
"
,
n21
,
n22
);
printf
(
"#
| %-12s | %-12s |
\n
"
,
" in -b"
,
"
not
in -b"
);
printf
(
"#
in -a | %-12lld | %-12lld |
\n
"
,
n11
,
n12
);
printf
(
"#
not
in -a | %-12lld | %-12lld |
\n
"
,
n21
,
n22
);
printf
(
"#_________________________________________
\n
"
);
kt_fisher_exact
(
n11
,
n12
,
n21
,
n22
,
&
left
,
&
right
,
&
two
);
...
...
@@ -66,6 +90,7 @@ bool Fisher::getFisher() {
if
(
!
sweep
.
init
())
{
return
false
;
}
RecordKeyVector
hitSet
;
while
(
sweep
.
next
(
hitSet
))
{
if
(
_context
->
getObeySplits
())
{
...
...
@@ -82,6 +107,9 @@ bool Fisher::getFisher() {
_queryLen
=
sweep
.
getQueryTotalRecordLength
();
_dbLen
=
sweep
.
getDatabaseTotalRecordLength
();
_queryCounts
=
sweep
.
getQueryTotalRecords
();
_dbCounts
=
sweep
.
getDatabaseTotalRecords
();
_unionVal
=
_queryLen
+
_dbLen
;
return
true
;
}
...
...
@@ -93,10 +121,14 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
int
keyStart
=
key
->
getStartPos
();
int
keyEnd
=
key
->
getEndPos
();
_overlapCounts
+=
recList
.
size
();
_qsizes
.
push_back
((
keyEnd
-
keyStart
));
int
hitIdx
=
0
;
for
(
RecordKeyVector
::
const_iterator_type
iter
=
recList
.
begin
();
iter
!=
recList
.
end
();
iter
=
recList
.
next
())
{
int
maxStart
=
max
((
*
iter
)
->
getStartPos
(),
keyStart
);
int
minEnd
=
min
((
*
iter
)
->
getEndPos
(),
keyEnd
);
_qsizes
.
push_back
((
int
)(
minEnd
-
maxStart
));
if
(
_context
->
getObeySplits
())
{
intersection
+=
_blockMgr
->
getOverlapBases
(
hitIdx
);
hitIdx
++
;
...
...
@@ -107,4 +139,3 @@ unsigned long Fisher::getTotalIntersection(RecordKeyVector &recList)
_numIntersections
+=
(
int
)
recList
.
size
();
return
intersection
;
}
src/fisher/Fisher.h
View file @
4c7213c5
#ifndef FISHER_H
#define FISHER_H
#include "bedFile.h"
#include "bedFilePE.h"
#include "ContextFisher.h"
class
BlockMgr
;
...
...
@@ -19,11 +22,17 @@ private:
BlockMgr
*
_blockMgr
;
unsigned
long
_intersectionVal
;
unsigned
long
_unionVal
;
bool
_haveExclude
;
int
_numIntersections
;
unsigned
long
_queryLen
;
unsigned
long
_dbLen
;
unsigned
long
_queryCounts
;
unsigned
long
_dbCounts
;
unsigned
long
_overlapCounts
;
bool
getFisher
();
BedFile
*
exclude
;
vector
<
int
>
_qsizes
;
unsigned
long
getTotalIntersection
(
RecordKeyVector
&
hits
);
};
...
...
src/fisher/Makefile
View file @
4c7213c5
...
...
@@ -9,27 +9,32 @@ BIN_DIR = ../../bin/
INCLUDES
=
-I
$(UTILITIES_DIR)
/Contexts/
\
-I
$(UTILITIES_DIR)
/general/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/GenomeFile/
\
-I
$(UTILITIES_DIR)
/BamTools/include
\
-I
$(UTILITIES_DIR)
/BamTools/src
\
-I
$(UTILITIES_DIR)
/BlockedIntervals
\
-I
$(UTILITIES_DIR)
/BamTools-Ancillary
\
-I
$(UTILITIES_DIR)
/FileRecordTools/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/FileReaders/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/Records/
\
-I
$(UTILITIES_DIR)
/RecordOutputMgr/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/bedFile/
\
-I
$(UTILITIES_DIR)
/bedFilePE/
\
-I
$(UTILITIES_DIR)
/GenomeFile/
\
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/BamTools/include
\
-I
$(UTILITIES_DIR)
/BamTools/src
\
-I
$(UTILITIES_DIR)
/BlockedIntervals
\
-I
$(UTILITIES_DIR)
/BamTools-Ancillary
\
-I
$(UTILITIES_DIR)
/FileRecordTools/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/FileReaders/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/Records/
\
-I
$(UTILITIES_DIR)
/RecordOutputMgr/
\
-I
$(UTILITIES_DIR)
/KeyListOps/
\
-I
$(UTILITIES_DIR)
/NewChromsweep
\
-I
$(UTILITIES_DIR)
/VectorOps
\
-I
$(UTILITIES_DIR)
/NewChromsweep
\
-I
$(UTILITIES_DIR)
/VectorOps
\
-I
.
\
-I
$(UTILITIES_DIR)
/BinTree
\
-I
$(UTILITIES_DIR)
/version/
-I
$(UTILITIES_DIR)
/BinTree
\
-I
$(UTILITIES_DIR)
/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES
=
fisherMain.cpp Fisher.cpp Fisher.h kfunc.c
SOURCES
=
fisherMain.cpp Fisher.cpp Fisher.h kfunc.c
\
../utils/Contexts/ContextFisher.cpp
\
../utils/Contexts/ContextFisher.h
OBJECTS
=
fisherMain.o Fisher.o
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
PROGRAM
=
Fisher
...
...
src/fisher/fisherMain.cpp
View file @
4c7213c5
...
...
@@ -32,13 +32,9 @@ void fisher_help(void) {
cerr
<<
"
\n
Tool: bedtools fisher (aka fisher)"
<<
endl
;
cerr
<<
"Version: "
<<
VERSION
<<
"
\n
"
;
cerr
<<
"Summary: Calculate Fisher statistic b/w two feature files."
<<
endl
<<
"
\t
Fisher is the length of the intersection over the union."
<<
endl
<<
"
\t
Values range from 0 (no intersection) to 1 (self intersection)."
<<
endl
<<
endl
;
cerr
<<
"Usage: "
<<
PROGRAM_NAME
<<
" [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome>"
<<
endl
<<
endl
;
cerr
<<
"Usage: "
<<
PROGRAM_NAME
<<
" [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf> -g <genome
file
>"
<<
endl
<<
endl
;
cerr
<<
"Options: "
<<
endl
;
...
...
@@ -47,6 +43,9 @@ void fisher_help(void) {
cerr
<<
"
\t\t
- Default is 1E-9 (i.e., 1bp)."
<<
endl
;
cerr
<<
"
\t\t
- FLOAT (e.g. 0.50)"
<<
endl
<<
endl
;
cerr
<<
"
\t
-m
\t
"
<<
"Merge overlapping intervals before"
<<
endl
;
cerr
<<
"
\t\t
- looking at overlap."
<<
endl
<<
endl
;
cerr
<<
"
\t
-r
\t
"
<<
"Require that the fraction overlap be reciprocal for A and B."
<<
endl
;
cerr
<<
"
\t\t
- In other words, if -f is 0.90 and -r is used, this requires"
<<
endl
;
cerr
<<
"
\t\t
that B overlap 90% of A and A _also_ overlaps 90% of B."
<<
endl
<<
endl
;
...
...
src/intersectFile/intersectFile.cpp
View file @
4c7213c5
...
...
@@ -69,6 +69,9 @@ bool FileIntersect::processSortedFiles()
processHits
(
hitSet
);
}
}
if
(
!
_context
->
hasGenomeFile
())
{
sweep
.
closeOut
(
true
);
}
return
true
;
}
...
...
src/utils/BinTree/BinTree.cpp
View file @
4c7213c5
...
...
@@ -80,6 +80,8 @@ void BinTree::loadDB()
continue
;
}
_context
->
testNameConventions
(
record
);
if
(
!
addRecordToTree
(
record
))
{
fprintf
(
stderr
,
"ERROR: Unable to add record to tree.
\n
"
);
databaseFile
->
close
();
...
...
src/utils/Contexts/ContextBase.cpp
View file @
4c7213c5
...
...
@@ -57,7 +57,9 @@ ContextBase::ContextBase()
_desiredStrand
(
FileRecordMergeMgr
::
ANY_STRAND
),
_maxDistance
(
0
),
_useMergedIntervals
(
false
),
_reportPrecision
(
-
1
)
_reportPrecision
(
-
1
),
_allFilesHaveChrInChromNames
(
UNTESTED
),
_allFileHaveLeadingZeroInChromNames
(
UNTESTED
)
{
_programNames
[
"intersect"
]
=
INTERSECT
;
...
...
@@ -599,3 +601,73 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
return
true
;
}
void
ContextBase
::
testNameConventions
(
const
Record
*
record
)
{
int
fileIdx
=
record
->
getFileIdx
();
//
// First test whether chr in chrom names match
//
bool
hasChr
=
record
->
hasChrInChromName
();
testType
testChrVal
=
fileHasChrInChromNames
(
fileIdx
);
if
(
testChrVal
==
UNTESTED
)
{
_fileHasChrInChromNames
[
fileIdx
]
=
hasChr
?
YES
:
NO
;
}
else
if
((
testChrVal
==
YES
&&
!
hasChr
)
||
(
testChrVal
==
NO
&&
hasChr
))
{
fprintf
(
stderr
,
"ERROR: File %s has inconsistent naming convention for record:
\n
"
,
_fileNames
[
fileIdx
].
c_str
());
record
->
print
(
stderr
,
true
);
exit
(
1
);
}
if
((
_allFilesHaveChrInChromNames
==
YES
&&
!
hasChr
)
||
(
_allFilesHaveChrInChromNames
==
NO
&&
hasChr
))
{
fprintf
(
stderr
,
"ERROR: File %s has a record where naming convention is inconsistent with other files:
\n
"
,
_fileNames
[
fileIdx
].
c_str
());
record
->
print
(
stderr
,
true
);
exit
(
1
);
}
if
(
_allFilesHaveChrInChromNames
==
UNTESTED
)
{
_allFilesHaveChrInChromNames
=
hasChr
?
YES
:
NO
;
}
//
// Now test whether leading zero in chrom names match
//
bool
zeroVal
=
record
->
hasLeadingZeroInChromName
();
testChrVal
=
fileHasLeadingZeroInChromNames
(
fileIdx
);
if
(
testChrVal
==
UNTESTED
)
{
_fileHasLeadingZeroInChromNames
[
fileIdx
]
=
zeroVal
?
YES
:
NO
;
}
else
if
((
testChrVal
==
YES
&&
!
zeroVal
)
||
(
testChrVal
==
NO
&&
zeroVal
))
{
fprintf
(
stderr
,
"ERROR: File %s has inconsistent naming convention (leading zero) for record:
\n
"
,
_fileNames
[
fileIdx
].
c_str
());
record
->
print
(
stderr
,
true
);
exit
(
1
);
}
if
((
_allFileHaveLeadingZeroInChromNames
==
YES
&&
!
zeroVal
)
||
(
_allFileHaveLeadingZeroInChromNames
==
NO
&&
zeroVal
))
{
fprintf
(
stderr
,
"ERROR: File %s has a record where naming convention (leading zero) is inconsistent with other files:
\n
"
,
_fileNames
[
fileIdx
].
c_str
());
record
->
print
(
stderr
,
true
);
exit
(
1
);
}
if
(
_allFileHaveLeadingZeroInChromNames
==
UNTESTED
)
{
_allFileHaveLeadingZeroInChromNames
=
zeroVal
?
YES
:
NO
;
}
}
ContextBase
::
testType
ContextBase
::
fileHasChrInChromNames
(
int
fileIdx
)
{
conventionType
::
iterator
iter
=
_fileHasChrInChromNames
.
find
(
fileIdx
);
if
(
iter
==
_fileHasChrInChromNames
.
end
())
{
return
UNTESTED
;
}
return
iter
->
second
;
}
ContextBase
::
testType
ContextBase
::
fileHasLeadingZeroInChromNames
(
int
fileIdx
)
{
conventionType
::
iterator
iter
=
_fileHasLeadingZeroInChromNames
.
find
(
fileIdx
);
if
(
iter
==
_fileHasLeadingZeroInChromNames
.
end
())
{
return
UNTESTED
;
}
return
iter
->
second
;
}
src/utils/Contexts/ContextBase.h
View file @
4c7213c5
...
...
@@ -145,6 +145,9 @@ public:
//methods applicable only to column operations.
int
getReportPrecision
()
const
{
return
_reportPrecision
;
}
void
testNameConventions
(
const
Record
*
);
protected:
PROGRAM_TYPE
_program
;
...
...
@@ -230,6 +233,16 @@ protected:
char
**
_argv
;
int
_i
;
//track whether each file has the letters chr in their chrom names.
//this is needed for enforcing consistent naming conventions across
//multiple files, and auto-detecting errors when they're not followed.
typedef
enum
{
YES
,
NO
,
UNTESTED
}
testType
;
typedef
map
<
int
,
testType
>
conventionType
;
conventionType
_fileHasChrInChromNames
;
// as above, but check whether first digit to appear in
// a chrom name is a zero.
conventionType
_fileHasLeadingZeroInChromNames
;
static
const
int
MIN_ALLOWED_BUF_SIZE
=
8
;
virtual
bool
handle_bed
();
...
...
@@ -255,6 +268,11 @@ protected:
bool
handle_prec
();
bool
parseIoBufSize
(
QuickString
bufStr
);
testType
fileHasChrInChromNames
(
int
fileIdx
);
testType
fileHasLeadingZeroInChromNames
(
int
fileIdx
);
testType
_allFilesHaveChrInChromNames
;
testType
_allFileHaveLeadingZeroInChromNames
;
};
#endif
/* CONTEXTBASE_H_ */
src/utils/FileRecordTools/Records/Record.cpp
View file @
4c7213c5
...
...
@@ -245,3 +245,35 @@ const QuickString &Record::getField(int fieldNum) const
<<
endl
<<
"*****"
<<
endl
;
exit
(
1
);
}
bool
Record
::
hasChrInChromName
()
const
{
const
char
*
str
=
_chrName
.
c_str
();
//if the chrom name has at least 3 characters,
//and the first 3 are c, h, r, case-insensitive,
//return true. Otherwise, return false.
return
((
_chrName
.
size
()
>=
3
)
&&
(
str
[
0
]
==
'c'
||
str
[
0
]
==
'C'
)
&&
(
str
[
1
]
==
'h'
||
str
[
1
]
==
'H'
)
&&
(
str
[
2
]
==
'r'
||
str
[
2
]
==
'R'
));
}
bool
Record
::
hasLeadingZeroInChromName
()
const
{
const
char
*
str
=
_chrName
.
c_str
();
size_t
i
=
0
;
while
(
i
<
_chrName
.
size
()
&&
!
isdigit
(
str
[
i
]))
{
if
(
str
[
i
]
==
'_'
)
return
false
;
//ignore
//leading zero after underscore, as some of the
//random and hap chroms are names that way.
i
++
;
}
return
(
i
<
_chrName
.
size
()
&&
str
[
i
]
==
'0'
);
}
void
Record
::
print
(
FILE
*
fp
,
bool
newline
)
const
{
QuickString
buf
;
print
(
buf
);
fprintf
(
fp
,
"%s"
,
buf
.
c_str
());
if
(
newline
)
fprintf
(
fp
,
"
\n
"
);
}
src/utils/FileRecordTools/Records/Record.h
View file @
4c7213c5
...
...
@@ -33,6 +33,7 @@ public:
virtual
void
print
(
QuickString
&
outBuf
)
const
{}
virtual
void
print
(
QuickString
&
outBuf
,
int
start
,
int
end
)
const
{}
virtual
void
print
(
QuickString
&
outBuf
,
const
QuickString
&
start
,
const
QuickString
&
end
)
const
{}
virtual
void
print
(
FILE
*
fp
,
bool
newline
=
false
)
const
;
virtual
void
printNull
(
QuickString
&
outBuf
)
const
{}
friend
ostream
&
operator
<<
(
ostream
&
out
,
const
Record
&
record
);
...
...
@@ -135,6 +136,9 @@ public:
// virtual static bool isNumericField(int fieldNum) const = 0;
bool
hasChrInChromName
()
const
;
bool
hasLeadingZeroInChromName
()
const
;
protected:
virtual
~
Record
();
//by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record.
...
...
src/utils/NewChromsweep/CloseSweep.cpp
View file @
4c7213c5
...
...
@@ -18,7 +18,7 @@ CloseSweep::CloseSweep(ContextClosest *context)
_overlapRecs
.
resize
(
_numDBs
,
NULL
);
_minUpstreamDist
.
resize
(
_numDBs
,
INT_MAX
);
_minDownstreamDist
.
resize
(
_numDBs
,
INT_MAX
);
_maxPrevLeftClosestEndPos
.
resize
(
_numDBs
,
0
);
_maxPrevLeftClosestEndPos
.
resize
(
_numDBs
,
0
);
for
(
int
i
=
0
;
i
<
_numDBs
;
i
++
)
{
_minUpstreamRecs
[
i
]
=
new
distRecVecType
();
...
...
@@ -36,8 +36,14 @@ CloseSweep::~CloseSweep(void) {
}
}
bool
CloseSweep
::
init
()
{
bool
retVal
=
NewChromSweep
::
init
();
_runToQueryEnd
=
true
;
return
retVal
;
}
void
CloseSweep
::
masterScan
(
RecordKeyVector
&
retList
)
{
//first clear out everything from the previous scan
if
(
_context
->
reportDistance
())
{
_finalDistances
.
clear
();
}
...
...
@@ -50,11 +56,12 @@ void CloseSweep::masterScan(RecordKeyVector &retList) {
for
(
int
i
=
0
;
i
<
_numDBs
;
i
++
)
{
//first clear out everything from the previous scan
_minUpstreamRecs
[
i
]
->
clear
();
_minDownstreamRecs
[
i
]
->
clear
();
_overlapRecs
[
i
]
->
clear
();
if
(
dbFinished
(
i
)
||
chromChange
(
i
,
retList
))
{
if
(
dbFinished
(
i
)
||
chromChange
(
i
,
retList
,
true
))
{
continue
;
}
else
{
...
...
@@ -251,9 +258,21 @@ CloseSweep::rateOvlpType CloseSweep::considerRecord(const Record *cacheRec, int
void
CloseSweep
::
finalizeSelections
(
int
dbIdx
,
RecordKeyVector
&
retList
)
{
//check all actual overlaps, as well as upstream and downstream hits.
// if all of these are empty, return.
const
vector
<
const
Record
*>
&
overlapRecs
=
(
*
(
_overlapRecs
[
dbIdx
]));
const
vector
<
const
Record
*>
&
upRecs
=
(
*
(
_minUpstreamRecs
[
dbIdx
]));
const
vector
<
const
Record
*>
&
downRecs
=
(
*
(
_minDownstreamRecs
[
dbIdx
]));
if
(
overlapRecs
.
empty
()
&&
upRecs
.
empty
()
&&
downRecs
.
empty
())
{
return
;
}
// If there are actual overlaps, only report those, then stop.
ContextClosest
::
tieModeType
tieMode
=
_context
->
getTieMode
();
const
vector
<
const
Record
*>
&
overlapRecs
=
(
*
(
_overlapRecs
[
dbIdx
]));
if
(
!
overlapRecs
.
empty
())
{
if
(
tieMode
==
ContextClosest
::
FIRST_TIE
)
{
retList
.
push_back
(
overlapRecs
[
0
]);
...
...
@@ -272,8 +291,6 @@ void CloseSweep::finalizeSelections(int dbIdx, RecordKeyVector &retList) {
}
int
upStreamDist
=
_minUpstreamDist
[
dbIdx
];
int
downStreamDist
=
_minDownstreamDist
[
dbIdx
];
const
vector
<
const
Record
*>
&
upRecs
=
(
*
(
_minUpstreamRecs
[
dbIdx
]));
const
vector
<
const
Record
*>
&
downRecs
=
(
*
(
_minDownstreamRecs
[
dbIdx
]));
if
(
abs
(
upStreamDist
)
<
abs
(
downStreamDist
))
{
if
(
tieMode
==
ContextClosest
::
FIRST_TIE
)
{
...
...
@@ -377,36 +394,51 @@ void CloseSweep::checkMultiDbs(RecordKeyVector &retList) {
}
}
bool
CloseSweep
::
chromChange
(
int
dbIdx
,
RecordKeyVector
&
retList
)
bool
CloseSweep
::
chromChange
(
int
dbIdx
,
RecordKeyVector
&
retList
,
bool
wantScan
)
{
// if (_currQueryRec->getChrName() == "chr1_gl000191_random" || (_currDbRecs[dbIdx] != NULL && _currDbRecs[dbIdx]->getChrName() == "chr1_gl000191_random")) {
// printf("Break point here.\n");
// }
const
Record
*
dbRec
=
_currDbRecs
[
dbIdx
];
bool
haveQuery
=
_currQueryRec
!=
NULL
;
bool
haveDB
=
dbRec
!=
NULL
;
\
if
(
haveQuery
&&
_currQueryChromName
!=
_prevQueryChromName
)
{
_context
->
testNameConventions
(
_currQueryRec
);
testChromOrder
(
_currQueryRec
);
}
if
(
haveDB
)
{
_context
->
testNameConventions
(
dbRec
);
testChromOrder
(
dbRec
);
}
// the files are on the same chrom
if
(
_currDbRecs
[
dbIdx
]
==
NULL
||
_currQueryRec
->
sameChrom
(
_currDbRecs
[
dbIdx
]
))
{
if
(
haveQuery
&&
(
!
haveDB
||
_currQueryRec
->
sameChrom
(
dbRec
)
))
{
//if this is the first time the query's chrom is ahead of the chrom that was in this cache,
//then we have to clear the cache.
if
(
!
_caches
[
dbIdx
].
empty
()
&&
_currQueryRec
->
c
hromAfter
(
_caches
[
dbIdx
].
begin
()
->
value
()))
{
if
(
!
_caches
[
dbIdx
].
empty
()
&&
queryC
hromAfter
DbRec
(
_caches
[
dbIdx
].
begin
()
->
value
()))
{
clearCache
(
dbIdx
);
_maxPrevLeftClosestEndPos
[
dbIdx
]
=
0
;
}
return
false
;
}
if
(
_currDbRecs
[
dbIdx
]
->
chromAfter
(
_currQueryRec
)
&&
(
!
_caches
[
dbIdx
].
empty
()
&&
(
_caches
[
dbIdx
].
begin
()
->
value
()
->
sameChrom
(
_currQueryRec
))))
{
if
(
haveDB
&&
haveQuery
&&
dbRecAfterQueryChrom
(
dbRec
)
&&
(
!
_caches
[
dbIdx
].
empty
()
&&
(
_caches
[
dbIdx
].
begin
()
->
value
()
->
sameChrom
(
_currQueryRec
))))
{
//the newest DB record's chrom is ahead of the query, but the cache still
//has old records on that query's chrom
scanCache
(
dbIdx
,
retList
);
finalizeSelections
(
dbIdx
,
retList
);
return
true
;
}
if
(
!
haveQuery
||
!
haveDB
)
return
false
;
// the query is ahead of the database. fast-forward the database to catch-up.
if
(
_currQueryRec
->
chromAfter
(
_currDbRecs
[
dbIdx
]
))
{
if
(
queryChromAfterDbRec
(
dbRec
))
{
while
(
_currDbRecs
[
dbIdx
]
!=
NULL
&&
_currQueryRec
->
chromAfter
(
_currDbRecs
[
dbIdx
]
))
{
_dbFRMs
[
dbIdx
]
->
deleteRecord
(
_currDbRecs
[
dbIdx
]
);
while
(
dbRec
!=
NULL
&&
queryChromAfterDbRec
(
dbRec
))
{
_dbFRMs
[
dbIdx
]
->
deleteRecord
(
dbRec
);
nextRecord
(
false
,
dbIdx
);
}
clearCache
(
dbIdx
);
...
...
@@ -416,7 +448,7 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
// the database is ahead of the query.
else
{
// 1. scan the cache for remaining hits on the query's current chrom.
scanCache
(
dbIdx
,
retList
);
if
(
wantScan
)
scanCache
(
dbIdx
,
retList
);
return
true
;
}
...
...
@@ -424,3 +456,24 @@ bool CloseSweep::chromChange(int dbIdx, RecordKeyVector &retList)
//control can't reach here, but compiler still wants a return statement.
return
true
;
}
bool
CloseSweep
::
dbRecAfterQueryChrom
(
const
Record
*
dbRec
)
{
//If using a genome file, compare chrom ids.
//Otherwise, compare global order, inserting as needed.
if
(
_context
->
hasGenomeFile
())
{
return
(
dbRec
->
getChromId
()
>
_currQueryRec
->
getChromId
()
)
;
}
//see if the db has both it's curr chrom and the query's curr chrom.
const
_orderTrackType
*
track
=
_fileTracks
[
dbRec
->
getFileIdx
()];
_orderTrackType
::
const_iterator
iter
=
track
->
find
(
dbRec
->
getChrName
());
if
(
iter
==
track
->
end
())
return
false
;
//db file does not contain the curr chrom
int
dbOrder
=
iter
->
second
;
iter
=
track
->
find
(
_currQueryRec
->
getChrName
());
if
(
iter
==
track
->
end
())
return
false
;
//db file does not contain the query chrom.
int
qOrder
=
iter
->
second
;
return
(
dbOrder
>
qOrder
);
}
src/utils/NewChromsweep/CloseSweep.h
View file @
4c7213c5
...
...
@@ -16,7 +16,7 @@ class CloseSweep : public NewChromSweep {
public:
CloseSweep
(
ContextClosest
*
context
);
~
CloseSweep
(
void
);
bool
init
();