Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
6961cd20
Commit
6961cd20
authored
Feb 25, 2015
by
Neil Kindlon
Browse files
First check-in for new version of subtract
parent
4abcc2b0
Changes
15
Hide whitespace changes
Inline
Side-by-side
Makefile
View file @
6961cd20
...
...
@@ -66,7 +66,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)
/sortBed
\
$(SRC_DIR)
/spacingFile
\
$(SRC_DIR)
/split
\
$(SRC_DIR)
/subtract
Bed
\
$(SRC_DIR)
/subtract
File
\
$(SRC_DIR)
/tagBam
\
$(SRC_DIR)
/unionBedGraphs
\
$(SRC_DIR)
/windowBed
\
...
...
src/subtractBed/subtractBed.cpp
deleted
100644 → 0
View file @
4abcc2b0
/*****************************************************************************
subtractBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include
"lineFileUtilities.h"
#include
"subtractBed.h"
#include
<numeric>
/*
Constructor
*/
BedSubtract
::
BedSubtract
(
string
&
bedAFile
,
string
&
bedBFile
,
float
overlapFraction
,
bool
sameStrand
,
bool
diffStrand
,
bool
removeAll
,
bool
removeAny
)
:
_bedAFile
(
bedAFile
)
,
_bedBFile
(
bedBFile
)
,
_overlapFraction
(
overlapFraction
)
,
_sameStrand
(
sameStrand
)
,
_diffStrand
(
diffStrand
)
,
_removeAll
(
removeAll
)
,
_removeAny
(
removeAny
)
{
_bedA
=
new
BedFile
(
bedAFile
);
_bedB
=
new
BedFile
(
bedBFile
);
SubtractBed
();
}
/*
Destructor
*/
BedSubtract
::~
BedSubtract
(
void
)
{
}
void
BedSubtract
::
FindAndSubtractOverlaps
(
BED
&
a
,
vector
<
BED
>
&
hits
)
{
// find all of the overlaps between a and B.
_bedB
->
allHits
(
a
.
chrom
,
a
.
start
,
a
.
end
,
a
.
strand
,
hits
,
_sameStrand
,
_diffStrand
,
0.0
,
false
);
// is A completely spanned by an entry in B?
// if so, A should not be reported.
int
numConsumedByB
=
0
;
int
numOverlaps
=
0
;
vector
<
BED
>
bOverlaps
;
// list of hits in B. Special proc. if multiple.
vector
<
BED
>::
const_iterator
h
=
hits
.
begin
();
vector
<
BED
>::
const_iterator
hitsEnd
=
hits
.
end
();
for
(;
h
!=
hitsEnd
;
++
h
)
{
int
s
=
max
(
a
.
start
,
h
->
start
);
int
e
=
min
(
a
.
end
,
h
->
end
);
int
overlapBases
=
(
e
-
s
);
int
aLength
=
(
a
.
end
-
a
.
start
);
if
(
s
<
e
)
{
// is there enough overlap (default ~ 1bp)
float
overlap
=
((
float
)
overlapBases
/
(
float
)
aLength
);
if
(
overlap
>=
1.0
)
{
numOverlaps
++
;
numConsumedByB
++
;
}
else
if
(
overlap
>=
_overlapFraction
||
_removeAny
)
{
numOverlaps
++
;
bOverlaps
.
push_back
(
*
h
);
}
}
}
if
(
numOverlaps
==
0
)
{
// no overlap found, so just report A as-is.
_bedA
->
reportBedNewLine
(
a
);
}
else
if
((
numOverlaps
==
1
)
&&
(
!
_removeAny
))
{
// one overlap found. only need to look at the single
// entry in bOverlaps.
// if A was not "consumed" by any entry in B
if
((
numConsumedByB
==
0
)
&&
(
_removeAll
==
false
))
{
BED
theHit
=
bOverlaps
[
0
];
// A ++++++++++++
// B ----
// Res. ==== ====
if
(
(
theHit
.
start
>
a
.
start
)
&&
(
theHit
.
end
<
a
.
end
)
)
{
_bedA
->
reportBedRangeNewLine
(
a
,
a
.
start
,
theHit
.
start
);
_bedA
->
reportBedRangeNewLine
(
a
,
theHit
.
end
,
a
.
end
);
}
// A ++++++++++++
// B ----------
// Res. ==
else
if
(
theHit
.
start
==
a
.
start
)
{
_bedA
->
reportBedRangeNewLine
(
a
,
theHit
.
end
,
a
.
end
);
}
// A ++++++++++++
// B ----------
// Res. ====
else
if
(
theHit
.
start
<
a
.
start
)
{
_bedA
->
reportBedRangeNewLine
(
a
,
theHit
.
end
,
a
.
end
);
}
// A ++++++++++++
// B ----------
// Res. =======
else
if
(
theHit
.
start
>
a
.
start
)
{
_bedA
->
reportBedRangeNewLine
(
a
,
a
.
start
,
theHit
.
start
);
}
}
}
else
if
((
numOverlaps
>
1
)
||
_removeAny
)
{
// multiple overlapz found. look at all the hits
// and figure out which bases in A survived. then
// report the contigous intervals that survived.
vector
<
bool
>
aKeep
(
a
.
end
-
a
.
start
,
true
);
if
(
numConsumedByB
==
0
)
{
if
(
_removeAll
){
return
;
}
// if there's any overlap, then we don't report.
// track the number of hit starts and ends at each position in A
for
(
vector
<
BED
>::
iterator
h
=
bOverlaps
.
begin
();
h
!=
bOverlaps
.
end
();
++
h
)
{
int
s
=
max
(
a
.
start
,
h
->
start
);
int
e
=
min
(
a
.
end
,
h
->
end
);
for
(
int
i
=
s
+
1
;
i
<=
e
;
++
i
)
{
aKeep
[
i
-
a
.
start
-
1
]
=
false
;
}
}
if
(
_removeAny
){
int
asum
=
std
::
accumulate
(
aKeep
.
rbegin
(),
aKeep
.
rend
(),
0
);
float
afrac
=
1.0
-
(
float
)
asum
/
aKeep
.
size
();
if
(
afrac
>
_overlapFraction
){
return
;
}
for
(
unsigned
int
i
=
0
;
i
<
aKeep
.
size
();
++
i
)
{
aKeep
[
i
]
=
true
;
}
}
// report the remaining blocks.
for
(
unsigned
int
i
=
0
;
i
<
aKeep
.
size
();
++
i
)
{
if
(
aKeep
[
i
]
==
true
)
{
CHRPOS
blockStart
=
i
+
a
.
start
;
while
((
aKeep
[
i
]
==
true
)
&&
(
i
<
aKeep
.
size
()))
{
i
++
;
}
CHRPOS
blockEnd
=
i
+
a
.
start
;
blockEnd
=
min
(
a
.
end
,
blockEnd
);
_bedA
->
reportBedRangeNewLine
(
a
,
blockStart
,
blockEnd
);
}
}
}
}
}
void
BedSubtract
::
SubtractBed
()
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB
->
loadBedFileIntoMap
();
BED
a
;
vector
<
BED
>
hits
;
// reserve some space
hits
.
reserve
(
100
);
_bedA
->
Open
();
while
(
_bedA
->
GetNextBed
(
a
))
{
if
(
_bedA
->
_status
==
BED_VALID
)
{
FindAndSubtractOverlaps
(
a
,
hits
);
hits
.
clear
();
}
}
_bedA
->
Close
();
}
// END Intersect
src/subtractBed/subtractBed.h
deleted
100644 → 0
View file @
4abcc2b0
/*****************************************************************************
subtractBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef SUBTRACTBED_H
#define SUBTRACTBED_H
#include
"bedFile.h"
#include
<vector>
#include
<iostream>
#include
<fstream>
using
namespace
std
;
//************************************************
// Class methods and elements
//************************************************
class
BedSubtract
{
public:
// constructor
BedSubtract
(
string
&
bedAFile
,
string
&
bedBFile
,
float
overlapFraction
,
bool
sameStrand
,
bool
diffStrand
,
bool
removeAll
,
bool
removeAny
);
// destructor
~
BedSubtract
(
void
);
private:
// processing variables
string
_bedAFile
;
string
_bedBFile
;
float
_overlapFraction
;
bool
_sameStrand
;
bool
_diffStrand
;
bool
_removeAll
;
bool
_removeAny
;
// instances of bed file class.
BedFile
*
_bedA
,
*
_bedB
;
// methods
void
FindAndSubtractOverlaps
(
BED
&
a
,
vector
<
BED
>
&
hits
);
void
SubtractBed
();
};
#endif
/* SUBTRACTBED_H */
src/subtract
Bed
/Makefile
→
src/subtract
File
/Makefile
View file @
6961cd20
...
...
@@ -5,18 +5,31 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES
=
-I
$(UTILITIES_DIR)
/bedFile/
\
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/fileType/
\
INCLUDES
=
-I
$(UTILITIES_DIR)
/Contexts/
\
-I
$(UTILITIES_DIR)
/general/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/GenomeFile/
\
-I
$(UTILITIES_DIR)
/BamTools/include
\
-I
$(UTILITIES_DIR)
/BamTools/src
\
-I
$(UTILITIES_DIR)
/BlockedIntervals
\
-I
$(UTILITIES_DIR)
/BamTools-Ancillary
\
-I
$(UTILITIES_DIR)
/FileRecordTools/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/FileReaders/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/Records/
\
-I
$(UTILITIES_DIR)
/KeyListOps/
\
-I
$(UTILITIES_DIR)
/RecordOutputMgr/
\
-I
$(UTILITIES_DIR)
/NewChromsweep
\
-I
$(UTILITIES_DIR)
/BinTree
\
-I
$(UTILITIES_DIR)
/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES
=
subtractMain.cpp subtract
Bed
.cpp subtract
Bed
.h
OBJECTS
=
subtractMain.o subtract
Bed
.o
SOURCES
=
subtractMain.cpp subtract
File
.cpp subtract
File
.h
OBJECTS
=
subtractMain.o subtract
File
.o
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
PROGRAM
=
subtractFile
all
:
$(BUILT_OBJECTS)
...
...
@@ -24,10 +37,10 @@ all: $(BUILT_OBJECTS)
$(BUILT_OBJECTS)
:
$(SOURCES)
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(INCLUDES)
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(DFLAGS)
$(INCLUDES)
clean
:
@
echo
"Cleaning up."
@
rm
-f
$(OBJ_DIR)
/subtractMain.o
$(OBJ_DIR)
/subtract
Bed
.o
@
rm
-f
$(OBJ_DIR)
/subtractMain.o
$(OBJ_DIR)
/subtract
File
.o
.PHONY
:
clean
src/subtractFile/subtractFile.cpp
0 → 100644
View file @
6961cd20
/*
* subtractFile.cpp
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
/*****************************************************************************
subtractFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include
"subtractFile.h"
#include
"ContextSubtract.h"
#include
"FileRecordMgr.h"
#include
"NewChromsweep.h"
#include
"BinTree.h"
#include
"RecordOutputMgr.h"
#include
<numeric>
//for std::accumulate
SubtractFile
::
SubtractFile
(
ContextSubtract
*
context
)
:
_context
(
context
),
_blockMgr
(
NULL
),
_recordOutputMgr
(
NULL
),
_tmpBlocksMgr
(
NULL
)
{
_recordOutputMgr
=
new
RecordOutputMgr
();
_recordOutputMgr
->
init
(
_context
);
if
(
_context
->
getObeySplits
())
{
_blockMgr
=
new
BlockMgr
(
_context
->
getOverlapFraction
(),
_context
->
getReciprocal
());
_recordOutputMgr
->
setSplitInfo
(
_blockMgr
);
}
_tmpBlocksMgr
=
new
BlockMgr
(
_context
->
getOverlapFraction
(),
_context
->
getReciprocal
());
}
SubtractFile
::~
SubtractFile
(
void
)
{
delete
_blockMgr
;
_blockMgr
=
NULL
;
delete
_recordOutputMgr
;
delete
_tmpBlocksMgr
;
}
void
SubtractFile
::
processHits
(
RecordKeyVector
&
hits
)
{
_recordOutputMgr
->
printRecord
(
hits
);
}
bool
SubtractFile
::
subtractFiles
()
{
if
(
_context
->
getSortedInput
())
{
return
processSortedFiles
();
}
return
processUnsortedFiles
();
}
bool
SubtractFile
::
processSortedFiles
()
{
// use the chromsweep algorithm to detect overlaps on the fly.
NewChromSweep
sweep
(
_context
);
if
(
!
sweep
.
init
())
{
return
false
;
}
RecordKeyVector
hitSet
;
while
(
sweep
.
next
(
hitSet
))
{
if
(
_context
->
getObeySplits
())
{
RecordKeyVector
keySet
(
hitSet
.
getKey
());
RecordKeyVector
resultSet
(
hitSet
.
getKey
());
_blockMgr
->
findBlockedOverlaps
(
keySet
,
hitSet
,
resultSet
);
subtractHits
(
resultSet
);
}
else
{
subtractHits
(
hitSet
);
}
}
if
(
!
_context
->
hasGenomeFile
())
{
sweep
.
closeOut
(
true
);
}
return
true
;
}
bool
SubtractFile
::
processUnsortedFiles
()
{
BinTree
*
binTree
=
new
BinTree
(
_context
);
binTree
->
loadDB
();
FileRecordMgr
*
queryFRM
=
_context
->
getFile
(
_context
->
getQueryFileIdx
());
while
(
!
queryFRM
->
eof
())
{
Record
*
queryRecord
=
queryFRM
->
getNextRecord
();
if
(
queryRecord
==
NULL
)
{
continue
;
}
RecordKeyVector
hitSet
(
queryRecord
);
binTree
->
getHits
(
queryRecord
,
hitSet
);
if
(
_context
->
getObeySplits
())
{
RecordKeyVector
keySet
(
hitSet
.
getKey
());
RecordKeyVector
resultSet
;
_blockMgr
->
findBlockedOverlaps
(
keySet
,
hitSet
,
resultSet
);
subtractHits
(
resultSet
);
}
else
{
subtractHits
(
hitSet
);
}
queryFRM
->
deleteRecord
(
queryRecord
);
}
//clean up.
delete
binTree
;
return
true
;
}
void
SubtractFile
::
subtractHits
(
RecordKeyVector
&
hits
)
{
if
(
hits
.
empty
())
{
// no intersection, nothing to subtract.
// just copy key to hits as if it were a
// self-intersection. This is just for reporting
// purposes.
hits
.
push_back
(
hits
.
getKey
());
processHits
(
hits
);
return
;
}
if
(
_context
->
getRemoveAll
()
&&
_context
->
getSubtractFraction
()
==
0.0
)
{
// hits aren't empty, meaning there is intersection,
// so we want to not report the hit.
hits
.
clearAll
();
return
;
}
//loop through hits. Track which bases in query were covered
const
Record
*
keyRec
=
hits
.
getKey
();
int
keyStart
=
keyRec
->
getStartPos
();
int
keyEnd
=
keyRec
->
getEndPos
();
//this vector of bools will represent the bases of the query.
//for each base, true means uncovered, false means covered.
//they begin as all uncovered.
vector
<
bool
>
keyBases
(
keyEnd
-
keyStart
,
true
);
//now loop through the hits, and cover corresponding query bases
//by setting them to false.
bool
basesRemoved
=
false
;
for
(
RecordKeyVector
::
const_iterator_type
iter
=
hits
.
begin
();
iter
!=
hits
.
end
();
iter
=
hits
.
next
())
{
const
Record
*
hitRec
=
*
iter
;
int
hitStart
=
hitRec
->
getStartPos
();
int
hitEnd
=
hitRec
->
getEndPos
();
int
startIdx
=
max
(
keyStart
,
hitStart
)
-
keyStart
;
int
endIdx
=
min
(
keyEnd
,
hitEnd
)
-
keyStart
;
int
keyLen
=
keyEnd
-
keyStart
;
int
coveredLen
=
endIdx
-
startIdx
;
float
coveragePct
=
(
float
)
coveredLen
/
(
float
)
keyLen
;
//for each base in the hit, set the base in the query to false.
//this effectively "erases" the covered bits. Only do
if
(
_context
->
getRemoveSum
()
||
coveragePct
>=
_context
->
getSubtractFraction
())
{
std
::
fill
(
keyBases
.
begin
()
+
startIdx
,
keyBases
.
begin
()
+
endIdx
,
false
);
basesRemoved
=
true
;
}
}
if
(
!
basesRemoved
)
{
//treat as if there were no intersection
hits
.
clearVector
();
hits
.
push_back
(
hits
.
getKey
());
processHits
(
hits
);
return
;
}
else
if
(
_context
->
getRemoveAll
())
{
hits
.
clearAll
();
return
;
}
// if the -N option is used ( removeSum), do not report if the percentage of
// uniquely covered bases exceeds the overlap fraction.
if
(
_context
->
getRemoveSum
())
{
//determine how many bases are left uncovered.
int
numBasesUncovered
=
std
::
accumulate
(
keyBases
.
begin
(),
keyBases
.
end
(),
0
);
//determine percentage that are covered.
float
pctCovered
=
1.0
-
(
float
)
numBasesUncovered
/
(
float
)(
keyEnd
-
keyStart
);
if
(
pctCovered
>
_context
->
getSubtractFraction
())
{
hits
.
clearAll
();
return
;
}
else
{
hits
.
clearVector
();
hits
.
push_back
(
hits
.
getKey
());
}
processHits
(
hits
);
return
;
}
//now make "blocks" out of the query's remaining stretches of
//uncovered bases.
RecordKeyVector
tempHits
(
keyRec
);
for
(
int
i
=
0
;
i
<
(
int
)
keyBases
.
size
();
i
++
)
{
if
(
keyBases
[
i
]
==
true
)
{
int
blockStart
=
keyStart
+
i
;
while
(
keyBases
[
i
]
==
true
&&
i
<
(
int
)
keyBases
.
size
())
{
i
++
;
}
int
blockEnd
=
min
(
keyStart
+
i
,
keyEnd
);
tempHits
.
push_back
(
_tmpBlocksMgr
->
allocateAndAssignRecord
(
keyRec
,
blockStart
,
blockEnd
));
}
}
processHits
(
tempHits
);
_tmpBlocksMgr
->
deleteBlocks
(
tempHits
);
}
src/subtractFile/subtractFile.h
0 → 100644
View file @
6961cd20
/*
* subtractFile.h
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
#ifndef SUBTRACTFILE_H_
#define SUBTRACTFILE_H_
#include
"RecordKeyVector.h"
#include
"BlockMgr.h"
using
namespace
std
;
class
ContextSubtract
;
class
BlockMgr
;
class
RecordOutputMgr
;
class
BlockMgr
;
class
SubtractFile
{
public:
SubtractFile
(
ContextSubtract
*
context
);
~
SubtractFile
(
void
);
bool
subtractFiles
();
private:
ContextSubtract
*
_context
;
Record
*
_queryRec
;
Record
*
_databaseRec
;
BlockMgr
*
_blockMgr
;
RecordOutputMgr
*
_recordOutputMgr
;
void
processHits
(
RecordKeyVector
&
hits
);
bool
processSortedFiles
();
bool
processUnsortedFiles
();
void
subtractHits
(
RecordKeyVector
&
hits
);
BlockMgr
*
_tmpBlocksMgr
;
};
#endif
/* SUBTRACTFILE_H_ */
src/subtract
Bed
/subtractMain.cpp
→
src/subtract
File
/subtractMain.cpp
View file @
6961cd20
/*****************************************************************************
subtractMain.cpp
/*
* subtractMain.cpp
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include
"subtractBed.h"
#include
"version.h"
#include
"subtractFile.h"
#include
"ContextSubtract.h"
#include
"CommonHelp.h"
using
namespace
std
;
// define our program name
#define PROGRAM_NAME "bedtools subtract"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) \
(strncmp(argv[i], param, min(actualLen, paramLen))== 0) && \
(actualLen == paramLen)
// function declarations
void
subtract_help
(
void
);