Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
54e27a15
Commit
54e27a15
authored
Oct 09, 2014
by
Neil Kindlon
Browse files
First check-in for new closest tool
parent
7434d8a3
Changes
15
Expand all
Hide whitespace changes
Inline
Side-by-side
src/bedtools.cpp
View file @
54e27a15
This diff is collapsed.
Click to expand it.
src/closestBed/closestBed.cpp
deleted
100644 → 0
View file @
7434d8a3
/*****************************************************************************
closestBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "closestBed.h"
const
int
MAXSLOP
=
256000000
;
// 2*MAXSLOP = 512 megabases.
// We don't want to keep looking if we
// can't find a nearby feature within 512 Mb.
const
int
SLOPGROWTH
=
2048000
;
/*
Constructor
*/
BedClosest
::
BedClosest
(
string
&
bedAFile
,
string
&
bedBFile
,
bool
sameStrand
,
bool
diffStrand
,
string
&
tieMode
,
bool
reportDistance
,
bool
signDistance
,
string
&
_strandedDistMode
,
bool
ignoreOverlaps
,
bool
ignoreUpstream
,
bool
ignoreDownstream
,
bool
printHeader
,
bool
diffNames
)
:
_bedAFile
(
bedAFile
)
,
_bedBFile
(
bedBFile
)
,
_tieMode
(
tieMode
)
,
_sameStrand
(
sameStrand
)
,
_diffStrand
(
diffStrand
)
,
_reportDistance
(
reportDistance
)
,
_signDistance
(
signDistance
)
,
_strandedDistMode
(
_strandedDistMode
)
,
_ignoreOverlaps
(
ignoreOverlaps
)
,
_ignoreUpstream
(
ignoreUpstream
)
,
_ignoreDownstream
(
ignoreDownstream
)
,
_printHeader
(
printHeader
)
,
_diffNames
(
diffNames
)
{
_bedA
=
new
BedFile
(
_bedAFile
);
_bedB
=
new
BedFile
(
_bedBFile
);
FindClosestBed
();
}
/*
Destructor
*/
BedClosest
::~
BedClosest
(
void
)
{
}
void
BedClosest
::
ReportClosestNotFound
(
const
BED
&
a
)
{
_bedA
->
reportBedTab
(
a
);
if
(
_reportDistance
==
true
)
{
_bedB
->
reportNullBedTab
();
cout
<<
-
1
<<
endl
;
}
else
_bedB
->
reportNullBedNewLine
();
}
void
BedClosest
::
FindWindowOverlaps
(
BED
&
a
,
vector
<
BED
>
&
hits
)
{
int
slop
=
0
;
// start out just looking for overlaps
// within the current bin (~128Kb)
// update the current feature's start and end
CHRPOS
aFudgeStart
=
0
;
CHRPOS
aFudgeEnd
;
int
numOverlaps
=
0
;
vector
<
BED
>
closestB
;
CHRPOS
minDistance
=
INT_MAX
;
int32_t
curDistance
=
INT_MAX
;
vector
<
int32_t
>
distances
;
// is there at least one feature in B on the same chrom
// as the current A feature?
if
(
_bedB
->
bedMap
.
find
(
a
.
chrom
)
!=
_bedB
->
bedMap
.
end
())
{
while
((
numOverlaps
==
0
)
&&
(
slop
<=
MAXSLOP
))
{
// add some slop (starting at 0 bases) to a in hopes
// of finding a hit in B
if
((
static_cast
<
int
>
(
a
.
start
)
-
slop
)
>
0
)
aFudgeStart
=
a
.
start
-
slop
;
else
aFudgeStart
=
0
;
if
((
static_cast
<
int
>
(
a
.
start
)
+
slop
)
<
(
2
*
MAXSLOP
))
aFudgeEnd
=
a
.
end
+
slop
;
else
aFudgeEnd
=
2
*
MAXSLOP
;
// THE HEAVY LIFTING
// search for hits with the current slop added
_bedB
->
allHits
(
a
.
chrom
,
aFudgeStart
,
aFudgeEnd
,
a
.
strand
,
hits
,
_sameStrand
,
_diffStrand
,
0.0
,
false
);
vector
<
BED
>::
const_iterator
h
=
hits
.
begin
();
vector
<
BED
>::
const_iterator
hitsEnd
=
hits
.
end
();
for
(;
h
!=
hitsEnd
;
++
h
)
{
// skip the hit if the user doesn't want to allow same names.
if
((
_diffNames
==
true
)
&&
(
a
.
name
==
h
->
name
))
continue
;
// do the actual features overlap?
int
s
=
max
(
a
.
start
,
h
->
start
);
int
e
=
min
(
a
.
end
,
h
->
end
);
int
overlapBases
=
(
e
-
s
);
// make sure we allow overlapping features.
if
((
overlapBases
>
0
)
&&
(
_ignoreOverlaps
==
true
))
continue
;
else
numOverlaps
++
;
// there is overlap. make sure we allow overlapping features ()
if
(
overlapBases
>
0
)
{
curDistance
=
0
;
if
(
curDistance
<
(
int32_t
)
minDistance
)
{
closestB
.
clear
();
distances
.
clear
();
}
minDistance
=
0
;
closestB
.
push_back
(
*
h
);
distances
.
push_back
(
0
);
}
// the hit is to the "left" of A
else
if
(
h
->
end
<=
a
.
start
)
{
curDistance
=
(
a
.
start
-
h
->
end
)
+
1
;
if
(
_signDistance
)
{
if
((
_strandedDistMode
==
"ref"
)
||
(
_strandedDistMode
==
"a"
&&
a
.
strand
!=
"-"
)
||
(
_strandedDistMode
==
"b"
&&
h
->
strand
==
"-"
))
{
// hit is "upstream" of A
if
(
_ignoreUpstream
)
{
numOverlaps
--
;
continue
;
}
else
{
curDistance
=
-
curDistance
;
}
}
else
if
(
_ignoreDownstream
)
{
numOverlaps
--
;
continue
;
}
}
if
(
abs
(
curDistance
)
<
(
int32_t
)
minDistance
)
{
minDistance
=
abs
(
curDistance
);
closestB
.
clear
();
closestB
.
push_back
(
*
h
);
distances
.
clear
();
distances
.
push_back
(
curDistance
);
}
else
if
(
abs
(
curDistance
)
==
(
int32_t
)
minDistance
)
{
minDistance
=
abs
(
curDistance
);
closestB
.
push_back
(
*
h
);
distances
.
push_back
(
curDistance
);
}
}
// the hit is to the "right" of A
else
if
(
h
->
start
>=
a
.
end
)
{
curDistance
=
(
h
->
start
-
a
.
end
)
+
1
;
if
(
_signDistance
)
{
if
((
_strandedDistMode
==
"a"
&&
a
.
strand
==
"-"
)
||
(
_strandedDistMode
==
"b"
&&
h
->
strand
!=
"-"
))
{
// hit is "upstream" of A
if
(
_ignoreUpstream
)
{
numOverlaps
--
;
continue
;
}
else
{
curDistance
=
-
curDistance
;
}
}
else
if
(
_ignoreDownstream
){
numOverlaps
--
;
continue
;
}
}
if
(
abs
(
curDistance
)
<
(
int32_t
)
minDistance
)
{
minDistance
=
abs
(
curDistance
);
closestB
.
clear
();
closestB
.
push_back
(
*
h
);
distances
.
clear
();
distances
.
push_back
(
curDistance
);
}
else
if
(
abs
(
curDistance
)
==
(
int32_t
)
minDistance
)
{
minDistance
=
abs
(
curDistance
);
closestB
.
push_back
(
*
h
);
distances
.
push_back
(
curDistance
);
}
}
}
// if no overlaps were found, we'll widen the range
// by SLOPGROWTH in each direction and search again.
slop
+=
SLOPGROWTH
;
}
// while ((numOverlaps == 0) && (slop <= MAXSLOP))
// report the closest feature(s) in B to the current A feature.
// obey the user's reporting request (_tieMode)
if
(
numOverlaps
>
0
)
{
if
(
closestB
.
size
()
==
1
||
(
_tieMode
==
"first"
&&
closestB
.
size
()
>
0
))
{
_bedA
->
reportBedTab
(
a
);
if
(
_reportDistance
==
true
)
{
_bedB
->
reportBedTab
(
closestB
[
0
]);
cout
<<
distances
[
0
]
<<
endl
;
}
else
_bedB
->
reportBedNewLine
(
closestB
[
0
]);
}
else
{
if
(
_tieMode
==
"all"
)
{
size_t
i
=
0
;
vector
<
BED
>::
iterator
b
=
closestB
.
begin
();
for
(;
b
!=
closestB
.
end
();
++
b
)
{
_bedA
->
reportBedTab
(
a
);
if
(
_reportDistance
==
true
)
{
_bedB
->
reportBedTab
(
*
b
);
cout
<<
distances
[
i
++
]
<<
endl
;
}
else
_bedB
->
reportBedNewLine
(
*
b
);
}
}
else
if
(
_tieMode
==
"last"
&&
closestB
.
size
()
>
0
)
{
_bedA
->
reportBedTab
(
a
);
if
(
_reportDistance
==
true
)
{
_bedB
->
reportBedTab
(
closestB
[
closestB
.
size
()
-
1
]);
cout
<<
distances
[
distances
.
size
()
-
1
]
<<
endl
;
}
else
_bedB
->
reportBedNewLine
(
closestB
[
closestB
.
size
()
-
1
]);
}
}
}
// there were B features on the same chrom as A, but there were
// none that met the user's criteria (e.g., that it be on the
// same chrom)
else
if
(
numOverlaps
==
0
)
{
ReportClosestNotFound
(
a
);
}
}
// there is no feature in B on the same chromosome as A
else
{
ReportClosestNotFound
(
a
);
}
}
void
BedClosest
::
FindClosestBed
()
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB
->
loadBedFileIntoMap
();
BED
a
;
vector
<
BED
>
hits
;
// vector of potential hits
hits
.
reserve
(
100
);
_bedA
->
Open
();
// report A's header first if asked.
if
(
_printHeader
==
true
)
{
_bedA
->
PrintHeader
();
}
// process each entry in A in search of the closest feature in B
while
(
_bedA
->
GetNextBed
(
a
))
{
if
(
_bedA
->
_status
==
BED_VALID
)
{
FindWindowOverlaps
(
a
,
hits
);
hits
.
clear
();
}
}
_bedA
->
Close
();
}
// END ClosestBed
src/closestBed/closestBed.h
deleted
100644 → 0
View file @
7434d8a3
/*****************************************************************************
closestBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef CLOSESTBED_H
#define CLOSESTBED_H
#include "bedFile.h"
#include <vector>
#include <iostream>
#include <fstream>
using
namespace
std
;
//************************************************
// Class methods and elements
//************************************************
class
BedClosest
{
public:
// constructor
BedClosest
(
string
&
bedAFile
,
string
&
bedBFile
,
bool
sameStrand
,
bool
diffStrand
,
string
&
tieMode
,
bool
reportDistance
,
bool
signDistance
,
string
&
strandedDistMode
,
bool
ignoreOverlaps
,
bool
ignoreUpstream
,
bool
ignoreDownstream
,
bool
printHeader
,
bool
diffNames
);
// destructor
~
BedClosest
(
void
);
// find the closest feature in B to A
void
FindClosestBed
();
private:
// data
string
_bedAFile
;
string
_bedBFile
;
string
_tieMode
;
bool
_sameStrand
;
bool
_diffStrand
;
bool
_reportDistance
;
bool
_signDistance
;
string
_strandedDistMode
;
bool
_ignoreOverlaps
;
bool
_ignoreUpstream
;
bool
_ignoreDownstream
;
bool
_printHeader
;
bool
_diffNames
;
BedFile
*
_bedA
,
*
_bedB
;
// methods
void
FindWindowOverlaps
(
BED
&
,
vector
<
BED
>
&
);
void
ReportClosestNotFound
(
const
BED
&
a
);
};
#endif
/* CLOSEST_H */
src/closest
Bed
/Makefile
→
src/closest
File
/Makefile
View file @
54e27a15
UTILITIES_DIR
=
../utils/
OBJ_DIR
=
../../obj/
BIN_DIR
=
../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES
=
-I
$(UTILITIES_DIR)
/bedFile/
\
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES
=
closestMain.cpp closestBed.cpp closestBed.h
OBJECTS
=
closestMain.o closestBed.o
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
all
:
$(BUILT_OBJECTS)
.PHONY
:
all
$(BUILT_OBJECTS)
:
$(SOURCES)
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(INCLUDES)
clean
:
@
echo
"Cleaning up."
@
rm
-f
$(OBJ_DIR)
/closestMain.o
$(OBJ_DIR)
/closestBed.o
.PHONY
:
clean
UTILITIES_DIR
=
../utils/
OBJ_DIR
=
../../obj/
BIN_DIR
=
../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES
=
-I
$(UTILITIES_DIR)
/Contexts/
\
-I$(UTILITIES_DIR)/general/
\
-I$(UTILITIES_DIR)/fileType/
\
-I$(UTILITIES_DIR)/gzstream/
\
-I$(UTILITIES_DIR)/GenomeFile/
\
-I$(UTILITIES_DIR)/BamTools/
include
\
-I$(UTILITIES_DIR)/BamTools/src
\
-I$(UTILITIES_DIR)/BlockedIntervals
\
-I$(UTILITIES_DIR)/BamTools-Ancillary
\
-I$(UTILITIES_DIR)/FileRecordTools/
\
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/
\
-I$(UTILITIES_DIR)/FileRecordTools/Records/
\
-I$(UTILITIES_DIR)/KeyListOps/
\
-I$(UTILITIES_DIR)/RecordOutputMgr/
\
-I$(UTILITIES_DIR)/NewChromsweep
\
-I$(UTILITIES_DIR)/BinTree
\
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES
=
closestMain.cpp closestFile.cpp closestFile.h
OBJECTS
=
closestMain.o closestFile.o
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
all
:
$(BUILT_OBJECTS)
.PHONY
:
all
$(BUILT_OBJECTS)
:
$(SOURCES)
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(INCLUDES)
clean
:
@
echo
"Cleaning up."
@
rm
-f
$(OBJ_DIR)
/closestMain.o
$(OBJ_DIR)
/closestFile.o
.PHONY
:
clean
src/closestFile/closestFile.cpp
0 → 100644
View file @
54e27a15
/*
* newClosestFile.cpp
*
* Created on: Sep 25, 2014
* Author: nek3d
*/
#include "FileRecordMgr.h"
#include "RecordOutputMgr.h"
#include "closestFile.h"
#include "CloseSweep.h"
ClosestFile
::
ClosestFile
(
ContextClosest
*
context
)
:
_context
(
context
),
_recordOutputMgr
(
NULL
)
{
_recordOutputMgr
=
new
RecordOutputMgr
();
_recordOutputMgr
->
init
(
_context
);
}
ClosestFile
::~
ClosestFile
()
{
delete
_recordOutputMgr
;
}
bool
ClosestFile
::
getClosest
()
{
CloseSweep
sweep
(
_context
);
if
(
!
sweep
.
init
())
{
return
false
;
}
RecordKeyVector
hitSet
;
while
(
sweep
.
next
(
hitSet
))
{
if
(
_context
->
reportDistance
())
{
_recordOutputMgr
->
printClosest
(
hitSet
,
&
(
sweep
.
getDistances
()));
}
else
{
_recordOutputMgr
->
printClosest
(
hitSet
,
NULL
);
}
}
return
true
;
}
src/closestFile/closestFile.h
0 → 100644
View file @
54e27a15
/*
* newClosestFile.h
*
* Created on: Sep 25, 2014
* Author: nek3d
*/
#ifndef NEWCLOSESTFILE_H_
#define NEWCLOSESTFILE_H_
#include "ContextClosest.h"
using
namespace
std
;
class
RecordOutputMgr
;
class
ClosestFile
{
public:
ClosestFile
(
ContextClosest
*
context
);
~
ClosestFile
(
void
);
bool
getClosest
();
private:
ContextClosest
*
_context
;
RecordOutputMgr
*
_recordOutputMgr
;
};
#endif
/* NEWCLOSESTFILE_H_ */
src/closest
Bed
/closestMain.cpp
→
src/closest
File
/closestMain.cpp
View file @
54e27a15
/*****************************************************************************
closestMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "closestBed.h"
#include "version.h"
using
namespace
std
;
// define our program name
#define PROGRAM_NAME "bedtools closest"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void
closest_help
(
void
);
int
closest_main
(
int
argc
,
char
*
argv
[])
{
// our configuration variables
bool
showHelp
=
false
;
// input files
string
bedAFile
;
string
bedBFile
;
string
tieMode
=
"all"
;
string
strandedDistMode
=
""
;
bool
haveBedA
=
false
;
bool
haveBedB
=
false
;
bool
haveTieMode
=
false
;
bool
sameStrand
=
false
;
bool
diffStrand
=
false
;
bool
ignoreOverlaps
=
false
;
bool
ignoreUpstream
=
false
;
bool
ignoreDownstream
=
false
;
bool
reportDistance
=
false
;
bool
signDistance
=
false
;
bool
haveStrandedDistMode
=
false
;
bool
printHeader
=
false
;
bool
diffNames
=
false
;
// check to see if we should print out some help
if
(
argc
<=
1
)
showHelp
=
true
;
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
(
(
PARAMETER_CHECK
(
"-h"
,
2
,
parameterLength
))
||
(
PARAMETER_CHECK
(
"--help"
,
5
,
parameterLength
)))
{
showHelp
=
true
;
}
}
if
(
showHelp
)
closest_help
();
// do some parsing (all of these parameters require 2 strings)
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
(
PARAMETER_CHECK
(
"-a"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
haveBedA
=
true
;
bedAFile
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-b"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
haveBedB
=
true
;
bedBFile
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-s"
,
2
,
parameterLength
))
{
sameStrand
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-S"
,
2
,
parameterLength
))
{
diffStrand
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-d"
,
2
,
parameterLength
))
{
reportDistance
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-D"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
reportDistance
=
true
;
signDistance
=
true
;
haveStrandedDistMode
=
true
;
strandedDistMode
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-io"
,
3
,
parameterLength
))
{
ignoreOverlaps
=
true
;
}