Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
761d4a58
Commit
761d4a58
authored
Dec 30, 2011
by
Aaron
Browse files
added cluster tool. both cluster and merge require sorted files for efficiency.
parent
c045c254
Changes
9
Hide whitespace changes
Inline
Side-by-side
Makefile
View file @
761d4a58
...
...
@@ -23,6 +23,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)
/bedToIgv
\
$(SRC_DIR)
/bed12ToBed6
\
$(SRC_DIR)
/closestBed
\
$(SRC_DIR)
/clusterBed
\
$(SRC_DIR)
/complementBed
\
$(SRC_DIR)
/coverageBed
\
$(SRC_DIR)
/fastaFromBed
\
...
...
scripts/makeBashScripts.py
View file @
761d4a58
...
...
@@ -14,7 +14,8 @@ def main():
'bed12tobed6'
:
'bed12ToBed6'
,
'bedpetobam'
:
'bedpeToBam'
,
'bedtobam'
:
'bedToBam'
,
'closest'
:
'closestBed'
,
'closest'
:
'closestBed'
,
'cluster'
:
'clusterBed'
,
'complement'
:
'complementBed'
,
'coverage'
:
'coverageBed'
,
'flank'
:
'flankBed'
,
...
...
src/bedtools.cpp
View file @
761d4a58
...
...
@@ -41,6 +41,7 @@ int bedtobam_main(int argc, char* argv[]);//
int
bedtoigv_main
(
int
argc
,
char
*
argv
[]);
//
int
bedpetobam_main
(
int
argc
,
char
*
argv
[]);
//
int
closest_main
(
int
argc
,
char
*
argv
[]);
//
int
cluster_main
(
int
argc
,
char
*
argv
[]);
//
int
complement_main
(
int
argc
,
char
*
argv
[]);
//
int
coverage_main
(
int
argc
,
char
*
argv
[]);
//
int
fastafrombed_main
(
int
argc
,
char
*
argv
[]);
//
...
...
@@ -82,6 +83,7 @@ int main(int argc, char *argv[])
else
if
(
sub_cmd
==
"coverage"
)
return
coverage_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"genomecov"
)
return
genomecoverage_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"merge"
)
return
merge_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"cluster"
)
return
cluster_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"complement"
)
return
complement_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"subtract"
)
return
subtract_main
(
argc
-
1
,
argv
+
1
);
else
if
(
sub_cmd
==
"slop"
)
return
slop_main
(
argc
-
1
,
argv
+
1
);
...
...
@@ -171,6 +173,7 @@ int bedtools_help(void)
cout
<<
" coverage "
<<
"Compute the coverage over defined intervals.
\n
"
;
cout
<<
" genomecov "
<<
"Compute the coverage over an entire genome.
\n
"
;
cout
<<
" merge "
<<
"Combine overlapping/nearby intervals into a single interval.
\n
"
;
cout
<<
" cluster "
<<
"Cluster (but don't merge) overlapping/nearby intervals.
\n
"
;
cout
<<
" complement "
<<
"Extract intervals _not_ represented by an interval file.
\n
"
;
cout
<<
" subtract "
<<
"Remove intervals based on overlaps b/w two files.
\n
"
;
cout
<<
" slop "
<<
"Adjust the size of intervals.
\n
"
;
...
...
src/clusterBed/Makefile
0 → 100644
View file @
761d4a58
UTILITIES_DIR
=
../utils/
OBJ_DIR
=
../../obj/
BIN_DIR
=
../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES
=
-I
$(UTILITIES_DIR)
/bedFile/
\
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES
=
clusterMain.cpp clusterBed.cpp
OBJECTS
=
$(SOURCES:.cpp=.o)
_EXT_OBJECTS
=
bedFile.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(_EXT_OBJECTS)
)
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
all
:
$(BUILT_OBJECTS)
.PHONY
:
all
$(BUILT_OBJECTS)
:
$(SOURCES)
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(INCLUDES)
$(EXT_OBJECTS)
:
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/bedFile/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/lineFileUtilities/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/gzstream/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/fileType/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/fileType/
clean
:
@
echo
"Cleaning up."
@
rm
-f
$(OBJ_DIR)
/
*
$(BIN_DIR)
/
*
.PHONY
:
clean
\ No newline at end of file
src/clusterBed/clusterBed.cpp
0 → 100644
View file @
761d4a58
/*****************************************************************************
clusterBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "clusterBed.h"
// = Constructor =
BedCluster
::
BedCluster
(
string
&
bedFile
,
int
maxDistance
,
bool
forceStrand
)
:
_bedFile
(
bedFile
),
_forceStrand
(
forceStrand
),
_maxDistance
(
maxDistance
)
{
_bed
=
new
BedFile
(
bedFile
);
if
(
_forceStrand
==
false
)
ClusterBed
();
else
ClusterBedStranded
();
}
// = Destructor =
BedCluster
::~
BedCluster
(
void
)
{}
// = Cluster overlapping or nearby BED entries =
void
BedCluster
::
ClusterBed
()
{
uint32_t
cluster_id
=
0
;
BED
prev
,
curr
;
int
end
=
-
1
;
_bed
->
Open
();
while
(
_bed
->
GetNextBed
(
curr
,
true
))
{
// true = force sorted intervals
if
(
_bed
->
_status
!=
BED_VALID
)
continue
;
// new cluster, no overlap
if
(
(((
int
)
curr
.
start
-
end
)
>
_maxDistance
)
||
(
curr
.
chrom
!=
prev
.
chrom
)
)
{
cluster_id
++
;
end
=
curr
.
end
;
}
prev
=
curr
;
_bed
->
reportBedTab
(
curr
);
printf
(
"%d
\n
"
,
cluster_id
);
}
}
// = Cluster overlapping BED entries, accounting for strandedness =
void
BedCluster
::
ClusterBedStranded
()
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed
->
loadBedFileIntoMapNoBin
();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin
::
const_iterator
m
=
_bed
->
bedMapNoBin
.
begin
();
masterBedMapNoBin
::
const_iterator
mEnd
=
_bed
->
bedMapNoBin
.
end
();
for
(;
m
!=
mEnd
;
++
m
)
{
// bedList is already sorted by start position.
string
chrom
=
m
->
first
;
vector
<
BED
>
bedList
=
m
->
second
;
// make a list of the two strands to merge separately.
vector
<
string
>
strands
(
2
);
strands
[
0
]
=
"+"
;
strands
[
1
]
=
"-"
;
uint32_t
cluster_id
=
0
;
// do two passes, one for each strand.
for
(
unsigned
int
s
=
0
;
s
<
strands
.
size
();
s
++
)
{
// cluster overlapping features for this chromosome.
int
end
=
-
1
;
BED
prev
;
vector
<
BED
>::
const_iterator
bedItr
=
bedList
.
begin
();
vector
<
BED
>::
const_iterator
bedEnd
=
bedList
.
end
();
for
(;
bedItr
!=
bedEnd
;
++
bedItr
)
{
// if forcing strandedness, move on if the hit
// is not on the current strand.
if
(
bedItr
->
strand
!=
strands
[
s
])
continue
;
// new cluster, no overlap
if
(
(((
int
)
bedItr
->
start
-
end
)
>
_maxDistance
)
||
(
end
<
0
))
{
cluster_id
++
;
end
=
bedItr
->
end
;
}
// same cluster, overlaps
else
{
if
((
int
)
bedItr
->
end
>
end
)
end
=
bedItr
->
end
;
}
prev
=
*
bedItr
;
_bed
->
reportBedTab
(
prev
);
printf
(
"%d
\n
"
,
cluster_id
);
}
}
}
}
src/clusterBed/clusterBed.h
0 → 100644
View file @
761d4a58
/*****************************************************************************
clusterBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <fstream>
#include <limits.h>
#include <stdlib.h>
using
namespace
std
;
//************************************************
// Class methods and elements
//************************************************
class
BedCluster
{
public:
// constructor
BedCluster
(
string
&
bedFile
,
int
maxDistance
,
bool
forceStrand
);
// destructor
~
BedCluster
(
void
);
// find clusters
void
ClusterBed
();
// find clusters based on strand
void
ClusterBedStranded
();
private:
string
_bedFile
;
bool
_forceStrand
;
int
_maxDistance
;
// instance of a bed file class.
BedFile
*
_bed
;
};
src/clusterBed/clusterMain.cpp
0 → 100644
View file @
761d4a58
/*****************************************************************************
clusterMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "clusterBed.h"
#include "version.h"
using
namespace
std
;
// define our program name
#define PROGRAM_NAME "bedtools cluster"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void
cluster_help
(
void
);
int
cluster_main
(
int
argc
,
char
*
argv
[])
{
// our configuration variables
bool
showHelp
=
false
;
// input files
string
bedFile
=
"stdin"
;
int
maxDistance
=
0
;
// input arguments
bool
haveBed
=
true
;
bool
haveMaxDistance
=
false
;
bool
forceStrand
=
false
;
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
((
PARAMETER_CHECK
(
"-h"
,
2
,
parameterLength
))
||
(
PARAMETER_CHECK
(
"--help"
,
5
,
parameterLength
)))
{
showHelp
=
true
;
}
}
if
(
showHelp
)
cluster_help
();
// do some parsing (all of these parameters require 2 strings)
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
(
PARAMETER_CHECK
(
"-i"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
bedFile
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-d"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
haveMaxDistance
=
true
;
maxDistance
=
atoi
(
argv
[
i
+
1
]);
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-s"
,
2
,
parameterLength
))
{
forceStrand
=
true
;
}
else
{
cerr
<<
endl
<<
"*****ERROR: Unrecognized parameter: "
<<
argv
[
i
]
<<
" *****"
<<
endl
<<
endl
;
showHelp
=
true
;
}
}
// make sure we have both input files
if
(
!
haveBed
)
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: Need -i BED file. "
<<
endl
<<
"*****"
<<
endl
;
showHelp
=
true
;
}
if
(
!
showHelp
)
{
BedCluster
*
bc
=
new
BedCluster
(
bedFile
,
maxDistance
,
forceStrand
);
delete
bc
;
}
else
{
cluster_help
();
}
return
0
;
}
void
cluster_help
(
void
)
{
cerr
<<
"
\n
Tool: bedtools cluster"
<<
endl
;
cerr
<<
"Version: "
<<
VERSION
<<
"
\n
"
;
cerr
<<
"Summary: Clusters overlapping/nearby BED/GFF/VCF intervals."
<<
endl
<<
endl
;
cerr
<<
"Usage: "
<<
PROGRAM_NAME
<<
" [OPTIONS] -i <bed/gff/vcf>"
<<
endl
<<
endl
;
cerr
<<
"Options: "
<<
endl
;
cerr
<<
"
\t
-s
\t
"
<<
"Force strandedness. That is, only merge features"
<<
endl
;
cerr
<<
"
\t\t
that are the same strand."
<<
endl
;
cerr
<<
"
\t\t
- By default, merging is done without respect to strand."
<<
endl
<<
endl
;
cerr
<<
"
\t
-d
\t
"
<<
"Maximum distance between features allowed for features"
<<
endl
;
cerr
<<
"
\t\t
to be merged."
<<
endl
;
cerr
<<
"
\t\t
- Def. 0. That is, overlapping & book-ended features are merged."
<<
endl
;
cerr
<<
"
\t\t
- (INTEGER)"
<<
endl
<<
endl
;
cerr
<<
"Notes: "
<<
endl
;
cerr
<<
"
\t
(1) All output, regardless of input type (e.g., GFF or VCF)"
<<
endl
;
cerr
<<
"
\t
will in BED format with zero-based starts"
<<
endl
<<
endl
;
// end the program here
exit
(
1
);
}
src/mergeBed/mergeBed.cpp
View file @
761d4a58
...
...
@@ -241,54 +241,47 @@ void BedMerge::ReportStranded(string chrom, int start, int end,
// = Merge overlapping BED entries into a single entry =
// =====================================================
void
BedMerge
::
MergeBed
()
{
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed
->
loadBedFileIntoMapNoBin
();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin
::
const_iterator
m
=
_bed
->
bedMapNoBin
.
begin
();
masterBedMapNoBin
::
const_iterator
mEnd
=
_bed
->
bedMapNoBin
.
end
();
for
(;
m
!=
mEnd
;
++
m
)
{
// bedList is already sorted by start position.
string
chrom
=
m
->
first
;
vector
<
BED
>
bedList
=
m
->
second
;
int
mergeCount
=
1
;
vector
<
string
>
names
;
vector
<
string
>
scores
;
// merge overlapping features for this chromosome.
int
start
=
-
1
;
int
end
=
-
1
;
vector
<
BED
>::
const_iterator
bedItr
=
bedList
.
begin
();
vector
<
BED
>::
const_iterator
bedEnd
=
bedList
.
end
();
for
(;
bedItr
!=
bedEnd
;
++
bedItr
)
{
// new block, no overlap
if
(
(((
int
)
bedItr
->
start
-
end
)
>
_maxDistance
)
||
(
end
<
0
))
{
if
(
start
>=
0
)
{
Report
(
chrom
,
start
,
end
,
names
,
scores
,
mergeCount
);
// reset
mergeCount
=
1
;
names
.
clear
();
scores
.
clear
();
}
start
=
bedItr
->
start
;
end
=
bedItr
->
end
;
if
(
!
bedItr
->
name
.
empty
())
names
.
push_back
(
bedItr
->
name
);
if
(
!
bedItr
->
score
.
empty
())
scores
.
push_back
(
bedItr
->
score
);
}
// same block, overlaps
else
{
if
((
int
)
bedItr
->
end
>
end
)
end
=
bedItr
->
end
;
mergeCount
++
;
if
(
!
bedItr
->
name
.
empty
())
names
.
push_back
(
bedItr
->
name
);
if
(
!
bedItr
->
score
.
empty
())
scores
.
push_back
(
bedItr
->
score
);
int
mergeCount
=
1
;
vector
<
string
>
names
;
vector
<
string
>
scores
;
int
start
=
-
1
;
int
end
=
-
1
;
BED
prev
,
curr
;
_bed
->
Open
();
while
(
_bed
->
GetNextBed
(
curr
,
true
))
{
// true = force sorted intervals
if
(
_bed
->
_status
!=
BED_VALID
)
continue
;
// new block, no overlap
if
(
(((
int
)
curr
.
start
-
end
)
>
_maxDistance
)
||
(
curr
.
chrom
!=
prev
.
chrom
))
{
if
(
start
>=
0
)
{
Report
(
prev
.
chrom
,
start
,
end
,
names
,
scores
,
mergeCount
);
// reset
mergeCount
=
1
;
names
.
clear
();
scores
.
clear
();
}
start
=
curr
.
start
;
end
=
curr
.
end
;
if
(
!
curr
.
name
.
empty
())
names
.
push_back
(
curr
.
name
);
if
(
!
curr
.
score
.
empty
())
scores
.
push_back
(
curr
.
score
);
}
if
(
start
>=
0
)
{
Report
(
chrom
,
start
,
end
,
names
,
scores
,
mergeCount
);
// same block, overlaps
else
{
if
((
int
)
curr
.
end
>
end
)
end
=
curr
.
end
;
if
(
!
curr
.
name
.
empty
())
names
.
push_back
(
curr
.
name
);
if
(
!
curr
.
score
.
empty
())
scores
.
push_back
(
curr
.
score
);
mergeCount
++
;
}
prev
=
curr
;
}
if
(
start
>=
0
)
{
Report
(
prev
.
chrom
,
start
,
end
,
names
,
scores
,
mergeCount
);
}
}
...
...
src/utils/bedFile/bedFile.cpp
View file @
761d4a58
...
...
@@ -242,7 +242,11 @@ bool BedFile::GetNextBed(BED &bed, bool forceSorted) {
_prev_start
=
bed
.
start
;
}
else
if
(
forceSorted
)
{
cerr
<<
"ERROR: input file: ("
<<
bedFile
<<
") is not sorted by chrom then start"
<<
endl
;
cerr
<<
"ERROR: input file: ("
<<
bedFile
<<
") is not sorted by chrom then start."
<<
endl
<<
" The start coordinate at line "
<<
_lineNum
<<
" is less than the start at line "
<<
_lineNum
-
1
<<
endl
;
exit
(
1
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment