Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
2aad3460
Commit
2aad3460
authored
Jan 12, 2012
by
Aaron
Browse files
ENH: groupBy now uses VectorOps
parent
93d2520c
Changes
5
Hide whitespace changes
Inline
Side-by-side
src/groupBy/Makefile
View file @
2aad3460
...
...
@@ -10,6 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/VectorOps/
\
-I
$(UTILITIES_DIR)
/version/
# ----------------------------------
...
...
@@ -17,7 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \
# ----------------------------------
SOURCES
=
groupBy.cpp
OBJECTS
=
$(SOURCES:.cpp=.o)
_EXT_OBJECTS
=
tabFile.o gzstream.o fileType.o
_EXT_OBJECTS
=
tabFile.o gzstream.o fileType.o
VectorOps.o
EXT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(_EXT_OBJECTS)
)
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
...
...
src/groupBy/groupBy.cpp
View file @
2aad3460
...
...
@@ -27,6 +27,7 @@ Licenced under the MIT license.
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
#include "VectorOps.h"
using
namespace
std
;
...
...
@@ -39,33 +40,12 @@ const int PRECISION = 21;
#define PARAMETER_CHECK(param, paramLen, actualLen) ((strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen))
#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-')
// Comparison functor for (count, value) pairs.
// Returns true when the left pair's count (.first) is strictly larger than
// the right pair's count; the value (.second) is never inspected.
struct ValueGreaterThan
{
    typedef pair<int, string> CountedValue;

    bool operator()(const CountedValue &a, const CountedValue &b) const
    {
        return b.first < a.first;
    }
};
// Comparison functor for (count, value) pairs.
// Returns true when the left pair's count (.first) is strictly smaller than
// the right pair's count; the value (.second) is never inspected.
struct ValueLessThan
{
    typedef pair<int, string> CountedValue;

    bool operator()(const CountedValue &a, const CountedValue &b) const
    {
        return b.first > a.first;
    }
};
// function declarations
void
groupby_help
(
void
);
void
GroupBy
(
const
string
&
inFile
,
const
vector
<
int
>
&
groupColumns
,
const
vector
<
int
>
&
opColumns
,
const
vector
<
string
>
&
ops
,
const
bool
printOriginalLine
,
const
bool
printHeaderLine
,
const
bool
InputHaveHeaderLine
,
const
bool
ignoreCase
);
void
PrintHeaderLine
(
const
vector
<
string
>
&
InputFields
,
const
vector
<
int
>
&
groupColumns
,
const
vector
<
int
>
&
opColumns
,
const
vector
<
string
>
&
ops
,
const
bool
PrintFullInputLine
,
const
bool
InputHaveHeaderLine
);
void
ReportSummary
(
const
vector
<
string
>
&
group
,
const
vector
<
vector
<
string
>
>
&
data
,
const
vector
<
string
>
&
ops
);
void
addValue
(
const
vector
<
string
>
&
fromList
,
vector
<
string
>
&
toList
,
int
index
,
int
lineNum
,
const
bool
ignoreCase
);
float
ToFloat
(
string
element
);
double
ToDouble
(
const
string
&
element
);
void
TabPrintPost
(
string
element
);
void
TabPrintPre
(
string
element
);
void
CommaPrint
(
string
element
);
...
...
@@ -407,170 +387,59 @@ void ReportSummary(const vector<string> &group, const vector<vector<string> > &d
string
op
=
ops
[
i
];
std
::
stringstream
buffer
;
vector
<
double
>
dataF
;
// are we doing a numeric conversion? if so, convert the strings to doubles.
if
((
op
==
"sum"
)
||
(
op
==
"max"
)
||
(
op
==
"min"
)
||
(
op
==
"mean"
)
||
(
op
==
"median"
)
||
(
op
==
"stdev"
)
||
(
op
==
"sstdev"
))
{
transform
(
data
[
i
].
begin
(),
data
[
i
].
end
(),
back_inserter
(
dataF
),
ToDouble
);
}
VectorOps
vo
(
data
[
i
]);
if
(
op
==
"sum"
)
{
// sum them up
double
total
=
accumulate
(
dataF
.
begin
(),
dataF
.
end
(),
0.0
);
buffer
<<
setprecision
(
PRECISION
)
<<
total
;
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetSum
();
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"collapse"
)
{
string
collapse
;
for
(
size_t
j
=
0
;
j
<
data
[
i
].
size
();
j
++
)
{
//Ugly, but cannot use back_inserter
if
(
j
>
0
)
collapse
.
append
(
","
);
collapse
.
append
(
data
[
i
][
j
]);
}
result
.
push_back
(
collapse
);
result
.
push_back
(
vo
.
GetCollapse
());
}
else
if
(
op
==
"distinct"
)
{
string
distinct
;
// get the current column's data
vector
<
string
>
col_data
=
data
[
i
];
// remove duplicate entries from the vector
// http://stackoverflow.com/questions/1041620/most-efficient-way-to-erase-duplicates-and-sort-a-c-vector
sort
(
col_data
.
begin
(),
col_data
.
end
()
);
col_data
.
erase
(
unique
(
col_data
.
begin
(),
col_data
.
end
()
),
col_data
.
end
()
);
for
(
size_t
j
=
0
;
j
<
col_data
.
size
();
j
++
)
{
//Ugly, but cannot use back_inserter
if
(
j
>
0
)
distinct
.
append
(
","
);
distinct
.
append
(
col_data
[
j
]);
}
result
.
push_back
(
distinct
);
result
.
push_back
(
vo
.
GetDistinct
());
}
else
if
(
op
==
"concat"
)
{
string
concat
;
for
(
size_t
j
=
0
;
j
<
data
[
i
].
size
();
j
++
)
{
//Ugly, but cannot use back_inserter
concat
.
append
(
data
[
i
][
j
]);
}
result
.
push_back
(
concat
);
result
.
push_back
(
vo
.
GetConcat
());
}
else
if
(
op
==
"min"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
*
min_element
(
dataF
.
begin
(),
dataF
.
end
()
);
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetMin
(
);
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"max"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
*
max_element
(
dataF
.
begin
(),
dataF
.
end
()
);
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetMax
(
);
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"mean"
)
{
double
total
=
accumulate
(
dataF
.
begin
(),
dataF
.
end
(),
0.0
);
double
mean
=
total
/
dataF
.
size
();
buffer
<<
setprecision
(
PRECISION
)
<<
mean
;
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetMean
();
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"median"
)
{
double
median
=
0.0
;
sort
(
dataF
.
begin
(),
dataF
.
end
());
int
totalLines
=
dataF
.
size
();
if
((
totalLines
%
2
)
>
0
)
{
long
mid
;
mid
=
totalLines
/
2
;
median
=
dataF
[
mid
];
}
else
{
long
midLow
,
midHigh
;
midLow
=
(
totalLines
/
2
)
-
1
;
midHigh
=
(
totalLines
/
2
);
median
=
(
dataF
[
midLow
]
+
dataF
[
midHigh
])
/
2.0
;
}
buffer
<<
setprecision
(
PRECISION
)
<<
median
;
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetMedian
();
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"count"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
data
[
i
].
size
();
result
.
push_back
(
buffer
.
str
());
}
else
if
((
op
==
"mode"
)
||
(
op
==
"antimode"
)
||
(
op
==
"freqdesc"
)
||
(
op
==
"freqasc"
))
{
// compute the frequency of each unique value
map
<
string
,
int
>
freqs
;
vector
<
string
>::
const_iterator
dIt
=
data
[
i
].
begin
();
vector
<
string
>::
const_iterator
dEnd
=
data
[
i
].
end
();
for
(;
dIt
!=
dEnd
;
++
dIt
)
{
freqs
[
*
dIt
]
++
;
}
// grab the mode and the anti mode
string
mode
,
antiMode
;
int
count
=
0
;
int
minCount
=
INT_MAX
;
for
(
map
<
string
,
int
>::
const_iterator
iter
=
freqs
.
begin
();
iter
!=
freqs
.
end
();
++
iter
)
{
if
(
iter
->
second
>
count
)
{
mode
=
iter
->
first
;
count
=
iter
->
second
;
}
if
(
iter
->
second
<
minCount
)
{
antiMode
=
iter
->
first
;
minCount
=
iter
->
second
;
}
}
// report
if
(
op
==
"mode"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
mode
;
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"antimode"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
antiMode
;
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"freqdesc"
||
op
==
"freqasc"
)
{
// pair for the num times a values was
// observed (1) and the value itself (2)
pair
<
int
,
string
>
freqPair
;
vector
<
pair
<
int
,
string
>
>
freqList
;
// create a list of pairs of all the observed values (second)
// and their occurences (first)
map
<
string
,
int
>::
const_iterator
mapIter
=
freqs
.
begin
();
map
<
string
,
int
>::
const_iterator
mapEnd
=
freqs
.
end
();
for
(;
mapIter
!=
mapEnd
;
++
mapIter
)
freqList
.
push_back
(
make_pair
(
mapIter
->
second
,
mapIter
->
first
)
);
// sort the list of pairs in the requested order by the frequency
// this will make the value that was observed least/most bubble to the top
if
(
op
==
"freqdesc"
)
sort
(
freqList
.
begin
(),
freqList
.
end
(),
ValueGreaterThan
());
else
if
(
op
==
"freqasc"
)
sort
(
freqList
.
begin
(),
freqList
.
end
(),
ValueLessThan
());
// record all of the values and their frequencies.
vector
<
pair
<
int
,
string
>
>::
const_iterator
iter
=
freqList
.
begin
();
vector
<
pair
<
int
,
string
>
>::
const_iterator
iterEnd
=
freqList
.
end
();
for
(;
iter
!=
iterEnd
;
++
iter
)
buffer
<<
iter
->
second
<<
":"
<<
iter
->
first
<<
","
;
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"mode"
)
{
result
.
push_back
(
vo
.
GetMode
());
}
else
if
(
op
==
"stdev"
||
op
==
"sstdev"
)
{
// get the mean
double
total
=
accumulate
(
dataF
.
begin
(),
dataF
.
end
(),
0.0
);
double
mean
=
total
/
dataF
.
size
();
// get the variance
double
totalVariance
=
0.0
;
vector
<
double
>::
const_iterator
dIt
=
dataF
.
begin
();
vector
<
double
>::
const_iterator
dEnd
=
dataF
.
end
();
for
(;
dIt
!=
dEnd
;
++
dIt
)
{
totalVariance
+=
pow
((
*
dIt
-
mean
),
2
);
}
double
variance
=
0.0
;
if
(
op
==
"stdev"
)
{
variance
=
totalVariance
/
dataF
.
size
();
}
else
if
(
op
==
"sstdev"
&&
dataF
.
size
()
>
1
)
{
variance
=
totalVariance
/
(
dataF
.
size
()
-
1
);
}
double
stddev
=
sqrt
(
variance
);
// report
buffer
<<
setprecision
(
PRECISION
)
<<
stddev
;
else
if
(
op
==
"antimode"
)
{
result
.
push_back
(
vo
.
GetAntiMode
());
}
else
if
(
op
==
"freqdesc"
)
{
result
.
push_back
(
vo
.
GetFreqDesc
());
}
else
if
(
op
==
"freqasc"
)
{
result
.
push_back
(
vo
.
GetFreqAsc
());
}
else
if
(
op
==
"stdev"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetStddev
();
result
.
push_back
(
buffer
.
str
());
}
else
if
(
op
==
"sstdev"
)
{
buffer
<<
setprecision
(
PRECISION
)
<<
vo
.
GetSstddev
();
result
.
push_back
(
buffer
.
str
());
}
}
...
...
@@ -596,10 +465,6 @@ void addValue (const vector<string> &fromList, vector<string> &toList, int index
}
// Convert the text in `element` to a float.
// The text is handed to atof() and the resulting double is narrowed to float.
// atof() has no error reporting, so no failure is signalled to the caller.
float ToFloat(string element)
{
    const char *text = element.c_str();
    const double parsed = atof(text);
    return static_cast<float>(parsed);
}
// Write `element` to standard output, followed by one tab character.
void TabPrintPost(string element)
{
    cout << element;
    cout << "\t";
}
...
...
@@ -612,16 +477,6 @@ void CommaPrint (string element) {
cout
<<
element
<<
","
;
}
// Convert `element` to a double using formatted stream extraction.
// If the extraction fails, an error message quoting the offending text is
// written to stderr and the process terminates with exit status 1.
double ToDouble(const string &element)
{
    double value;
    std::istringstream parser(element);
    parser >> value;

    if (parser.fail()) {
        cerr << "Error: Could not properly convert string to numeric (\"" + element + "\")" << endl;
        exit(1);
    }
    return value;
}
inline
string
ColumnHeaderName
(
const
vector
<
string
>
&
inFields
,
const
size_t
FieldIndex
,
bool
InputHaveHeaderLine
)
{
...
...
src/mapBed/Makefile
View file @
2aad3460
...
...
@@ -21,7 +21,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
# ----------------------------------
SOURCES
=
mapMain.cpp mapBed.cpp
OBJECTS
=
$(SOURCES:.cpp=.o)
_EXT_OBJECTS
=
bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o chromsweep.o
_EXT_OBJECTS
=
bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o
VectorOps.o
chromsweep.o
EXT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(_EXT_OBJECTS)
)
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
PROGRAM
=
mapBed
...
...
@@ -41,6 +41,7 @@ $(EXT_OBJECTS):
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/BamTools-Ancillary/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/gzstream/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/fileType/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/VectorOps/
@
$(MAKE)
--no-print-directory
-C
$(UTILITIES_DIR)
/chromsweep/
clean
:
...
...
src/utils/VectorOps/VectorOps.cpp
View file @
2aad3460
...
...
@@ -25,6 +25,24 @@ double MakeDouble(const string &element) {
return
x
;
}
// Orders (count, value) pairs by count alone: true when the left count is
// strictly greater than the right count. The value string plays no part.
struct ValueGreaterThan
{
    bool operator()(const pair<int, string> &left,
                    const pair<int, string> &right) const
    {
        const int leftCount  = left.first;
        const int rightCount = right.first;
        return leftCount > rightCount;
    }
};
// Orders (count, value) pairs by count alone: true when the left count is
// strictly less than the right count. The value string plays no part.
struct ValueLessThan
{
    bool operator()(const pair<int, string> &left,
                    const pair<int, string> &right) const
    {
        const int leftCount  = left.first;
        const int rightCount = right.first;
        return leftCount < rightCount;
    }
};
// Constructor
VectorOps
::
VectorOps
(
const
vector
<
string
>
&
vec
)
:
_vecs
(
vec
)
...
...
@@ -52,6 +70,34 @@ double VectorOps::GetMean(void)
return
accumulate
(
_vecd
.
begin
(),
_vecd
.
end
(),
0.0
)
/
_size
;
}
// Return the population standard deviation of the values in _vecd:
// the square root of the summed squared deviations from the mean,
// divided by the number of values (N, not N-1).
double VectorOps::GetStddev(void)
{
    // NOTE(review): GetMean() is called before _vecd is read; presumably it
    // is what fills _vecd with the numeric values -- confirm against GetMean().
    const double average = GetMean();

    // accumulate the squared distance of every value from the mean
    double squaredDeviationSum = 0.0;
    for (size_t k = 0; k < _vecd.size(); ++k) {
        squaredDeviationSum += pow((_vecd[k] - average), 2);
    }

    const double populationVariance = squaredDeviationSum / _vecd.size();
    return sqrt(populationVariance);
}
// Return the sample standard deviation of the values in _vecd:
// the square root of the summed squared deviations from the mean,
// divided by (N - 1).
//
// With fewer than two values the sample variance is undefined, so 0 is
// returned. Without this guard a single value yields 0/0, and an empty
// vector divides by an unsigned size that has wrapped around.
double VectorOps::GetSstddev(void)
{
    // NOTE(review): GetMean() is called before _vecd is read; presumably it
    // is what fills _vecd with the numeric values -- confirm against GetMean().
    // It is therefore called even when the early return below is taken.
    const double mean = GetMean();

    const size_t numValues = _vecd.size();
    if (numValues < 2) {
        return 0.0;
    }

    // accumulate the squared distance of every value from the mean
    double totalVariance = 0.0;
    vector<double>::const_iterator dIt  = _vecd.begin();
    vector<double>::const_iterator dEnd = _vecd.end();
    for (; dIt != dEnd; ++dIt) {
        totalVariance += pow((*dIt - mean), 2);
    }

    // N - 1 in the denominator is what distinguishes this from GetStddev
    const double variance = totalVariance / (numValues - 1);
    return sqrt(variance);
}
double
VectorOps
::
GetMedian
(
void
)
{
// convert the vec of strings to a vec of doubles
...
...
@@ -169,6 +215,14 @@ string VectorOps::GetCollapse(void)
return
collapse
.
str
();
}
// Return every element of _vecs appended one after another, in order,
// with no separator between them.
string VectorOps::GetConcat(void)
{
    ostringstream joined;

    vector<string>::const_iterator it = _vecs.begin();
    const vector<string>::const_iterator stop = _vecs.end();
    while (it != stop) {
        joined << *it;
        ++it;
    }
    return joined.str();
}
string
VectorOps
::
GetDistinct
(
void
)
{
ostringstream
distinct
;
...
...
@@ -183,4 +237,75 @@ string VectorOps::GetDistinct(void)
distinct
<<
_vecs
[
i
];
}
return
distinct
.
str
();
}
// Return a histogram of the values in _vecs, most frequent first.
//
// The result is a sequence of "value:count," entries (each entry, including
// the last, is followed by a comma). Values that occur equally often are
// emitted in ascending string order: the std::map below iterates in key
// order and stable_sort preserves that order among equal counts.
string VectorOps::GetFreqDesc(void)
{
    // compute the frequency of each unique value
    map<string, int> freqs;
    vector<string>::const_iterator dIt  = _vecs.begin();
    vector<string>::const_iterator dEnd = _vecs.end();
    for (; dIt != dEnd; ++dIt) {
        freqs[*dIt]++;
    }

    // create a list of pairs holding each value's number of occurrences
    // (first) and the observed value itself (second), so it can be sorted
    // by frequency
    vector< pair<int, string> > freqList;
    freqList.reserve(freqs.size());
    map<string, int>::const_iterator mapIter = freqs.begin();
    map<string, int>::const_iterator mapEnd  = freqs.end();
    for (; mapIter != mapEnd; ++mapIter) {
        freqList.push_back(make_pair(mapIter->second, mapIter->first));
    }

    // sort by descending frequency; stable so that ties have a
    // deterministic order instead of whatever std::sort happens to produce
    stable_sort(freqList.begin(), freqList.end(), ValueGreaterThan());

    // record all of the values and their frequencies.
    ostringstream buffer;
    vector< pair<int, string> >::const_iterator iter    = freqList.begin();
    vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
    for (; iter != iterEnd; ++iter) {
        buffer << iter->second << ":" << iter->first << ",";
    }
    return buffer.str();
}
// Return a histogram of the values in _vecs, least frequent first.
//
// The result is a sequence of "value:count," entries (each entry, including
// the last, is followed by a comma). Values that occur equally often are
// emitted in ascending string order: the std::map below iterates in key
// order and stable_sort preserves that order among equal counts.
string VectorOps::GetFreqAsc(void)
{
    // compute the frequency of each unique value
    map<string, int> freqs;
    vector<string>::const_iterator dIt  = _vecs.begin();
    vector<string>::const_iterator dEnd = _vecs.end();
    for (; dIt != dEnd; ++dIt) {
        freqs[*dIt]++;
    }

    // create a list of pairs holding each value's number of occurrences
    // (first) and the observed value itself (second), so it can be sorted
    // by frequency
    vector< pair<int, string> > freqList;
    freqList.reserve(freqs.size());
    map<string, int>::const_iterator mapIter = freqs.begin();
    map<string, int>::const_iterator mapEnd  = freqs.end();
    for (; mapIter != mapEnd; ++mapIter) {
        freqList.push_back(make_pair(mapIter->second, mapIter->first));
    }

    // sort by ascending frequency; stable so that ties have a
    // deterministic order instead of whatever std::sort happens to produce
    stable_sort(freqList.begin(), freqList.end(), ValueLessThan());

    // record all of the values and their frequencies.
    ostringstream buffer;
    vector< pair<int, string> >::const_iterator iter    = freqList.begin();
    vector< pair<int, string> >::const_iterator iterEnd = freqList.end();
    for (; iter != iterEnd; ++iter) {
        buffer << iter->second << ":" << iter->first << ",";
    }
    return buffer.str();
}
\ No newline at end of file
src/utils/VectorOps/VectorOps.h
View file @
2aad3460
...
...
@@ -13,6 +13,8 @@
#define VECTOROPS_H
#include <vector>
#include <map>
#include <math.h>
#include <string>
#include <sstream>
#include <algorithm>
...
...
@@ -38,16 +40,20 @@ public:
// user-interface
double
GetSum
(
void
);
// return the total of the values in the vector
double
GetMean
(
void
);
// return the average value in the vector
double
GetStddev
(
void
);
// return the standard deviation
double
GetSstddev
(
void
);
// return the sample standard deviation
double
GetMedian
(
void
);
// return the median value in the vector
string
GetMode
(
void
);
// return the most common value in the vector
string
GetAntiMode
(
void
);
// return the least common value in the vector
double
GetMin
(
void
);
// return the minimum element of the vector
double
GetMax
(
void
);
// return the maximum element of the vector
uint32_t
GetCount
(
void
);
// return the count of element in the vector
uint32_t
GetCountDistinct
(
void
);
// return the count of _unique_ elements in the
list
uint32_t
GetCountDistinct
(
void
);
// return the count of _unique_ elements in the
vector
string
GetCollapse
(
void
);
// return a comma-separated list of elements
string
GetConcat
(
void
);
// return a concatenation of all elements in the vector
string
GetDistinct
(
void
);
// return a comma-separated list of the _unique_ elements
string
GetFreqDesc
(
void
);
// return a histogram of values and their frequencies in desc. order of frequency
string
GetFreqAsc
(
void
);
// return a histogram of values and their frequencies in asc. order of frequency
private:
vector
<
string
>
_vecs
;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment