Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
R3
legacy
bedtools2
Commits
b1e3d8de
Commit
b1e3d8de
authored
Jul 15, 2015
by
Aaron Quinlan
Browse files
Merge pull request #265 from nkindlon/master
1st check-in: Converted groupby to PFM
parents
b0bc5b7e
53ecd3c1
Changes
36
Hide whitespace changes
Inline
Side-by-side
src/bedtools.cpp
View file @
b1e3d8de
...
@@ -54,7 +54,7 @@ int fastafrombed_main(int argc, char* argv[]);//
...
@@ -54,7 +54,7 @@ int fastafrombed_main(int argc, char* argv[]);//
int
flank_main
(
int
argc
,
char
*
argv
[]);
//
int
flank_main
(
int
argc
,
char
*
argv
[]);
//
int
genomecoverage_main
(
int
argc
,
char
*
argv
[]);
//
int
genomecoverage_main
(
int
argc
,
char
*
argv
[]);
//
int
getoverlap_main
(
int
argc
,
char
*
argv
[]);
//
int
getoverlap_main
(
int
argc
,
char
*
argv
[]);
//
int
groupby_
main
(
int
argc
,
char
*
argv
[]);
//
void
groupby_
help
();
void
intersect_help
();
void
intersect_help
();
void
map_help
();
void
map_help
();
void
jaccard_help
();
//
void
jaccard_help
();
//
...
@@ -146,7 +146,6 @@ int main(int argc, char *argv[])
...
@@ -146,7 +146,6 @@ int main(int argc, char *argv[])
else
if
(
subCmd
==
"igv"
)
return
bedtoigv_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"igv"
)
return
bedtoigv_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"links"
)
return
links_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"links"
)
return
links_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"makewindows"
)
return
windowmaker_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"makewindows"
)
return
windowmaker_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"groupby"
)
return
groupby_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"expand"
)
return
expand_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"expand"
)
return
expand_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"neksb1"
)
return
nek_sandbox1_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"neksb1"
)
return
nek_sandbox1_main
(
argc
-
1
,
argv
+
1
);
else
if
(
subCmd
==
"regresstest"
)
return
regress_test_main
(
argc
,
argv
);
//this command does need all the orig args.
else
if
(
subCmd
==
"regresstest"
)
return
regress_test_main
(
argc
,
argv
);
//this command does need all the orig args.
...
@@ -308,6 +307,9 @@ void showHelp(const QuickString &subCmd) {
...
@@ -308,6 +307,9 @@ void showHelp(const QuickString &subCmd) {
coverage_help
();
coverage_help
();
}
else
if
(
subCmd
==
"complement"
)
{
}
else
if
(
subCmd
==
"complement"
)
{
complement_help
();
complement_help
();
}
else
if
(
subCmd
==
"groupby"
)
{
groupby_help
();
}
}
}
}
src/groupBy/Makefile
View file @
b1e3d8de
UTILITIES_DIR
=
../utils/
UTILITIES_DIR
=
../utils/
OBJ_DIR
=
../../obj/
OBJ_DIR
=
../../obj/
BIN_DIR
=
../../bin/
BIN_DIR
=
../../bin/
TOOL_DIR
=
../../src/
# -------------------
# -------------------
# define our includes
# define our includes
# -------------------
# -------------------
INCLUDES
=
-I
$(UTILITIES_DIR)
/tabFile/
\
INCLUDES
=
-I
$(UTILITIES_DIR)
/Contexts/
\
-I
$(UTILITIES_DIR)
/lineFileUtilities/
\
-I
$(UTILITIES_DIR)
/general/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/fileType/
\
-I
$(UTILITIES_DIR)
/gzstream/
\
-I
$(UTILITIES_DIR)
/VectorOps/
\
-I
$(UTILITIES_DIR)
/GenomeFile/
\
-I
$(UTILITIES_DIR)
/version/
-I
$(UTILITIES_DIR)
/BamTools/include
\
-I
$(UTILITIES_DIR)
/BamTools/src
\
-I
$(UTILITIES_DIR)
/BlockedIntervals
\
-I
$(UTILITIES_DIR)
/BamTools-Ancillary
\
-I
$(UTILITIES_DIR)
/FileRecordTools/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/FileReaders/
\
-I
$(UTILITIES_DIR)
/FileRecordTools/Records/
\
-I
$(UTILITIES_DIR)
/KeyListOps/
\
-I
$(UTILITIES_DIR)
/RecordOutputMgr/
\
-I
$(UTILITIES_DIR)
/NewChromsweep
\
-I
$(UTILITIES_DIR)
/BinTree
\
-I
$(UTILITIES_DIR)
/version/
\
-I
$(UTILITIES_DIR)
/ToolBase/
\
-I
$(TOOL_DIR)
/intersectFile/
# ----------------------------------
# ----------------------------------
# define our source and object files
# define our source and object files
# ----------------------------------
# ----------------------------------
SOURCES
=
groupBy.cpp
$(UTILITIES_DIR)
/lineFileUtilities/lineFileUtilities
.h
SOURCES
=
groupBy
Help
.cpp
groupBy.cpp groupBy
.h
OBJECTS
=
groupBy.o
OBJECTS
=
groupBy.o
groupByHelp.o
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
BUILT_OBJECTS
=
$(
patsubst
%,
$(OBJ_DIR)
/%,
$(OBJECTS)
)
PROGRAM
=
groupBy
all
:
$(BUILT_OBJECTS)
all
:
$(BUILT_OBJECTS)
.PHONY
:
all
.PHONY
:
all
$(BUILT_OBJECTS)
:
$(SOURCES)
$(BUILT_OBJECTS)
:
$(SOURCES)
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
echo
" * compiling"
$
(
*
F
)
.cpp
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(INCLUDES)
@
$(CXX)
-c
-o
$@
$
(
*
F
)
.cpp
$(LDFLAGS)
$(CXXFLAGS)
$(DFLAGS)
$(INCLUDES)
clean
:
clean
:
@
echo
"Cleaning up."
@
echo
"Cleaning up."
@
rm
-f
$(OBJ_DIR)
/groupBy.o
@
rm
-f
$(OBJ_DIR)
/groupBy.o
$(OBJ_DIR)
/groupBy.o
.PHONY
:
clean
.PHONY
:
clean
src/groupBy/groupBy.cpp
View file @
b1e3d8de
/*****************************************************************************
/* groupBy.cpp
groupBy.cpp
*
* Created on: Apr 22, 2015
(c) 2009, 2010, 2011 - Aaron Quinlan
* Author: nek3d
Center for Public Health Genomics
*/
University of Virginia
aaronquinlan@gmail.com
#include "groupBy.h"
#include "Tokenizer.h"
Licenced under the MIT license.
#include "ParseTools.h"
******************************************************************************/
#include <vector>
GroupBy
::
GroupBy
(
ContextGroupBy
*
context
)
#include <map>
:
ToolBase
(
context
),
#include <numeric>
_queryFRM
(
NULL
),
#include <algorithm>
_prevRecord
(
NULL
)
#include <iterator>
{
#include <iostream>
#include <iomanip>
#include <fstream>
#include <sstream>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <string.h>
#include <exception>
#include <stdexcept> // out_of_range exception
#include "version.h"
#include "lineFileUtilities.h"
#include "tabFile.h"
#include "VectorOps.h"
using
namespace
std
;
// define our program name
#define PROGRAM_NAME "bedtools groupby"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) ((strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen))
#define LOOKS_LIKE_A_PARAM(string) (strlen(string)>0 && string[0]=='-')
// function declarations
void
groupby_help
(
void
);
void
GroupBy
(
const
string
&
inFile
,
const
vector
<
int
>
&
groupColumns
,
const
vector
<
int
>
&
opColumns
,
const
vector
<
string
>
&
ops
,
const
bool
printOriginalLine
,
const
bool
printHeaderLine
,
const
bool
InputHaveHeaderLine
,
const
bool
ignoreCase
,
int
precision
,
const
string
&
delim
);
void
PrintHeaderLine
(
const
vector
<
string
>
&
InputFields
,
const
vector
<
int
>
&
groupColumns
,
const
vector
<
int
>
&
opColumns
,
const
vector
<
string
>
&
ops
,
const
bool
PrintFullInputLine
,
const
bool
InputHaveHeaderLine
);
void
ReportSummary
(
const
vector
<
string
>
&
group
,
const
vector
<
vector
<
string
>
>
&
data
,
const
vector
<
string
>
&
ops
,
int
precision
,
const
string
&
delim
);
void
addValue
(
const
vector
<
string
>
&
fromList
,
vector
<
string
>
&
toList
,
int
index
,
int
lineNum
,
const
bool
ignoreCase
);
void
TabPrintPost
(
string
element
);
void
TabPrintPre
(
string
element
);
void
CommaPrint
(
string
element
);
int
groupby_main
(
int
argc
,
char
*
argv
[])
{
// input files
string
inFile
=
"stdin"
;
string
groupColumnsString
=
"1,2,3"
;
string
opsColumnString
;
string
opsString
;
// our configuration variables
bool
showHelp
=
false
;
bool
haveOpColumns
=
false
;
bool
haveOps
=
true
;
bool
printOriginalLine
=
false
;
bool
printHeaderLine
=
false
;
bool
InputHaveHeaderLine
=
false
;
bool
ignoreCase
=
false
;
int
precision
=
21
;
string
delim
(
","
);
// check to see if we should print out some help
if
(
argc
<=
1
)
showHelp
=
true
;
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
((
PARAMETER_CHECK
(
"-h"
,
2
,
parameterLength
))
||
(
PARAMETER_CHECK
(
"--help"
,
5
,
parameterLength
)))
{
showHelp
=
true
;
}
}
if
(
showHelp
)
groupby_help
();
// do some parsing (all of these parameters require 2 strings)
for
(
int
i
=
1
;
i
<
argc
;
i
++
)
{
int
parameterLength
=
(
int
)
strlen
(
argv
[
i
]);
if
(
PARAMETER_CHECK
(
"-i"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
<
argc
)
{
inFile
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-grp"
,
4
,
parameterLength
)
||
PARAMETER_CHECK
(
"-g"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
>=
argc
||
LOOKS_LIKE_A_PARAM
(
argv
[
i
+
1
]))
{
cerr
<<
endl
<<
"*****ERROR: -grp parameter requires a value."
<<
endl
<<
endl
;
groupby_help
();
break
;
}
else
{
groupColumnsString
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-opCols"
,
7
,
parameterLength
)
||
PARAMETER_CHECK
(
"-c"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
>=
argc
||
LOOKS_LIKE_A_PARAM
(
argv
[
i
+
1
]))
{
cerr
<<
endl
<<
"*****ERROR: -opCols parameter requires a value."
<<
endl
<<
endl
;
groupby_help
();
break
;
}
else
{
haveOpColumns
=
true
;
opsColumnString
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-ops"
,
4
,
parameterLength
)
||
PARAMETER_CHECK
(
"-o"
,
2
,
parameterLength
))
{
if
((
i
+
1
)
>=
argc
||
LOOKS_LIKE_A_PARAM
(
argv
[
i
+
1
]))
{
cerr
<<
endl
<<
"*****ERROR: -ops parameter requires a value."
<<
endl
<<
endl
;
groupby_help
();
break
;
}
else
{
haveOps
=
true
;
opsString
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-delim"
,
6
,
parameterLength
))
{
if
((
i
+
1
)
>=
argc
||
LOOKS_LIKE_A_PARAM
(
argv
[
i
+
1
]))
{
cerr
<<
endl
<<
"*****ERROR: -delim parameter requires a value."
<<
endl
<<
endl
;
groupby_help
();
break
;
}
else
{
delim
=
argv
[
i
+
1
];
i
++
;
}
}
else
if
(
PARAMETER_CHECK
(
"-full"
,
5
,
parameterLength
))
{
printOriginalLine
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-outheader"
,
10
,
parameterLength
))
{
printHeaderLine
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-inheader"
,
9
,
parameterLength
))
{
InputHaveHeaderLine
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-header"
,
7
,
parameterLength
))
{
InputHaveHeaderLine
=
true
;
printHeaderLine
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-ignorecase"
,
11
,
parameterLength
))
{
ignoreCase
=
true
;
}
else
if
(
PARAMETER_CHECK
(
"-prec"
,
5
,
parameterLength
))
{
if
((
i
+
1
)
>=
argc
||
LOOKS_LIKE_A_PARAM
(
argv
[
i
+
1
]))
{
cerr
<<
endl
<<
"*****ERROR: -prec parameter requires a value."
<<
endl
<<
endl
;
groupby_help
();
break
;
}
else
{
precision
=
atoi
(
argv
[
i
+
1
]);
i
++
;
}
}
else
{
cerr
<<
endl
<<
"*****ERROR: Unrecognized parameter: "
<<
argv
[
i
]
<<
" *****"
<<
endl
<<
endl
;
showHelp
=
true
;
}
}
if
(
!
haveOpColumns
)
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: Need -opCols."
<<
endl
<<
"*****"
<<
endl
;
showHelp
=
true
;
}
// split the opsString into discrete operations and make sure they are all valid.
vector
<
string
>
ops
;
opsString
.
erase
(
remove_if
(
opsString
.
begin
(),
opsString
.
end
(),
::
isspace
),
opsString
.
end
());
Tokenize
(
opsString
,
ops
,
','
);
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
i
++
)
{
if
((
ops
[
i
]
!=
"sum"
)
&&
(
ops
[
i
]
!=
"max"
)
&&
(
ops
[
i
]
!=
"min"
)
&&
(
ops
[
i
]
!=
"mean"
)
&&
(
ops
[
i
]
!=
"mode"
)
&&
(
ops
[
i
]
!=
"median"
)
&&
(
ops
[
i
]
!=
"antimode"
)
&&
(
ops
[
i
]
!=
"stdev"
)
&&
(
ops
[
i
]
!=
"sstdev"
)
&&
(
ops
[
i
]
!=
"count"
)
&&
(
ops
[
i
]
!=
"count_distinct"
)
&&
(
ops
[
i
]
!=
"collapse"
)
&&
(
ops
[
i
]
!=
"distinct"
)
&&
(
ops
[
i
]
!=
"distinct_sort_num"
)
&&
(
ops
[
i
]
!=
"distinct_sort_num_desc"
)
&&
(
ops
[
i
]
!=
"concat"
)
&&
(
ops
[
i
]
!=
"freqdesc"
)
&&
(
ops
[
i
]
!=
"freqasc"
)
&&
(
ops
[
i
]
!=
"first"
)
&&
(
ops
[
i
]
!=
"last"
)
&&
(
ops
[
i
]
!=
"delim"
))
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: Invalid operation selection
\"
"
<<
ops
[
i
]
<<
endl
<<
"
\"
*****"
<<
endl
;
showHelp
=
true
;
}
}
if
(
!
showHelp
)
{
// Split the column string sent by the user into discrete column numbers
// A comma separated string is expected.
vector
<
int
>
groupColumnsInt
;
TokenizeColumns
(
groupColumnsString
,
groupColumnsInt
);
vector
<
int
>
opColumnsInt
;
TokenizeColumns
(
opsColumnString
,
opColumnsInt
);
// sanity check the group columns
for
(
size_t
i
=
0
;
i
<
groupColumnsInt
.
size
();
++
i
)
{
int
groupColumnInt
=
groupColumnsInt
[
i
];
if
(
groupColumnInt
<
1
)
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: group columns must be >=1. "
<<
endl
<<
"*****"
<<
endl
;
groupby_help
();
}
}
// sanity check the op columns
for
(
size_t
i
=
0
;
i
<
opColumnsInt
.
size
();
++
i
)
{
int
opColumnInt
=
opColumnsInt
[
i
];
if
(
opColumnInt
<
1
)
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: op columns must be >=1. "
<<
endl
<<
"*****"
<<
endl
;
groupby_help
();
}
}
// sanity check that there are equal number of opColumns and ops
if
(
ops
.
size
()
!=
opColumnsInt
.
size
())
{
cerr
<<
endl
<<
"*****"
<<
endl
<<
"*****ERROR: There must be equal num. of ops and opCols. "
<<
endl
<<
"*****"
<<
endl
;
groupby_help
();
}
GroupBy
(
inFile
,
groupColumnsInt
,
opColumnsInt
,
ops
,
printOriginalLine
,
printHeaderLine
,
InputHaveHeaderLine
,
ignoreCase
,
precision
,
delim
);
}
else
{
groupby_help
();
}
return
0
;
}
}
void
groupby_help
(
void
)
{
GroupBy
::~
GroupBy
()
{
cerr
<<
"
\n
Tool: bedtools groupby "
<<
endl
;
cerr
<<
"Version: "
<<
VERSION
<<
"
\n
"
;
cerr
<<
"Summary: Summarizes a dataset column based upon"
<<
endl
;
cerr
<<
"
\t
common column groupings. Akin to the SQL
\"
group by
\"
command."
<<
endl
<<
endl
;
cerr
<<
"Usage:
\t
"
<<
PROGRAM_NAME
<<
" -g [group_column(s)] -c [op_column(s)] -o [ops] "
<<
endl
;
cerr
<<
"
\t
"
<<
"cat [FILE] | "
<<
PROGRAM_NAME
<<
" -g [group_column(s)] -c [op_column(s)] -o [ops] "
<<
endl
<<
endl
;
cerr
<<
"Options: "
<<
endl
;
cerr
<<
"
\t
-i
\t\t
"
<<
"Input file. Assumes
\"
stdin
\"
if omitted."
<<
endl
<<
endl
;
cerr
<<
"
\t
-g -grp
\t\t
"
<<
"Specify the columns (1-based) for the grouping."
<<
endl
;
cerr
<<
"
\t\t\t
The columns must be comma separated."
<<
endl
;
cerr
<<
"
\t\t\t
- Default: 1,2,3"
<<
endl
<<
endl
;
cerr
<<
"
\t
-c -opCols
\t
"
<<
"Specify the column (1-based) that should be summarized."
<<
endl
;
cerr
<<
"
\t\t\t
- Required."
<<
endl
<<
endl
;
cerr
<<
"
\t
-o -ops
\t\t
"
<<
"Specify the operation that should be applied to opCol."
<<
endl
;
cerr
<<
"
\t\t\t
Valid operations:"
<<
endl
;
cerr
<<
"
\t\t\t
sum, count, count_distinct, min, max,"
<<
endl
;
cerr
<<
"
\t\t\t
mean, median, mode, antimode,"
<<
endl
;
cerr
<<
"
\t\t\t
stdev, sstdev (sample standard dev.),"
<<
endl
;
cerr
<<
"
\t\t\t
collapse (i.e., print a comma separated list (duplicates allowed)), "
<<
endl
;
cerr
<<
"
\t\t\t
distinct (i.e., print a comma separated list (NO duplicates allowed)), "
<<
endl
;
cerr
<<
"
\t\t\t
distinct_sort_num (as distinct, but sorted numerically, ascending), "
<<
endl
;
cerr
<<
"
\t\t\t
distinct_sort_num_desc (as distinct, but sorted numerically, descending), "
<<
endl
;
cerr
<<
"
\t\t\t
concat (i.e., merge values into a single, non-delimited string), "
<<
endl
;
cerr
<<
"
\t\t\t
freqdesc (i.e., print desc. list of values:freq)"
<<
endl
;
cerr
<<
"
\t\t\t
freqasc (i.e., print asc. list of values:freq)"
<<
endl
;
cerr
<<
"
\t\t\t
first (i.e., print first value)"
<<
endl
;
cerr
<<
"
\t\t\t
last (i.e., print last value)"
<<
endl
;
cerr
<<
"
\t\t\t
- Default: sum"
<<
endl
<<
endl
;
cerr
<<
"
\t
-full
\t\t
"
<<
"Print all columns from input file. The first line in the group is used."
<<
endl
;
cerr
<<
"
\t\t\t
Default: print only grouped columns."
<<
endl
<<
endl
;
cerr
<<
"
\t
-inheader
\t
"
<<
"Input file has a header line - the first line will be ignored."
<<
endl
<<
endl
;
cerr
<<
"
\t
-outheader
\t
"
<<
"Print header line in the output, detailing the column names. "
<<
endl
;
cerr
<<
"
\t\t\t
If the input file has headers (-inheader), the output file"
<<
endl
;
cerr
<<
"
\t\t\t
will use the input's column names."
<<
endl
;
cerr
<<
"
\t\t\t
If the input file has no headers, the output file"
<<
endl
;
cerr
<<
"
\t\t\t
will use
\"
col_1
\"
,
\"
col_2
\"
, etc. as the column names."
<<
endl
<<
endl
;
cerr
<<
"
\t
-header
\t\t
"
<<
"same as '-inheader -outheader'"
<<
endl
<<
endl
;
cerr
<<
"
\t
-ignorecase
\t
"
<<
"Group values regardless of upper/lower case."
<<
endl
<<
endl
;
cerr
<<
"
\t
-prec
\t
"
<<
"Sets the decimal precision for output (Default: 5)"
<<
endl
<<
endl
;
cerr
<<
"
\t
-delim
\t
"
<<
"Specify a custom delimiter for the collapse operations."
<<
endl
;
cerr
<<
"
\t\t
- Example: -delim
\"
|
\"
"
<<
endl
;
cerr
<<
"
\t\t
- Default:
\"
,
\"
."
<<
endl
<<
endl
;
cerr
<<
"Examples: "
<<
endl
;
cerr
<<
"
\t
$ cat ex1.out"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A chr1 15 25 B.1 1000 ATAT"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A chr1 25 35 B.2 10000 CGCG"
<<
endl
<<
endl
;
cerr
<<
"
\t
$ groupBy -i ex1.out -g 1,2,3,4 -c 9 -o sum"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A 11000"
<<
endl
<<
endl
;
cerr
<<
"
\t
$ groupBy -i ex1.out -grp 1,2,3,4 -opCols 9,9 -ops sum,max"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A 11000 10000"
<<
endl
<<
endl
;
cerr
<<
"
\t
$ groupBy -i ex1.out -g 1,2,3,4 -c 8,9 -o collapse,mean"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A B.1,B.2, 5500"
<<
endl
<<
endl
;
cerr
<<
"
\t
$ cat ex1.out | groupBy -g 1,2,3,4 -c 8,9 -o collapse,mean"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A B.1,B.2, 5500"
<<
endl
<<
endl
;
cerr
<<
"
\t
$ cat ex1.out | groupBy -g 1,2,3,4 -c 10 -o concat"
<<
endl
;
cerr
<<
"
\t
chr1 10 20 A ATATCGCG"
<<
endl
<<
endl
;
cerr
<<
"Notes: "
<<
endl
;
cerr
<<
"
\t
(1) The input file/stream should be sorted/grouped by the -grp. columns"
<<
endl
;
cerr
<<
"
\t
(2) If -i is unspecified, input is assumed to come from stdin."
<<
endl
<<
endl
;
// end the program here
exit
(
1
);
}
}
bool
GroupBy
::
init
()
void
GroupBy
(
const
string
&
inFile
,
{
const
vector
<
int
>
&
groupColumns
,
Tokenizer
groupColsTokens
;
const
vector
<
int
>
&
opColumns
,
groupColsTokens
.
tokenize
(
upCast
(
_context
)
->
getGroupCols
(),
','
);
const
vector
<
string
>
&
ops
,
int
numElems
=
groupColsTokens
.
getNumValidElems
();
const
bool
printOriginalLine
,
for
(
int
i
=
0
;
i
<
numElems
;
i
++
)
{
const
bool
printHeaderLine
,
//if the item is a range, such as 3-5,
const
bool
InputHaveHeaderLine
,
//must split that as well.
const
bool
ignoreCase
,
const
QuickString
&
elem
=
groupColsTokens
.
getElem
(
i
);
int
precision
,
const
string
&
delim
)
{
if
(
strchr
(
elem
.
c_str
(),
'-'
))
{
// current line number
Tokenizer
rangeElems
;
int
lineNum
=
0
;
rangeElems
.
tokenize
(
elem
,
'-'
);
// string representing current line
int
startNum
=
str2chrPos
(
rangeElems
.
getElem
(
0
));
string
inLine
;
int
endNum
=
str2chrPos
(
rangeElems
.
getElem
(
1
));
for
(
int
i
=
startNum
;
i
<=
endNum
;
i
++
)
{
// vector of strings holding the tokenized current line
_groupCols
.
push_back
(
i
);
vector
<
string
>
inFields
;
}
vector
<
string
>
inFieldsFirstLineInGroup
;
}
else
{
inFields
.
reserve
(
20
);
_groupCols
.
push_back
(
str2chrPos
(
elem
));
}
// keys for the current and previous group
}
vector
<
string
>
prevGroup
(
0
);
_queryFRM
=
_context
->
getFile
(
0
);
vector
<
string
>
currGroup
(
0
);
_prevFields
.
resize
(
_groupCols
.
size
());
// vector (one per column) of vector (one per value/column) of the opColumn values for the current group
_prevRecord
=
getNextRecord
();
vector
<
vector
<
string
>
>
values
;
return
true
;
for
(
size_t
i
=
0
;
i
<
opColumns
.
size
();
i
++
)
{
values
.
push_back
(
vector
<
string
>
()
);
}
bool
first_line
=
true
;
// check the status of the current line
TabLineStatus
tabLineStatus
;
// open a new tab file, loop through it line by line
// and summarize the data for a given group when the group
// fields change
TabFile
*
_tab
=
new
TabFile
(
inFile
);
_tab
->
Open
();
while
((
tabLineStatus
=
_tab
->
GetNextTabLine
(
inFields
,
lineNum
))
!=
TAB_INVALID
)
{
if
((
tabLineStatus
==
TAB_VALID
)
||
(
tabLineStatus
==
TAB_HEADER
))
{
if
(
first_line
)
{
first_line
=
false
;
if
(
printHeaderLine
)
PrintHeaderLine
(
inFields
,
groupColumns
,
opColumns
,
ops
,
printOriginalLine
,
InputHaveHeaderLine
);
if
(
InputHaveHeaderLine
)
{
inFields
.
clear
();
continue
;
// no need to process this line - it's the header
}
}
if
(
inFieldsFirstLineInGroup
.
empty
())
//first line in file? - save it