diff --git a/src/groupBy/Makefile b/src/groupBy/Makefile index fe86d953a0ef0c831e7eaf3b6a34e0d15b8583d0..51fe345e87ca6794fc70bab83ef4698927b5bc98 100644 --- a/src/groupBy/Makefile +++ b/src/groupBy/Makefile @@ -16,7 +16,7 @@ INCLUDES = -I$(UTILITIES_DIR)/tabFile/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= groupBy.cpp +SOURCES= groupBy.cpp $(UTILITIES_DIR)/lineFileUtilities/lineFileUtilities.h OBJECTS= groupBy.o BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) diff --git a/src/groupBy/groupBy.cpp b/src/groupBy/groupBy.cpp index f5c69c41081481d8c6496c7da5ca43002506469f..0cf1911928601e8611f19309fc08072330fd4bbe 100644 --- a/src/groupBy/groupBy.cpp +++ b/src/groupBy/groupBy.cpp @@ -172,10 +172,10 @@ int groupby_main(int argc, char* argv[]) { // Split the column string sent by the user into discrete column numbers // A comma separated string is expected. vector<int> groupColumnsInt; - Tokenize(groupColumnsString, groupColumnsInt, ','); + TokenizeColumns(groupColumnsString, groupColumnsInt); vector<int> opColumnsInt; - Tokenize(opsColumnString, opColumnsInt, ','); + TokenizeColumns(opsColumnString, opColumnsInt); // sanity check the group columns for(size_t i = 0; i < groupColumnsInt.size(); ++i) { diff --git a/src/utils/lineFileUtilities/lineFileUtilities.h b/src/utils/lineFileUtilities/lineFileUtilities.h index 6d73a2e2c85b0bed61028159fd661ba5e690ceca..3e1b833a8280f1669c4b1a8c5ad84ef147a1f701 100644 --- a/src/utils/lineFileUtilities/lineFileUtilities.h +++ b/src/utils/lineFileUtilities/lineFileUtilities.h @@ -46,5 +46,40 @@ void Tokenize(const string &str, vector<int> &elems, char delimiter = '\t') } } +// tokenize a column string into a list of integers. +inline +void TokenizeColumns(const string &str, vector<int> &elems) +{ + + // http://stackoverflow.com/questions/236129/how-to-split-a-string-in-c/236803#236803 + // NOTE: this approach intentionally allows consecutive delimiters + vector<string> col_sets; + Tokenize(str, col_sets, ','); + + for( size_t i = 0; i < col_sets.size(); i++ ) { + string col_set = col_sets[i]; + if( string::npos == col_set.find("-") ){ + elems.push_back(atoi(col_set.c_str())); + } + else { + vector<string> ends; + Tokenize(col_set, ends, '-'); + int start = atoi(ends[0].c_str()); + int end = atoi(ends[1].c_str()); + if(start <= end){ + for(int i = start; i <= end; i++){ + elems.push_back(i); + } + } + else { + for(int i = start; i >= end; i--){ + elems.push_back(i); + } + } + } + } +} + + #endif /* LINEFILEUTILITIES_H */ diff --git a/test/groupBy/test-groupby.sh b/test/groupBy/test-groupby.sh new file mode 100644 index 0000000000000000000000000000000000000000..4a9c4c7c7687d3cc39516387716d2b00ec081646 --- /dev/null +++ b/test/groupBy/test-groupby.sh @@ -0,0 +1,14 @@ +lines_a=$(../../bin/groupBy -g 3-1 -o collapse -c 4 -i ../map/values3.bed | wc -l) +lines_b=$(../../bin/groupBy -g 1-3 -o collapse -c 4 -i ../map/values3.bed | wc -l) +lines_c=$(../../bin/groupBy -g 1,2,3 -o collapse -c 4 -i ../map/values3.bed | wc -l) +lines_d=$(../../bin/groupBy -g 1-2,3 -o collapse -c 4 -i ../map/values3.bed | wc -l) + +check(){ + if [ "$1" != "$2" ]; then + "fail groupby" + fi +} + +check $lines_a $lines_b +check $lines_a $lines_c +check $lines_a $lines_d