Commit 53ecd3c1 authored by Neil Kindlon
Browse files

New unit tests (with bug fixes) for groupby

parent d28d1fc8
......@@ -44,7 +44,7 @@ bool GroupBy::init()
}
}
_queryFRM = _context->getFile(0);
_prevFields.resize(numElems);
_prevFields.resize(_groupCols.size());
_prevRecord = getNextRecord();
return true;
......
......@@ -76,6 +76,12 @@ ContextBase::ContextBase()
_programNames["jaccard"] = JACCARD;
_programNames["spacing"] = SPACING;
_programNames["fisher"] = FISHER;
_programNames["sample"] = SAMPLE;
_programNames["coverage"] = COVERAGE;
_programNames["complement"] = COMPLEMENT;
_programNames["groupby"] = GROUP_BY;
if (hasColumnOpsMethods()) {
_keyListOps = new KeyListOps();
......@@ -243,6 +249,12 @@ bool ContextBase::isValidState()
if (!determineOutputType()) {
return false;
}
if (_program != GROUP_BY && _files[0]->getRecordType() == FileRecordTypeChecker::NO_POS_PLUS_RECORD_TYPE) {
_errorMsg = "ERROR: file ";
_errorMsg.append(_files[0]->getFileName());
_errorMsg.append(" has non positional records, which are only valid for the groupBy tool.");
return false;
}
if (getObeySplits()) {
_splitBlockInfo = new BlockMgr(_overlapFraction, _reciprocal);
}
......
......@@ -98,16 +98,18 @@ bool ContextGroupBy::handle_inheader()
}
// Option handler for groupBy's out-header flag.
// NOTE(review): this listing appears to interleave the pre-change and
// post-change lines of a diff. As written, the first statement returns the
// result of ContextBase::handle_header(), so every statement after it
// (setPrintHeader, markUsed, return true) is unreachable. Confirm which
// variant is intended before relying on either behavior.
bool ContextGroupBy::handle_outheader() {
return ContextBase::handle_header();
setPrintHeader(true);
markUsed(_i - _skipFirstArgs);
return true;
}
// Option handler for groupBy's header flag. Sets _inheader before doing
// anything else.
// NOTE(review): this listing appears to interleave the pre-change and
// post-change lines of a diff. As written, after setting _inheader the
// function returns the result of ContextBase::handle_header(), so the
// following statements (setPrintHeader, markUsed, return true) are
// unreachable. Confirm which variant is intended.
bool ContextGroupBy::handle_header() {
_inheader = true;
return ContextBase::handle_header();
setPrintHeader(true);
markUsed(_i - _skipFirstArgs);
return true;
}
bool ContextGroupBy::handle_full() {
_printFullCols = true;
markUsed(_i - _skipFirstArgs);
......@@ -130,5 +132,7 @@ const QuickString &ContextGroupBy::getDefaultHeader() {
_defaultHeader.append(i);
_defaultHeader.append('\t');
}
//change last tab into newline
_defaultHeader[_defaultHeader.size()-1] = '\n';
return _defaultHeader;
}
......@@ -20,7 +20,7 @@ public:
bool init();
const FileRecordTypeChecker & getTypeChecker() const { return _typeChecker; }
FileRecordTypeChecker & getTypeChecker() { return _typeChecker; }
bool eof() const { return _eof; }
bool getLine(QuickString &line);
......
......@@ -27,7 +27,7 @@ FileRecordMgr::FileRecordMgr(const QuickString &filename)
_genomeFile(NULL),
_ioBufSize(0),
_noEnforceCoordSort(false)
{
{
}
FileRecordMgr::~FileRecordMgr(){
......@@ -45,6 +45,8 @@ FileRecordMgr::~FileRecordMgr(){
bool FileRecordMgr::open(bool inheader){
_bufStreamMgr = new BufferedStreamMgr(_filename);
_bufStreamMgr->getTypeChecker().setInHeader(inheader);
if (_ioBufSize > 0) _bufStreamMgr->setIoBufSize(_ioBufSize);
if (!_bufStreamMgr->init()) {
cerr << "Error: unable to open file or unable to determine types for file " << _filename << endl;
......@@ -110,8 +112,8 @@ Record *FileRecordMgr::getNextRecord(RecordKeyVector *keyList)
// but still return it so the -v (noHit) option and the like will still
// see it.
if (!record->isUnmapped()) {
if (!record->coordsValid()) {
if (!record->isUnmapped() ) {
if (!record->coordsValid() && (record->getType() != FileRecordTypeChecker::NO_POS_PLUS_RECORD_TYPE)) {
cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *record << endl;
exit(1);
}
......
......@@ -296,14 +296,6 @@ void RecordOutputMgr::checkForHeader() {
// Do we need to print a header?
if (!_context->getPrintHeader()) return;
//if the program is based on intersection, we want the header from the query file.
if (_context->hasIntersectMethods()) {
int queryIdx = (static_cast<ContextIntersect *>(_context))->getQueryFileIdx();
const QuickString &header = _context->getFile(queryIdx)->getHeader();
_outBuf.append(header);
} else {
_outBuf.append(_context->getFile(0)->getHeader());
}
//If the tool is groupBy, and outheader was set, but the header is empty, we need to print groupBy's
//default header
if (_context->getProgram() == ContextBase::GROUP_BY) {
......@@ -314,7 +306,16 @@ void RecordOutputMgr::checkForHeader() {
} else {
_outBuf.append(header);
}
} else if (_context->hasIntersectMethods()) {
//if the tool is based on intersection, we want the header from the query file.
int queryIdx = (static_cast<ContextIntersect *>(_context))->getQueryFileIdx();
const QuickString &header = _context->getFile(queryIdx)->getHeader();
_outBuf.append(header);
} else {
_outBuf.append(_context->getFile(0)->getHeader());
}
_context->setPrintHeader(false);
flush();
}
......
......@@ -4,7 +4,8 @@
#include "ParseTools.h"
FileRecordTypeChecker::FileRecordTypeChecker()
: _eofHit(false)
: _eofHit(false),
_inheader(false)
{
_fileType = UNKNOWN_FILE_TYPE;
_recordType = UNKNOWN_RECORD_TYPE;
......@@ -226,8 +227,9 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len)
_recordType = GFF_RECORD_TYPE;
return true;
}
//Here the Record must not have positions, so it is the NoPosPlus Type
return false;
//Here the Record must not have positions, so it is the NoPosPlus Type.
_recordType = NO_POS_PLUS_RECORD_TYPE;
return true;
}
return false;
}
......@@ -325,7 +327,15 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
continue;
}
//
//skip over any header line
//
if (_inheader) {
headerCount++;
_inheader = false; //inheaders can only apply to first line
continue;
}
if (isHeaderLine(line)) {
//clear any previously found supposedly valid data lines, because valid lines can only come after header lines.
if (_firstValidDataLineIdx > -1 && _firstValidDataLineIdx < i) {
......@@ -336,6 +346,9 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
headerCount++;
continue;
}
//a line must have some alphanumeric characters in order to be valid.
bool hasAlphaNum = false;
for (int j=0; j < len; j++) {
......
......@@ -79,7 +79,7 @@ public:
bool isBed12() const { return (_isBed && _numFields == 12); }
bool isGFF() const { return _isGFF; }
void setInHeader(bool val) { _inheader = val; }
......@@ -118,6 +118,7 @@ private:
map<RECORD_TYPE, bool> _hasStrand;
bool _eofHit;
bool _inheader;
bool isBinaryBuffer(const char *buffer, size_t len);
bool isBAM(const char *buffer);
......
##fileformat=VCFv4.1
19 252806 791255 G <DEL> 70.90 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-389,-4611;END=253195;STR=+-:4;IMPRECISE;CIPOS=-2,137;CIEND=0,0;EVENT=791255;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 260365 791256 C <DEL> 33.71 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-680;END=261045;STR=+-:4;IMPRECISE;CIPOS=-1,257;CIEND=0,0;EVENT=791256;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=upstream_gene_variant|||ENSG00000271846|CTD-3113P16.9|ENST00000607399|||||processed_pseudogene
19 265134 791257 A <DEL> 20.25 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-558;END=265692;STR=+-:4;IMPRECISE;CIPOS=-1,196;CIEND=0,0;EVENT=791257;SUP=4;PESUP=4;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
19 265986 791258 A <DEL> 22.15 . TOOL=LUMPY;SVTYPE=DEL;SVLEN=-401;END=266387;STR=+-:6;IMPRECISE;CIPOS=-2,87;CIEND=0,0;EVENT=791258;SUP=6;PESUP=6;SRSUP=0;EV=PE;PRIN;CSQ=intergenic_variant||||||||||
#L chrom start end A B C D
l chr1 0 10 a1 10 + a
k chr1 10 20 a2 5 + b
j chr1 11 21 a3 5 + c
i chr1 20 30 a4 15 + d
h chr1 20 30 a5 15 + e
g chr1 20 30 a6 15 + f
f chr1 120 130 a7 1 + g
e chr3 0 10 a8 1 + h
d chr3 10 20 a9 2 + i
c chr3 20 30 a10 3 + j
b chr3 120 130 a11 4 + k
a chr3 120 130 a12 4 + l
BT=${BT-../../bin/bedtools}
lines_a=$($BT groupby -g 3-1 -o collapse -c 4 -i ../map/values3.bed | wc -l)
lines_b=$($BT groupby -g 1-3 -o collapse -c 4 -i ../map/values3.bed | wc -l)
lines_c=$($BT groupby -g 1,2,3 -o collapse -c 4 -i ../map/values3.bed | wc -l)
lines_d=$($BT groupby -g 1-2,3 -o collapse -c 4 -i ../map/values3.bed | wc -l)
# check VALUE_A VALUE_B
# Compares the two arguments as strings and prints "fail groupby" followed by
# both values when they differ; prints nothing when they are equal.
# NOTE(review): another function named check is defined further down in this
# listing; in a shell script the later definition replaces this one - confirm
# which definition the calls are meant to use.
check(){
if [ "$1" != "$2" ]; then
echo "fail groupby" $1 $2
fi
}
# Runs diff on the two files given as $1 and $2.
# Prints "ok" and returns 1 when diff reports no differences; otherwise prints
# "fail" and returns 0. Note that this return status is the reverse of the
# usual shell convention (0 = success), so callers should not test it with
# a plain `if`.
# NOTE(review): two function-name lines (checkfile and check) are stacked
# here, which looks like the old and new names from a diff - confirm which
# name is intended.
checkfile()
check()
{
if diff $1 $2; then
echo ok
return 1
else
echo fail
return 0
fi
}
check $lines_a $lines_b
check $lines_a $lines_c
check $lines_a $lines_d
###########################################################
# Test that -n option is shown as deprecated
###########################################################
#echo " merge.t2...\c"
#echo "***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp
#$BT merge -i a.bed -n 2>&1 > /dev/null | head -2 | tail -1 > obs
#check obs exp
#rm obs exp
###########################################################
# Test basic grouping
###########################################################
echo " groupby.t1...\c"
echo \
"chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.header.bed -c 5 > obs
check obs exp
rm obs exp
###########################################################
# Test case insensitive grouping works
###########################################################
echo " groupby.t2...\c"
echo \
"chr1 0 10 10
cHr1 10 20 5
Chr1 11 21 5
chR1 20 30 45
Chr1 120 130 1
CHr3 0 10 1
cHR3 10 20 2
CHR3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3_case.header.bed -c 5 -ignorecase > obs
check obs exp
rm obs exp
###########################################################
# Test -full option (print all columns, not just grouped
# ones)
###########################################################
echo " groupby.t3...\c"
echo \
"chr1 0 10 a1 10 + 10
chr1 10 20 a2 5 + 5
chr1 11 21 a3 5 + 5
chr1 20 30 a4 15 + 45
chr1 120 130 a7 1 + 1
chr3 0 10 a8 1 + 1
chr3 10 20 a9 2 + 2
chr3 20 30 a10 3 + 3
chr3 120 130 a11 4 + 8" > exp
$BT groupby -i values3.header.bed -c 5 -full > obs
check obs exp
rm obs exp
###########################################################
# Test -inheader option
###########################################################
echo " groupby.t4...\c"
echo \
"chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.header.bed -c 5 -inheader > obs
check obs exp
rm obs exp
###########################################################
# Test -inheader option when header not marked by
# recognized char
###########################################################
echo " groupby.t5...\c"
echo \
"chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.unmarked_header.bed -c 5 -inheader > obs
check obs exp
rm obs exp
###########################################################
# Test -inheader option when no header present will skip
# first line
###########################################################
echo " groupby.t6...\c"
echo \
"chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.no_header.bed -c 5 -inheader > obs
check obs exp
rm obs exp
###########################################################
# Test -outheader option will work automatically, even
# without -inheader, if header has normally marked start char.
###########################################################
echo " groupby.t7...\c"
echo \
"#chrom start end A B C
chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.header.bed -c 5 -outheader > obs
check obs exp
rm obs exp
###########################################################
# Test that unmarked header will be included by default.
###########################################################
echo " groupby.t8...\c"
echo \
"chrom start end B
chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 15
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 4" > exp
$BT groupby -i values3.unmarked_header.bed -c 5 -o distinct > obs
check obs exp
rm obs exp
###########################################################
# Test that -outheader prints the default header with unmarked header
###########################################################
echo " groupby.t9...\c"
echo \
"col_1 col_2 col_3 col_4 col_5 col_6
chrom start end B
chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 15
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 4" > exp
$BT groupby -i values3.unmarked_header.bed -c 5 -o distinct -outheader > obs
check obs exp
rm obs exp
###########################################################
# Test that -header works with unmarked header
###########################################################
echo " groupby.t10...\c"
echo \
"chrom start end A B C
chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 15
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 4" > exp
$BT groupby -i values3.unmarked_header.bed -c 5 -o distinct -header > obs
check obs exp
rm obs exp
###########################################################
# Test that -header works normally with normal header
###########################################################
echo " groupby.t11...\c"
echo \
"#chrom start end A B C
chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.header.bed -c 5 -header > obs
check obs exp
rm obs exp
###########################################################
# Test a BedPlus file (7 fields)
###########################################################
echo " groupby.t12...\c"
echo \
"chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -i values3.7fields.header.bed -c 5 > obs
check obs exp
rm obs exp
H=$(head -n 1 values3.header.bed)
A=$($BT groupby -i values3.header.bed -g 1,2,3 -c 4 -o concat -inheader | head -n 1)
if [ "$A" != $'chr1\t0\t10\ta1' ]; then
echo "fail groupby"
fi
###########################################################
# Test noPosPlus file (8 fields, not starting with
# chr, start, end)
###########################################################
echo " groupby.t13...\c"
echo \
"chr1 0 10 10
chr1 10 20 5
chr1 11 21 5
chr1 20 30 45
chr1 120 130 1
chr3 0 10 1
chr3 10 20 2
chr3 20 30 3
chr3 120 130 8" > exp
$BT groupby -g 2-4 -i noPosvalues.header.bed -c 6 > obs
check obs exp
rm obs exp
B=$($BT groupby -i values3.header.bed -g 1,2,3 -c 4 -o concat -header | head -n 1)
###########################################################
# Test noPosPlus file with a mix of columns (iterated and range)
###########################################################
echo " groupby.t14...\c"
echo \
"0 10 chr1 10
10 20 chr1 5
11 21 chr1 5
20 30 chr1 45
120 130 chr1 1
0 10 chr3 1
10 20 chr3 2
20 30 chr3 3
120 130 chr3 8" > exp
$BT groupby -g 3-4,2 -i noPosvalues.header.bed -c 6 > obs
check obs exp
rm obs exp
if [ "$B" != $'#chrom\tstart\tend\tconcat(A)' ]; then
echo "fail groupby"
fi
###########################################################
# Test that only the groupBy tool may use
# non-positional records
###########################################################
echo " groupby.t15...\c"
echo \
"ERROR: file noPosvalues.header.bed has non positional records, which are only valid for the groupBy tool." > exp
$BT merge -i noPosvalues.header.bed 2>&1 >/dev/null | cat - > obs
check obs exp
rm obs exp
###########################################################
# Test precision
############################################################
echo " groupby.t01...\c"
# Test a VCF file
###########################################################
echo " groupby.t16...\c"
echo \
"chr1 11168000 11168003 CALLABLE" > exp
$BT groupby -i test.bed -g 1,4 -c 1,2,3,4 -ops first,first,max,first | cut -f 3-6 > obs
checkfile obs exp
"19 G 70.9
19 C 33.71
19 A 21.2" > exp
$BT groupby -i a_vcfSVtest.vcf -g 1,4 -o mean -c 6 > obs
check obs exp
rm obs exp
###########################################################
# Test a BAM file
###########################################################
# Test precision
############################################################
echo " groupby.t02...\c"
echo " groupby.t17...\c"
echo \
"chr1 11168000 1.1168e+07 CALLABLE" > exp
$BT groupby -i test.bed -g 1,4 -c 1,2,3,4 -ops first,first,max,first -prec 5 | cut -f 3-6 > obs
checkfile obs exp
"None chr2L 118.75" > exp
$BT groupby -i gdc.bam -g 1,3 -c 4 -o mean > obs
check obs exp
rm obs exp
rm obs exp
\ No newline at end of file
#chrom start end A B C D
chr1 0 10 a1 10 + a
chr1 10 20 a2 5 + b
chr1 11 21 a3 5 + c
chr1 20 30 a4 15 + d
chr1 20 30 a5 15 + e
chr1 20 30 a6 15 + f
chr1 120 130 a7 1 + g
chr3 0 10 a8 1 + h
chr3 10 20 a9 2 + i
chr3 20 30 a10 3 + j
chr3 120 130 a11 4 + k
chr3 120 130 a12 4 + l
chr1 0 10 a1 10 +
chr1 10 20 a2 5 +
chr1 11 21 a3 5 +
chr1 20 30 a4 15 +
chr1 20 30 a5 15 +
chr1 20 30 a6 15 +
chr1 120 130 a7 1 +
chr3 0 10 a8 1 +
chr3 10 20 a9 2 +
chr3 20 30 a10 3 +
chr3 120 130 a11 4 +
chr3 120 130 a12 4 +
chrom start end A B C
chr1 0 10 a1 10 +
chr1 10 20 a2 5 +
chr1 11 21 a3 5 +
chr1 20 30 a4 15 +
chr1 20 30 a5 15 +
chr1 20 30 a6 15 +
chr1 120 130 a7 1 +
chr3 0 10 a8 1 +