Commit b6f8ff74 authored by Neil Kindlon's avatar Neil Kindlon
Browse files

Fixed bug 44: race condition when reading bgzipped VCF files.

parent 85818904
......@@ -68,7 +68,7 @@ bool BufferedStreamMgr::getTypeData()
_currScanBuffer = _inputStreamMgr->getSavedData();
_typeChecker.setFilename(_filename);
do {
if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size(), _inputStreamMgr->getEofHit()) && !_typeChecker.needsMoreData()) {
if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size(), _inputStreamMgr->getEofHit(), _inputStreamMgr->isCompressed()) && !_typeChecker.needsMoreData()) {
return false;
} else if (_typeChecker.needsMoreData()) {
if (!_inputStreamMgr->populateScanBuffer()) {
......
......@@ -33,9 +33,12 @@ public:
bool populateScanBuffer();
const QuickString &getSavedData() const { return _saveDataStr; }
bool isGzipped() const { return _isGzipped; }
bool isBGzipped() const { return _isBgzipped; }
bool isBam() const { return _isBam; }
bool isCompressed() const { return _isGzipped || _isBgzipped || _isBam; }
PushBackStreamBuf *getPushBackStreamBuf() const {return _pushBackStreamBuf; }
// void getSavedData(QuickString &str) const { str = _saveDataStr; }
bool isBam() const { return _isBam; }
BamTools::BamReader *getBamReader() { return _bamReader; }
bool resetStream();
bool getEofHit() { return _eofHit; }
......
......@@ -20,6 +20,7 @@ FileRecordTypeChecker::FileRecordTypeChecker()
_isGFF = false;
_isGFFplus = false;
_isGzipped = false;
_isCompressed = false;
_insufficientData = false;
_fourthFieldNumeric = false;
_givenEmptyBuffer = false;
......@@ -80,9 +81,10 @@ FileRecordTypeChecker::FileRecordTypeChecker()
}
bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit)
bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit, bool isCompressed)
{
_eofHit = eofHit;
_isCompressed = isCompressed;
_numBytesInBuffer = len;
if (_numBytesInBuffer == 0) {
_fileType = EMPTY_FILE_TYPE;
......@@ -281,7 +283,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
{
//Break single string buffer into vector of QuickStrings. Delimiter is newline.
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit);
int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit, _isCompressed);
//anticipated delimiter characters are tab, comma, and semi-colon.
//If we need new ones, they must be added in this method.
......
......@@ -35,7 +35,7 @@ public:
GFF_PLUS_RECORD_TYPE} RECORD_TYPE;
void setFilename(const QuickString & filename) { _filename = filename; }
bool scanBuffer(const char *buf, size_t len, bool eofHit);
bool scanBuffer(const char *buf, size_t len, bool eofHit, bool isCompressed = false);
bool needsMoreData() const { return _insufficientData; }
bool recordTypeHasName(RECORD_TYPE type) const { return _hasName.find(type) != _hasName.end(); }
......@@ -105,6 +105,7 @@ private:
bool _isGFF;
bool _isGFFplus;
bool _isGzipped;
bool _isCompressed;
bool _insufficientData; //set to true if scan buffer had only header lines.
bool _fourthFieldNumeric; //this is just to distinguish between Bed4 and BedGraph files.
bool _givenEmptyBuffer;
......
......@@ -11,7 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ \
# define our source and object files
# ----------------------------------
SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp \
Tokenizer.h Tokenizer.h CommonHelp.h CommonHelp.cpp ErrorMsg.h ErrorMsg.cpp
Tokenizer.h Tokenizer.cpp CommonHelp.h CommonHelp.cpp ErrorMsg.h ErrorMsg.cpp
OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o CommonHelp.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
......
......@@ -27,7 +27,7 @@ void Tokenizer::setNumExpectedItems(int newSize) {
resize(newSize);
}
int Tokenizer::tokenize(const QuickString &str, char delimiter, bool eofHit) {
int Tokenizer::tokenize(const QuickString &str, char delimiter, bool eofHit, bool isCompressed) {
int strLen = (int)str.size();
......@@ -42,7 +42,7 @@ int Tokenizer::tokenize(const QuickString &str, char delimiter, bool eofHit) {
}
if (currPos > startPos) {
if ((currPos == strLen && _keepFinalIncElem != USE_NOW) &&
(!(delimiter == '\n' && eofHit))) {
( (!(delimiter == '\n' && eofHit)) || isCompressed)) {
//we found an incomplete final element.
// if we're ignoring incomplete elems, do nothing with it.
currIdx--; //make sure it's not included in the final count of valid elems.
......
......@@ -22,7 +22,7 @@ public:
// If not, don't worry about it.
void setNumExpectedItems(int val);
int tokenize(const QuickString &str, char delimiter = '\t', bool eofHit = false);
int tokenize(const QuickString &str, char delimiter = '\t', bool eofHit = false, bool isCompressed = true);
// If the final element ends before a delim char, that means
// the buffer passed in ends mid-element. The last, incomplete
......
9 141151174 141151213 Low_complexity
9 141151355 141151475 LINE
9 141151514 141151815 SINE
9 141151817 141151937 SINE
9 141152314 141152378 Simple_repeat
9 141152592 141152787 SINE
9 141152832 141153431 LTR
MT 2591 2747 rRNA
MT 3230 3308 tRNA
MT 4329 4401 tRNA
MT 7448 7515 tRNA
X 67103806 67109934 LINE
X 134217438 134217916 LTR
X 8388422 8388715 SINE
X 16777024 16777338 SINE
X 25165534 25165834 LINE
X 41942860 41943542 LINE
X 50331587 50331685 SINE
X 75496377 75497894 LINE
......@@ -624,6 +624,18 @@ $BT intersect -a bug223_f.vcf -b bug223_f.vcf | cut -f1-6 > obs
check exp obs
rm exp obs
##################################################################
# Bug 44: test that a bgzipped vcf file is read correctly
# (regression test for the bgzip race condition)
##################################################################
echo " intersect.t51...\c"
echo \
"MT 2706 . A G 2965 PASS BRF=0.05;FR=1;HP=1;HapScore=1;MGOF=17;MMLQ=30;MQ=62.05;NF=7607;NR=8147;PP=2965;QD=20;SC=AGGCGGGCATAACACAGCAAG;SbPval=0.52;Source=Platypus;TC=15840;TCF=7679;TCR=8161;TR=15754;WE=2749;WS=2693;CSQ=G|ENSG00000198763|ENST00000361453|Transcript|upstream_gene_variant||||||rs2854128|1764|1|MT-ND2|HGNC|7456|protein_coding|YES||ENSP00000355046|NU2M_HUMAN|Q7GXY9_HUMAN&Q5Q3P5_HUMAN&Q14X33_HUMAN&Q14WT3_HUMAN&A6ZH82_HUMAN&A6ZGN8_HUMAN&A6ZGG3_HUMAN|UPI0000000AA2||||||A:0.1656|||||||||||||,G|ENSG00000210151|ENST00000387416|Transcript|downstream_gene_variant||||||rs2854128|4740|-1|MT-TS1|HGNC|7497|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210077|ENST00000387342|Transcript|downstream_gene_variant||||||rs2854128|1036|1|MT-TV|HGNC|7500|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210144|ENST00000387409|Transcript|downstream_gene_variant||||||rs2854128|3120|-1|MT-TY|HGNC|7502|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210117|ENST00000387382|Transcript|upstream_gene_variant||||||rs2854128|2806|1|MT-TW|HGNC|7501|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210107|ENST00000387372|Transcript|downstream_gene_variant||||||rs2854128|1623|-1|MT-TQ|HGNC|7495|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210140|ENST00000387405|Transcript|downstream_gene_variant||||||rs2854128|3055|-1|MT-TC|HGNC|7477|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000211459|ENST00000389680|Transcript|downstream_gene_variant||||||rs2854128|1105|1|MT-RNR1|HGNC|7470|Mt_rRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210082|ENST00000387347|Transcript|non_coding_transcript_exon_variant&non_coding_transcript_variant|1036|||||rs2854128||1|MT-RNR2|HGNC|7471|Mt_rRNA|YES||||||||1/1|||A:0.1656|||||||||||||,G|ENSG00000210127|ENST00000387392|Transcript|downstream_gene_variant||||||rs2854128|2881|-1|MT-TA|HGNC|7475|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000198712|ENST00000361739|Transcript|upstream_gene_variant||||||rs2854128|488
0|1|MT-CO2|HGNC|7421|protein_coding|YES||ENSP00000354876|COX2_HUMAN|Q7GXZ8_HUMAN&Q4R1L5_HUMAN&Q4R1L3_HUMAN&Q14XT3_HUMAN&K7WVJ5_HUMAN&H9E7W2_HUMAN&H9E7T7_HUMAN&H9E7P8_HUMAN&H9E7F7_HUMAN&E2DTL8_HUMAN&D3WYY9_HUMAN&D2Y6Y2_HUMAN&D2Y6Y1_HUMAN&B2YKU2_HUMAN|UPI0000000AA4||||||A:0.1656|||||||||||||,G|ENSG00000210049|ENST00000387314|Transcript|downstream_gene_variant||||||rs2854128|2059|1|MT-TF|HGNC|7481|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000198888|ENST00000361390|Transcript|upstream_gene_variant||||||rs2854128|601|1|MT-ND1|HGNC|7455|protein_coding|YES||ENSP00000354687|NU1M_HUMAN|Q85KV6_HUMAN&Q8WCX9_HUMAN&Q5Q757_HUMAN&Q14WI3_HUMAN&G3EBI1_HUMAN&D2Y6X8_HUMAN&D2Y6X6_HUMAN&A6ZHG8_HUMAN|UPI0000000AA1||||||A:0.1656|||||||||||||,G|ENSG00000209082|ENST00000386347|Transcript|upstream_gene_variant||||||rs2854128|524|1|MT-TL1|HGNC|7490|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000198804|ENST00000361624|Transcript|upstream_gene_variant||||||rs2854128|3198|1|MT-CO1|HGNC|7419|protein_coding|YES||ENSP00000354499|COX1_HUMAN|Q957U9_HUMAN&Q7GXY8_HUMAN&M9Z2G2_HUMAN&Q8HBX8_HUMAN&Q5Q1W2_HUMAN&Q4R1L4_HUMAN&Q14XD3_HUMAN&Q14X83_HUMAN&F8U4W0_HUMAN&D3WYY6_HUMAN&D3WYY5_HUMAN&D3WYY4_HUMAN&D2Y6W4_HUMAN&C8YAE4_HUMAN&C3UPN2_HUMAN&B7TCT8_HUMAN&B2Y9D8_HUMAN&A5YMT3_HUMAN&A1XP63_HUMAN&A0S1I7_HUMAN|UPI0000000AA3||||||A:0.1656|||||||||||||,G|ENSG00000210154|ENST00000387419|Transcript|upstream_gene_variant||||||rs2854128|4812|1|MT-TD|HGNC|7478|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210112|ENST00000387377|Transcript|upstream_gene_variant||||||rs2854128|1696|1|MT-TM|HGNC|7492|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210135|ENST00000387400|Transcript|downstream_gene_variant||||||rs2854128|2951|-1|MT-TN|HGNC|7493|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||,G|ENSG00000210100|ENST00000387365|Transcript|upstream_gene_variant||||||rs2854128|1557|1|MT-TI|HGNC|7488|Mt_tRNA|YES|||||||||||A:0.1656|||||||||||||;GR=3.07;PH=0.654;PS=0.002 GT:GL:GOF:GQ:NR:NV 
1/1:-300,-298.01,0:3:99:2733:2718 1/1:-300,-298.01,0:17:99:6509:6461 1/1:-300,-298.01,0:2:99:6598:6575 MT 2591 2747 rRNA" > exp
$BT intersect -a bug44_a.vcf.gz -b bug44_b.bed -wa -wb > obs
check exp obs
rm exp obs
cd multi_intersect
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment