Skip to content
Snippets Groups Projects
Commit d8c62016 authored by nkindlon's avatar nkindlon
Browse files

Fixed bug for files with no newlines.

parent bc90a1a0
No related branches found
No related tags found
No related merge requests found
......@@ -70,11 +70,9 @@ void merge_help(void) {
cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl;
KeyListOpsHelp();
cerr << "Notes: " << endl;
cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl;
cerr << "\t will in BED format with zero-based starts" << endl << endl;
cerr << "\t(2) The input file (-i) file must be sorted by chrom, then start." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The input file (-i) file must be sorted by chrom, then start." << endl << endl;
// end the program here
exit(1);
......
......@@ -70,7 +70,7 @@ bool BufferedStreamMgr::getTypeData()
_currScanBuffer = _inputStreamMgr->getSavedData();
_typeChecker.setFilename(_filename);
do {
if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size()) && !_typeChecker.needsMoreData()) {
if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size(), _inputStreamMgr->getEofHit()) && !_typeChecker.needsMoreData()) {
return false;
} else if (_typeChecker.needsMoreData()) {
if (!_inputStreamMgr->populateScanBuffer()) {
......
......@@ -28,7 +28,8 @@ InputStreamMgr::InputStreamMgr(const QuickString &filename, bool buildScanBuffer
_streamFinished(false),
_numBytesInBuffer(0),
_bamReader(NULL),
_bgStream(NULL)
_bgStream(NULL),
_eofHit(false)
{
_possibleBamCode.resize(4, 0);
}
......@@ -138,6 +139,7 @@ bool InputStreamMgr::populateScanBuffer()
currChar = _pushBackStreamBuf->sbumpc();
//Stop when EOF hit.
if (currChar == EOF) {
_eofHit = true;
break;
}
numChars++;
......@@ -185,6 +187,7 @@ bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar)
currChar = _pushBackStreamBuf->sbumpc();
//Stop when EOF hit.
if (currChar == EOF) {
_eofHit = true;
break;
}
_scanBuffer.push_back(currChar);
......
......@@ -38,7 +38,7 @@ public:
bool isBam() const { return _isBam; }
BamTools::BamReader *getBamReader() { return _bamReader; }
bool resetStream();
// Returns the _eofHit flag. The flag starts out false and is set to true
// when populateScanBuffer() or detectBamOrBgzip() reads EOF from the
// underlying stream buffer.
bool getEofHit() { return _eofHit; }
private:
QuickString _filename;
......@@ -63,6 +63,7 @@ private:
int _numBytesInBuffer; //this will hold the length of the buffer after the scan.
BamTools::BamReader *_bamReader;
BamTools::Internal::BgzfStream *_bgStream;
bool _eofHit;
static const char *FIFO_STRING_LITERAL;
bool readZipChunk();
......
......@@ -4,6 +4,7 @@
#include "ParseTools.h"
FileRecordTypeChecker::FileRecordTypeChecker()
: _eofHit(false)
{
_fileType = UNKNOWN_FILE_TYPE;
_recordType = UNKNOWN_RECORD_TYPE;
......@@ -71,11 +72,9 @@ FileRecordTypeChecker::FileRecordTypeChecker()
}
bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len)
bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len, bool eofHit)
{
if (len == 0) {
len = strlen(buffer);
}
_eofHit = eofHit;
_numBytesInBuffer = len;
if (_numBytesInBuffer == 0) {
_fileType = EMPTY_FILE_TYPE;
......@@ -261,7 +260,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len)
{
//Break single string buffer into vector of QuickStrings. Delimiter is newline.
_tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE);
int numLines = _tokenizer.tokenize(buffer, '\n');
int numLines = _tokenizer.tokenize(buffer, '\n', _eofHit);
//anticipated delimiter characters are tab, comma, and semi-colon.
//If we need new ones, they must be added in this method.
......
......@@ -34,7 +34,7 @@ public:
BED6_RECORD_TYPE, BED12_RECORD_TYPE, BED_PLUS_RECORD_TYPE, BAM_RECORD_TYPE, VCF_RECORD_TYPE, GFF_RECORD_TYPE} RECORD_TYPE;
void setFilename(const QuickString & filename) { _filename = filename; }
bool scanBuffer(const char *buf, size_t len=0);
bool scanBuffer(const char *buf, size_t len, bool eofHit);
bool needsMoreData() const { return _insufficientData; }
// True when `type` has an entry in _hasName. Only the presence of the key is
// tested; the value stored under it is not consulted.
bool recordTypeHasName(RECORD_TYPE type) const { return _hasName.find(type) != _hasName.end(); }
......@@ -114,6 +114,8 @@ private:
map<RECORD_TYPE, bool> _hasScore;
map<RECORD_TYPE, bool> _hasStrand;
bool _eofHit;
//this will be used in determining whether we are looking at a binary or text file.
static const float PERCENTAGE_PRINTABLE = .9;
bool isBinaryBuffer(const char *buffer, size_t len);
......
......@@ -27,7 +27,7 @@ void Tokenizer::setNumExpectedItems(int newSize) {
resize(newSize);
}
int Tokenizer::tokenize(const QuickString &str, char delimiter) {
int Tokenizer::tokenize(const QuickString &str, char delimiter, bool eofHit) {
int strLen = (int)str.size();
......@@ -41,7 +41,8 @@ int Tokenizer::tokenize(const QuickString &str, char delimiter) {
currPos++;
}
if (currPos > startPos) {
if (currPos == strLen && _keepFinalIncElem != USE_NOW) {
if ((currPos == strLen && _keepFinalIncElem != USE_NOW) &&
(!(delimiter == '\n' && eofHit))) {
//we found an incomplete final element.
// if we're ignoring incomplete elems, do nothing with it.
currIdx--; //make sure it's not included in the final count of valid elems.
......
......@@ -22,7 +22,7 @@ public:
// If not, don't worry about it.
void setNumExpectedItems(int val);
int tokenize(const QuickString &str, char delimiter = '\t');
int tokenize(const QuickString &str, char delimiter = '\t', bool eofHit = false);
// If the final element ends before a delim char, that means
// the buffer passed in ends mid-element. The last, incomplete
......
......@@ -755,3 +755,12 @@ chr1 14750216 15119039" >exp
check exp obs
rm exp obs
###########################################################
# Test that files with no newlines at all are handled
############################################################
# Print the test label; "\c" is meant to suppress the trailing newline.
echo " intersect.new.t64...\c"
# Expected output: the single record, unchanged.
echo "chr17 7577068 7577157" > exp
# Intersect the fixture with itself. NOTE(review): oneRecordNoNewline.bed
# presumably holds exactly one record with no terminating newline; confirm
# against the fixture file.
$BT intersect -a oneRecordNoNewline.bed -b oneRecordNoNewline.bed > obs
# Compare observed against expected, then remove the temporary files.
check obs exp
rm obs exp
chr17 7577068 7577157
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment