diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 0f2110a8e9fb791e7f607e603a178be3de8e0e54..61a92216636aeec6fc286164f78bfe826fd3b5d4 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -17,6 +17,7 @@ FileRecordMgr::FileRecordMgr(int fileIdx, Context *context) _useFullBamTags(false), _headerSet(false), _prevStart(INT_MAX), + _prevChromId(-1), _mustBeForward(false), _mustBeReverse(false), _totalRecordLength(0), @@ -144,28 +145,44 @@ void FileRecordMgr::testInputSortOrder(const Record *record) if (currChrom != _prevChrom) { if ( _foundChroms.find(currChrom) != _foundChroms.end()) { //this is a different chrom than the last record had, but we've already seen this chrom. - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); - QuickString errBuf; - record->print(errBuf); - fprintf(stderr, "%s\n", errBuf.c_str()); - exit(1); + sortError(record, false); } else { //new chrom has not been seen before. + //TBD: test genome file for ChromId. + if (_context->hasGenomeFile()) { + //For BAM records, the chromId of the BAM file will not necessarily be the same as the one from the genome file. + int currChromId = _context->getGenomeFile()->getChromId(currChrom); + if (currChromId < _prevChromId) { + sortError(record, true); + } else { + _prevChromId = currChromId; + } + } _foundChroms.insert(currChrom); _prevChrom = currChrom; _prevStart = INT_MAX; } } else if (record->getStartPos() < _prevStart) { //same chrom as last record, but with lower startPos, so still out of order. - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); - QuickString errBuf; - record->print(errBuf); - fprintf(stderr, "%s\n", errBuf.c_str()); - exit(1); + sortError(record, false); } _prevStart = record->getStartPos(); } +void FileRecordMgr::sortError(const Record *record, bool genomeFileError) +{ + if (genomeFileError) { + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following record with a different sort order than the genomeFile %s:\n", + _context->getInputFileName(_contextFileIdx).c_str(), _context->getGenomeFile()->getGenomeFileName().c_str()); + } else { + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); + } + QuickString errBuf; + record->print(errBuf); + fprintf(stderr, "%s\n", errBuf.c_str()); + exit(1); + +} void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index 46c58a95e7bf7d37d20209e54b8eb36ff70c1367..d19b31bb20312654d85a59b2fd15e852fbc985e2 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -157,6 +157,7 @@ private: set<QuickString> _foundChroms; QuickString _prevChrom; int _prevStart; + int _prevChromId; //members for handling merged records DualQueue<Record *, DualQueueAscending > _storedRecords; @@ -173,6 +174,8 @@ private: void allocateFileReader(); void testInputSortOrder(const Record *record); + void sortError(const Record *record, bool genomeFileError); + void deleteAllMergedItemsButKey(RecordKeyList &recList); void addToStorage(Record *record); Record *tryToTakeFromStorage(); diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp index c6ceb8d08f85851b3487bfdf25f6c6ba9b7d5a42..d100759707837bd0b1426437a42477b2778098ff 100644 --- a/src/utils/GenomeFile/NewGenomeFile.cpp +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -10,7 +10,7 @@ Licensed under the GNU General Public License 2.0 license. ******************************************************************************/ #include "NewGenomeFile.h" - +#include "ParseTools.h" NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) : _maxId(-1) @@ -37,33 +37,41 @@ NewGenomeFile::~NewGenomeFile(void) { void NewGenomeFile::loadGenomeFileIntoMap() { - FILE *fp = fopen(_genomeFileName.c_str(), "r"); - if (fp == NULL) { - fprintf(stderr, "Error: Can't open genome file %s. Exiting...\n", _genomeFileName.c_str()); + + ifstream genFile(_genomeFileName.c_str()); + if (!genFile.good()) { + cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } - char sLine[2048]; - char chrName[2048]; + string sLine; + vector<QuickString> fields; CHRPOS chrSize = 0; - while (!feof(fp)) { - memset(sLine, 0, 2048); - memset(chrName, 0, 2048); + QuickString chrName; + while (!genFile.eof()) { + sLine.clear(); + fields.clear(); chrSize = 0; - fgets(sLine, 2048, fp); - sscanf(sLine, "%s %d", chrName, &chrSize); - if (strlen(sLine) == 0) { + chrName.clear(); + getline(genFile, sLine); + Tokenize(sLine.c_str(), fields); + if (fields.size() != 2) { continue; } + chrName = fields[0]; + chrSize = str2chrPos(fields[1]); _maxId++; - _chromSizeIds[chrName] = pair<CHRPOS, CHRPOS>(chrSize, _maxId); + _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); - + } + if (_maxId == -1) { + cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; + exit(1); } _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. - fclose(fp); + genFile.close(); } bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start) { @@ -94,6 +102,7 @@ CHRPOS NewGenomeFile::getChromSize(const QuickString &chrom) { _currChromId = iter->second.second; return _currChromSize; } + cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl; return INT_MAX; } @@ -107,6 +116,9 @@ CHRPOS NewGenomeFile::getChromId(const QuickString &chrom) { _currChromSize = iter->second.first; _currChromId = iter->second.second; return _currChromId; + } else { + cerr << "Error: requested chromosome " << chrom << " does not exist in the genome file " << _genomeFileName << ". Exiting." << endl; + exit(1); } return INT_MAX; } diff --git a/src/utils/GenomeFile/NewGenomeFile.h b/src/utils/GenomeFile/NewGenomeFile.h index afd23b504afed7457e2545c98840fd47f10f06aa..1235428cccd6c6f88e4cd70dc39c99e8ce384aad 100644 --- a/src/utils/GenomeFile/NewGenomeFile.h +++ b/src/utils/GenomeFile/NewGenomeFile.h @@ -45,10 +45,10 @@ public: private: QuickString _genomeFileName; - typedef map<QuickString, pair<CHRPOS, CHRPOS> > lookupType; + typedef map<QuickString, pair<CHRPOS, int> > lookupType; lookupType _chromSizeIds; vector<QuickString> _chromList; - CHRPOS _maxId; + int _maxId; // projecting chroms onto a single coordinate system CHRPOS _genomeLength; @@ -57,7 +57,7 @@ private: //cache members for quick lookup QuickString _currChromName; CHRPOS _currChromSize; - CHRPOS _currChromId; + int _currChromId; };