From 052c063834cf93ec0debbfd016bc2eed87c8f5d6 Mon Sep 17 00:00:00 2001 From: nkindlon <nek3d@virginia.edu> Date: Wed, 11 Sep 2013 22:40:00 -0400 Subject: [PATCH] -g (genomeFile) option now gives error if file is differently sorted than genome file. --- src/utils/FileRecordTools/FileRecordMgr.cpp | 37 +++++++++++++----- src/utils/FileRecordTools/FileRecordMgr.h | 3 ++ src/utils/GenomeFile/NewGenomeFile.cpp | 42 +++++++++++++-------- src/utils/GenomeFile/NewGenomeFile.h | 6 +-- 4 files changed, 60 insertions(+), 28 deletions(-) diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 0f2110a8..61a92216 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -17,6 +17,7 @@ FileRecordMgr::FileRecordMgr(int fileIdx, Context *context) _useFullBamTags(false), _headerSet(false), _prevStart(INT_MAX), + _prevChromId(-1), _mustBeForward(false), _mustBeReverse(false), _totalRecordLength(0), @@ -144,28 +145,44 @@ void FileRecordMgr::testInputSortOrder(const Record *record) if (currChrom != _prevChrom) { if ( _foundChroms.find(currChrom) != _foundChroms.end()) { //this is a different chrom than the last record had, but we've already seen this chrom. - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); - QuickString errBuf; - record->print(errBuf); - fprintf(stderr, "%s\n", errBuf.c_str()); - exit(1); + sortError(record, false); } else { //new chrom has not been seen before. + //TBD: test genome file for ChromId. + if (_context->hasGenomeFile()) { + //For BAM records, the chromId of the BAM file will not necessarily be the same as the one from the genome file. + int currChromId = _context->getGenomeFile()->getChromId(currChrom); + if (currChromId < _prevChromId) { + sortError(record, true); + } else { + _prevChromId = currChromId; + } + } _foundChroms.insert(currChrom); _prevChrom = currChrom; _prevStart = INT_MAX; } } else if (record->getStartPos() < _prevStart) { //same chrom as last record, but with lower startPos, so still out of order. - fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); - QuickString errBuf; - record->print(errBuf); - fprintf(stderr, "%s\n", errBuf.c_str()); - exit(1); + sortError(record, false); } _prevStart = record->getStartPos(); } +void FileRecordMgr::sortError(const Record *record, bool genomeFileError) +{ + if (genomeFileError) { + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following record with a different sort order than the genomeFile %s:\n", + _context->getInputFileName(_contextFileIdx).c_str(), _context->getGenomeFile()->getGenomeFileName().c_str()); + } else { + fprintf(stderr, "Error: Sorted input specified, but the file %s has the following out of order record:\n", _context->getInputFileName(_contextFileIdx).c_str()); + } + QuickString errBuf; + record->print(errBuf); + fprintf(stderr, "%s\n", errBuf.c_str()); + exit(1); + +} void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index 46c58a95..d19b31bb 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -157,6 +157,7 @@ private: set<QuickString> _foundChroms; QuickString _prevChrom; int _prevStart; + int _prevChromId; //members for handling merged records DualQueue<Record *, DualQueueAscending > _storedRecords; @@ -173,6 +174,8 @@ private: void allocateFileReader(); void testInputSortOrder(const Record *record); + void sortError(const Record *record, bool genomeFileError); + void deleteAllMergedItemsButKey(RecordKeyList &recList); void addToStorage(Record *record); Record *tryToTakeFromStorage(); diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp index c6ceb8d0..d1007597 100644 --- a/src/utils/GenomeFile/NewGenomeFile.cpp +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -10,7 +10,7 @@ Licensed under the GNU General Public License 2.0 license. ******************************************************************************/ #include "NewGenomeFile.h" - +#include "ParseTools.h" NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) : _maxId(-1) @@ -37,33 +37,41 @@ NewGenomeFile::~NewGenomeFile(void) { void NewGenomeFile::loadGenomeFileIntoMap() { - FILE *fp = fopen(_genomeFileName.c_str(), "r"); - if (fp == NULL) { - fprintf(stderr, "Error: Can't open genome file %s. Exiting...\n", _genomeFileName.c_str()); + + ifstream genFile(_genomeFileName.c_str()); + if (!genFile.good()) { + cerr << "Error: Can't open genome file" << _genomeFileName << "Exiting..." << endl; exit(1); } - char sLine[2048]; - char chrName[2048]; + string sLine; + vector<QuickString> fields; CHRPOS chrSize = 0; - while (!feof(fp)) { - memset(sLine, 0, 2048); - memset(chrName, 0, 2048); + QuickString chrName; + while (!genFile.eof()) { + sLine.clear(); + fields.clear(); chrSize = 0; - fgets(sLine, 2048, fp); - sscanf(sLine, "%s %d", chrName, &chrSize); - if (strlen(sLine) == 0) { + chrName.clear(); + getline(genFile, sLine); + Tokenize(sLine.c_str(), fields); + if (fields.size() != 2) { continue; } + chrName = fields[0]; + chrSize = str2chrPos(fields[1]); _maxId++; - _chromSizeIds[chrName] = pair<CHRPOS, CHRPOS>(chrSize, _maxId); + _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); _genomeLength += chrSize; _chromList.push_back(chrName); - + } + if (_maxId == -1) { + cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; + exit(1); } _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. - fclose(fp); + genFile.close(); } bool NewGenomeFile::projectOnGenome(CHRPOS genome_pos, QuickString &chrom, CHRPOS &start) { @@ -94,6 +102,7 @@ CHRPOS NewGenomeFile::getChromSize(const QuickString &chrom) { _currChromId = iter->second.second; return _currChromSize; } + cerr << "Error: chromosome " << chrom << " is not in the genome file " << _genomeFileName << ". Exiting." << endl; return INT_MAX; } @@ -107,6 +116,9 @@ CHRPOS NewGenomeFile::getChromId(const QuickString &chrom) { _currChromSize = iter->second.first; _currChromId = iter->second.second; return _currChromId; + } else { + cerr << "Error: requested chromosome " << chrom << " does not exist in the genome file " << _genomeFileName << ". Exiting." << endl; + exit(1); } return INT_MAX; } diff --git a/src/utils/GenomeFile/NewGenomeFile.h b/src/utils/GenomeFile/NewGenomeFile.h index afd23b50..1235428c 100644 --- a/src/utils/GenomeFile/NewGenomeFile.h +++ b/src/utils/GenomeFile/NewGenomeFile.h @@ -45,10 +45,10 @@ public: private: QuickString _genomeFileName; - typedef map<QuickString, pair<CHRPOS, CHRPOS> > lookupType; + typedef map<QuickString, pair<CHRPOS, int> > lookupType; lookupType _chromSizeIds; vector<QuickString> _chromList; - CHRPOS _maxId; + int _maxId; // projecting chroms onto a single coordinate system CHRPOS _genomeLength; @@ -57,7 +57,7 @@ private: //cache members for quick lookup QuickString _currChromName; CHRPOS _currChromSize; - CHRPOS _currChromId; + int _currChromId; }; -- GitLab