From 36fdad10676e72fd63d4d0a4fba09e87820d9f1e Mon Sep 17 00:00:00 2001
From: nkindlon <nek3d@virginia.edu>
Date: Fri, 13 Sep 2013 16:03:08 -0400
Subject: [PATCH] resolved chromId problem when doing Chromsweep with BAM file
 sorted differently than genome file.

---
 src/nekSandbox1/nekSandboxMain.cpp            | 40 +++++++++++++++++++
 .../FileReaders/BamFileReader.cpp             |  2 +-
 .../FileReaders/BamFileReader.h               |  2 +-
 .../SingleLineDelimTextFileReader.cpp         |  4 --
 src/utils/FileRecordTools/FileRecordMgr.cpp   | 16 +++++++-
 src/utils/FileRecordTools/FileRecordMgr.h     |  3 +-
 .../FileRecordTools/Records/BamRecord.cpp     |  5 ++-
 src/utils/FileRecordTools/Records/BamRecord.h |  3 ++
 .../FileRecordTools/Records/Bed3Interval.cpp  |  1 -
 .../FileRecordTools/Records/GffRecord.cpp     |  1 -
 10 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/src/nekSandbox1/nekSandboxMain.cpp b/src/nekSandbox1/nekSandboxMain.cpp
index a2922f92..cdc1e0a9 100644
--- a/src/nekSandbox1/nekSandboxMain.cpp
+++ b/src/nekSandbox1/nekSandboxMain.cpp
@@ -33,6 +33,46 @@ int nek_sandbox1_main(int argc,char** argv)
 		cerr << "Error: Need one input file. Use \"-\" for stdin." << endl;
 	}
 
+	ifstream myFile(argv[1]);
+	if (!myFile.good()) {
+		cerr << "Error: Can't open genome file" << argv[1] << "Exiting..." << endl;
+		exit(1);
+	}
+	string sLine;
+	vector<QuickString> fields;
+	QuickString chrName;
+
+	vector<QuickString> chroms;
+	chroms.push_back("1");
+	chroms.push_back("2");
+	chroms.push_back("10");
+	chroms.push_back("11");
+
+	vector<int> chromCounts(4, 0);
+	int chromIdx = 0;
+	while (!myFile.eof()) {
+		sLine.clear();
+		fields.clear();
+		getline(myFile, sLine);
+		if (sLine[0] == '@') {
+			cout << sLine << endl;
+			continue;
+		}
+		Tokenize(sLine.c_str(), fields);
+		const QuickString &currChrom = fields[2];
+		if (currChrom == chroms[chromIdx]) {
+			cout << sLine << endl;
+			chromCounts[chromIdx]++;
+			if (chromCounts[chromIdx] >= 3000) {
+				chromIdx++;
+			}
+			if (chromIdx > 3) {
+				break;
+			}
+		}
+	}
+
+	return 0;
 
 	Context context;
 	context.addInputFile(argv[1]);
diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp
index 2fc71a35..c00b5831 100644
--- a/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp
+++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.cpp
@@ -73,7 +73,7 @@ void BamFileReader::getChrName(QuickString &str) const
 	str =  _references[refId].RefName;
 }
 
-int BamFileReader::getChrId() const
+int BamFileReader::getBamChrId() const
 {
 	return _bamAlignment.RefID;
 }
diff --git a/src/utils/FileRecordTools/FileReaders/BamFileReader.h b/src/utils/FileRecordTools/FileReaders/BamFileReader.h
index 9f8cb426..29032886 100644
--- a/src/utils/FileRecordTools/FileReaders/BamFileReader.h
+++ b/src/utils/FileRecordTools/FileReaders/BamFileReader.h
@@ -41,7 +41,7 @@ public:
 
 
 	void getChrName(QuickString &) const;
-	int getChrId() const;
+	int getBamChrId() const;
 	int getStartPos() const;
 	int getEndPos() const;
 	void getName(QuickString &) const;
diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp
index d3c20bef..12739abe 100644
--- a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp
+++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp
@@ -75,10 +75,6 @@ bool SingleLineDelimTextFileReader::readEntry()
 	if (!findDelimiters()) {
 		return false;
 	}
-	if (_context->hasGenomeFile()) {
-		getField(0, _currChromStr);
-		_currChromId = _context->getGenomeFile()->getChromId(_currChromStr);
-	}
 	return true;
 }
 
diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp
index 61a92216..3721f961 100644
--- a/src/utils/FileRecordTools/FileRecordMgr.cpp
+++ b/src/utils/FileRecordTools/FileRecordMgr.cpp
@@ -128,12 +128,24 @@ Record *FileRecordMgr::allocateAndGetNextRecord()
 	//test for sorted order, if necessary.
 	if (_context->getSortedInput()) {
 		testInputSortOrder(record);
+	} else {
+		assignChromId(record);
 	}
 	_totalRecordLength += (unsigned long)(record->getEndPos() - record->getStartPos());
 	return record;
 }
 
-void FileRecordMgr::testInputSortOrder(const Record *record)
+void FileRecordMgr::assignChromId(Record *record) {
+	const QuickString &currChrom = record->getChrName();
+	if (currChrom != _prevChrom  && _context->hasGenomeFile()) {
+		_prevChromId = _context->getGenomeFile()->getChromId(currChrom);
+		record->setChromId(_prevChromId);
+	} else {
+		record->setChromId(_prevChromId);
+	}
+}
+
+void FileRecordMgr::testInputSortOrder(Record *record)
 {
 	//user specified that file must be sorted. Check that it is so.
 	// TBD: In future versions, we might not want/need all files to be sorted,
@@ -150,7 +162,6 @@ void FileRecordMgr::testInputSortOrder(const Record *record)
 			//new chrom has not been seen before.
 			//TBD: test genome file for ChromId.
 			if (_context->hasGenomeFile()) {
-				//For BAM records, the chromId of the BAM file will not necessarily be the same as the one from the genome file.
 				int currChromId = _context->getGenomeFile()->getChromId(currChrom);
 				if (currChromId < _prevChromId) {
 					sortError(record, true);
@@ -161,6 +172,7 @@ void FileRecordMgr::testInputSortOrder(const Record *record)
 			_foundChroms.insert(currChrom);
 			_prevChrom = currChrom;
 			_prevStart = INT_MAX;
+			record->setChromId(_prevChromId);
 		}
 	} else if (record->getStartPos() < _prevStart) { //same chrom as last record, but with lower startPos, so still out of order.
 		sortError(record, false);
diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h
index d19b31bb..4c73d0fa 100644
--- a/src/utils/FileRecordTools/FileRecordMgr.h
+++ b/src/utils/FileRecordTools/FileRecordMgr.h
@@ -173,7 +173,8 @@ private:
 
 
 	void allocateFileReader();
-	void testInputSortOrder(const Record *record);
+	void testInputSortOrder(Record *record);
+	void assignChromId(Record *);
 	void sortError(const Record *record, bool genomeFileError);
 
 	void deleteAllMergedItemsButKey(RecordKeyList &recList);
diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp
index 94c1fcd9..ba7e1cc2 100644
--- a/src/utils/FileRecordTools/Records/BamRecord.cpp
+++ b/src/utils/FileRecordTools/Records/BamRecord.cpp
@@ -3,6 +3,7 @@
 #include "RecordKeyList.h"
 
 BamRecord::BamRecord()
+: _bamChromId(-1)
 {
 
 }
@@ -31,7 +32,7 @@ bool BamRecord::initFromFile(BamFileReader *bamFileReader)
 {
 	bamFileReader->getChrName(_chrName);
 
-	_chrId = bamFileReader->getCurrChromdId();
+	_bamChromId = bamFileReader->getCurrChromdId();
 	_startPos = bamFileReader->getStartPos();
 	int2str(_startPos, _startPosStr);
 	_endPos = bamFileReader->getEndPos();
@@ -48,7 +49,7 @@ bool BamRecord::initFromFile(BamFileReader *bamFileReader)
 void BamRecord::clear()
 {
 	Bed6Interval::clear();
-
+	_bamChromId = -1;
 	//For now, we're going to not clear the BamAlignment object, as all of its
 	//fields will be reset next time it is used anyway. If testing shows this to be a
 	//problem, we'll revisit.
diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h
index b412b96e..574da454 100644
--- a/src/utils/FileRecordTools/Records/BamRecord.h
+++ b/src/utils/FileRecordTools/Records/BamRecord.h
@@ -36,6 +36,7 @@ public:
 	virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BAM_RECORD_TYPE; }
 
 	const BamTools::BamAlignment &getAlignment() const { return _bamAlignment; }
+	int getBamChromId() const { return _bamChromId; }
 
 protected:
 	virtual ~BamRecord();
@@ -43,6 +44,8 @@ protected:
 
 
 	BamTools::BamAlignment _bamAlignment;
+	int _bamChromId; //different from chromId, because BAM file may be in different order
+	//than the genomeFile.
 
 };
 
diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp
index 5e285dc8..742ea7b9 100644
--- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp
+++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp
@@ -24,7 +24,6 @@ bool Bed3Interval::initFromFile(FileReader *fileReader)
 bool Bed3Interval::initFromFile(SingleLineDelimTextFileReader *fileReader)
 {
 	fileReader->getField(0, _chrName);
-	_chrId = fileReader->getCurrChromdId();
 	fileReader->getField(1, _startPosStr);
 	fileReader->getField(2, _endPosStr);
 	_startPos = str2chrPos(_startPosStr);
diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp
index 5a6cc6fa..cceb01ac 100644
--- a/src/utils/FileRecordTools/Records/GffRecord.cpp
+++ b/src/utils/FileRecordTools/Records/GffRecord.cpp
@@ -24,7 +24,6 @@ void GffRecord::clear()
 bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
 {
 	fileReader->getField(0, _chrName);
-	_chrId = fileReader->getCurrChromdId();
 	fileReader->getField(3, _startPosStr);
 	_startPos = str2chrPos(_startPosStr);
 	_startPos--; // VCF is one-based. Here we intentionally don't decrement the string version,
-- 
GitLab