diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 32643ac9fc851a20c5ef92b625ae4e31067976e4..87b998d7e66ece23bccef488f368ecba55be65d2 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -37,7 +37,7 @@ FileIntersect::~FileIntersect(void) { delete _recordOutputMgr; } -void FileIntersect::processHits(RecordKeyList &hits) { +void FileIntersect::processHits(RecordKeyVector &hits) { _recordOutputMgr->printRecord(hits); } @@ -58,11 +58,11 @@ bool FileIntersect::processSortedFiles() return false; } - RecordKeyList hitSet; + RecordKeyVector hitSet; while (sweep.next(hitSet)) { if (_context->getObeySplits()) { - RecordKeyList keySet(hitSet.getKey()); - RecordKeyList resultSet(hitSet.getKey()); + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); processHits(resultSet); } else { @@ -85,11 +85,11 @@ bool FileIntersect::processUnsortedFiles() if (queryRecord == NULL) { continue; } - RecordKeyList hitSet(queryRecord); + RecordKeyVector hitSet(queryRecord); binTree->getHits(queryRecord, hitSet); if (_context->getObeySplits()) { - RecordKeyList keySet(hitSet.getKey()); - RecordKeyList resultSet; + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet; _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); processHits(resultSet); } else { diff --git a/src/intersectFile/intersectFile.h b/src/intersectFile/intersectFile.h index 869e7e159160476eb65f4e7c52431f7cca90f174..585de04427b49cd75f3b620e445f0dbb54c7ac7b 100644 --- a/src/intersectFile/intersectFile.h +++ b/src/intersectFile/intersectFile.h @@ -35,7 +35,7 @@ private: BlockMgr *_blockMgr; RecordOutputMgr *_recordOutputMgr; - void processHits(RecordKeyList &hits); + void processHits(RecordKeyVector &hits); bool processSortedFiles(); bool processUnsortedFiles(); diff --git a/src/intersectFile/intersectMain.cpp b/src/intersectFile/intersectMain.cpp index 98f140fe4e9f2b6c20d77c4f8cb3a481128131e8..fe33bf9a4ab90cacf3cdc7dede6374f19d3edf52 100644 --- a/src/intersectFile/intersectMain.cpp +++ b/src/intersectFile/intersectMain.cpp @@ -12,6 +12,7 @@ #include "intersectFile.h" #include "ContextIntersect.h" +#include "CommonHelp.h" using namespace std; @@ -47,6 +48,9 @@ void intersect_help(void) { cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + cerr << "\t\t" << "Note: -b may be followed with multiple databases and/or " << endl; + cerr << "\t\t" "wildcard (*) character(s). " << endl; + cerr << "Options: " << endl; cerr << "\t-abam\t" << "The A input file is in BAM format. Output will be BAM as well." << endl << endl; @@ -120,6 +124,17 @@ void intersect_help(void) { cerr <<"\t\tother software tools and scripts that need to process one" << endl; cerr <<"\t\tline of bedtools output at a time." << endl << endl; + cerr << "\t-names\t" << "When using multiple databases, provide an alias for each that" << endl; + cerr <<"\t\twill appear instead of a fileId when also printing the DB record." << endl << endl; + + cerr << "\t-filenames" << "\tWhen using multiple databases, show each complete filename" << endl; + cerr <<"\t\t\tinstead of a fileId when also printing the DB record." << endl << endl; + + cerr << "\t-sortout\t" << "When using multiple databases, sort the output DB hits" << endl; + cerr << "\t\t\tfor each record." << endl << endl; + + CommonHelp(); + cerr << "Notes: " << endl; cerr << "\t(1) When a BAM file is used for the A file, the alignment is retained if overlaps exist," << endl; cerr << "\tand exlcuded if an overlap cannot be found. If multiple overlaps exist, they are not" << endl; diff --git a/src/jaccard/Jaccard.cpp b/src/jaccard/Jaccard.cpp index cf3d7136d393d154f72b86a0062fc9987a0bdb83..3ccda00d34320b5a7b8134641a18d642f8eac069 100644 --- a/src/jaccard/Jaccard.cpp +++ b/src/jaccard/Jaccard.cpp @@ -49,11 +49,11 @@ bool Jaccard::getIntersectionAndUnion() { if (!sweep.init()) { return false; } - RecordKeyList hitSet; + RecordKeyVector hitSet; while (sweep.next(hitSet)) { if (_context->getObeySplits()) { - RecordKeyList keySet(hitSet.getKey()); - RecordKeyList resultSet(hitSet.getKey()); + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); _intersectionVal += getTotalIntersection(&resultSet); } else { @@ -69,7 +69,7 @@ bool Jaccard::getIntersectionAndUnion() { return true; } -unsigned long Jaccard::getTotalIntersection(RecordKeyList *recList) +unsigned long Jaccard::getTotalIntersection(RecordKeyVector *recList) { unsigned long intersection = 0; const Record *key = recList->getKey(); @@ -77,8 +77,8 @@ unsigned long Jaccard::getTotalIntersection(RecordKeyList *recList) int keyEnd = key->getEndPos(); int hitIdx = 0; - for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) { - const Record *currRec = iter->value(); + for (RecordKeyVector::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) { + const Record *currRec = *iter; int maxStart = max(currRec->getStartPos(), keyStart); int minEnd = min(currRec->getEndPos(), keyEnd); if (_context->getObeySplits()) { diff --git a/src/jaccard/Jaccard.h b/src/jaccard/Jaccard.h index 21a90f6fb94d9108e41a71c08b7572d18ff8a915..3d61e8f8c00fa5121cfebff4d10b519ce75bcad4 100644 --- a/src/jaccard/Jaccard.h +++ b/src/jaccard/Jaccard.h @@ -33,7 +33,7 @@ private: int _numIntersections; bool getIntersectionAndUnion(); - unsigned long getTotalIntersection(RecordKeyList *hits); + unsigned long getTotalIntersection(RecordKeyVector *hits); }; #endif /* JACCARD_H */ diff --git a/src/mapFile/mapFile.cpp b/src/mapFile/mapFile.cpp index 8dbf24ad54a2e93715fd453092e778efcdf24b0c..e78b59c0b6a9ccd1c5bef5efe6321bcb824b841c 100644 --- a/src/mapFile/mapFile.cpp +++ b/src/mapFile/mapFile.cpp @@ -41,11 +41,11 @@ bool FileMap::mapFiles() if (!sweep.init()) { return false; } - RecordKeyList hitSet; + RecordKeyVector hitSet; while (sweep.next(hitSet)) { if (_context->getObeySplits()) { - RecordKeyList keySet(hitSet.getKey()); - RecordKeyList resultSet(hitSet.getKey()); + RecordKeyVector keySet(hitSet.getKey()); + RecordKeyVector resultSet(hitSet.getKey()); _blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet); _recordOutputMgr->printRecord(resultSet.getKey(), _context->getColumnOpsVal(resultSet)); } else { diff --git a/src/mapFile/mapFile.h b/src/mapFile/mapFile.h index 38cbcda1f1b1e78cb0bde8cb2c0ab00b37c3e0e2..df92220235302d7a5dbd534a98fb10af058a943c 100644 --- a/src/mapFile/mapFile.h +++ b/src/mapFile/mapFile.h @@ -15,7 +15,7 @@ #include <sstream> #include <iomanip> #include "VectorOps.h" -#include "RecordKeyList.h" +#include "RecordKeyVector.h" #include "ContextMap.h" using namespace std; diff --git a/src/mergeFile/mergeFile.cpp b/src/mergeFile/mergeFile.cpp index f772629c440db0fae1ab9e7535491fc6d2edf4eb..e4a3c863c579fd78eeaf8d3946cb5aeeef96fdac 100644 --- a/src/mergeFile/mergeFile.cpp +++ b/src/mergeFile/mergeFile.cpp @@ -28,7 +28,7 @@ MergeFile::~MergeFile() bool MergeFile::merge() { - RecordKeyList hitSet; + RecordKeyVector hitSet; FileRecordMgr *frm = _context->getFile(0); while (!frm->eof()) { Record *key = frm->getNextRecord(&hitSet); diff --git a/src/utils/BinTree/BinTree.cpp b/src/utils/BinTree/BinTree.cpp index 253dea2798320bfd7886931213612b2c21931baf..5dce775b06c9facfb02d54d757e74d43d76fb909 100644 --- a/src/utils/BinTree/BinTree.cpp +++ b/src/utils/BinTree/BinTree.cpp @@ -3,8 +3,7 @@ BinTree::BinTree(ContextIntersect *context) -: _databaseFile(NULL), - _context(context), +: _context(context), _binOffsetsExtended(NULL), _showBinMetrics(false), _maxBinNumFound(0) @@ -36,7 +35,7 @@ BinTree::~BinTree() { } for (innerListIterType listIter = bin->begin(); listIter != bin->end(); listIter = bin->next()) { const Record *record = listIter->value(); - _databaseFile->deleteRecord(record); + _context->getFile(record->getFileIdx())->deleteRecord(record); } delete bin; bin = NULL; @@ -70,25 +69,27 @@ BinTree::~BinTree() { void BinTree::loadDB() { - _databaseFile = _context->getFile(_context->getDatabaseFileIdx()); - - Record *record = NULL; - while (!_databaseFile->eof()) { - record = _databaseFile->getNextRecord(); - //In addition to NULL records, we also don't want to add unmapped reads. - if (record == NULL || record->isUnmapped()) { - continue; - } + for (int i=0; i < _context->getNumDatabaseFiles(); i++) { + FileRecordMgr *databaseFile = _context->getDatabaseFile(i); + + Record *record = NULL; + while (!databaseFile->eof()) { + record = databaseFile->getNextRecord(); + //In addition to NULL records, we also don't want to add unmapped reads. + if (record == NULL || record->isUnmapped()) { + continue; + } - if (!addRecordToTree(record)) { - fprintf(stderr, "ERROR: Unable to add record to tree.\n"); - _databaseFile->close(); - exit(1); + if (!addRecordToTree(record)) { + fprintf(stderr, "ERROR: Unable to add record to tree.\n"); + databaseFile->close(); + exit(1); + } } } } -void BinTree::getHits(Record *record, RecordKeyList &hitSet) +void BinTree::getHits(Record *record, RecordKeyVector &hitSet) { if (_showBinMetrics) { return; //don't care about query entries just yet. @@ -149,6 +150,9 @@ void BinTree::getHits(Record *record, RecordKeyList &hitSet) startBin >>= _binNextShift; endBin >>= _binNextShift; } + if (_context->getSortOutput()) { + hitSet.sortVector(); + } } bool BinTree::addRecordToTree(const Record *record) diff --git a/src/utils/BinTree/BinTree.h b/src/utils/BinTree/BinTree.h index 161696c1739763530f21a500a161cea0ef06aad1..cd981c293e18c75b4276593d1795b72c5e3d49a3 100644 --- a/src/utils/BinTree/BinTree.h +++ b/src/utils/BinTree/BinTree.h @@ -28,11 +28,10 @@ public: ~BinTree(); void loadDB(); - void getHits(Record *record, RecordKeyList &hitSet); + void getHits(Record *record, RecordKeyVector &hitSet); private: - FileRecordMgr *_databaseFile; ContextIntersect *_context; // @@ -52,8 +51,8 @@ private: static const uint32_t _binFirstShift = 14; /* How much to shift to get to finest bin. */ static const uint32_t _binNextShift = 3; /* How much to shift to get to next larger bin. */ - typedef BTlist<const Record *> innerListType; - typedef const BTlistNode<const Record *> * innerListIterType; + typedef RecordList innerListType; + typedef const RecordListNode * innerListIterType; typedef innerListType * binType; typedef binType * allBinsType; typedef QuickString mainKeyType; diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 60fbd9330585d673b26ff596a22b3d5fd707f421..6fa01c1b5fc0bf241404afe422da0ef0ec219109 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -36,18 +36,17 @@ ContextBase::ContextBase() _reciprocal(false), _sameStrand(false), _diffStrand(false), - _sortedInput(false), + _sortedInput(false), + _sortOutput(false), + _reportDBnameTags(false), + _reportDBfileNames(false), _printHeader(false), _printable(true), _explicitBedOutput(false), _queryFileIdx(-1), - _databaseFileIdx(-1), _bamHeaderAndRefIdx(-1), _maxNumDatabaseFields(0), _useFullBamTags(false), - _reportCount(false), - _reportNames(false), - _reportScores(false), _numOutputRecords(0), _hasConstantSeed(false), _seed(0), @@ -193,6 +192,9 @@ bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if (strcmp(_argv[_i], "-delim") == 0) { if (!handle_delim()) return false; } + else if (strcmp(_argv[_i], "-sortout") == 0) { + if (!handle_sortout()) return false; + } } return true; @@ -210,7 +212,11 @@ bool ContextBase::isValidState() return false; } if (hasColumnOpsMethods()) { - FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0); + + //TBD: Adjust column ops for multiple databases. + //For now, use last file. +// FileRecordMgr *dbFile = getFile(hasIntersectMethods() ? _databaseFileIdx : 0); + FileRecordMgr *dbFile = getFile(getNumInputFiles()-1); _keyListOps->setDBfileType(dbFile->getFileType()); if (!_keyListOps->isValidColumnOps(dbFile)) { return false; @@ -251,7 +257,7 @@ bool ContextBase::openFiles() { _files.resize(_fileNames.size()); for (int i = 0; i < (int)_fileNames.size(); i++) { - FileRecordMgr *frm = getNewFRM(_fileNames[i]); + FileRecordMgr *frm = getNewFRM(_fileNames[i], i); if (hasGenomeFile()) { frm->setGenomeFile(_genomeFile); } @@ -281,7 +287,7 @@ int ContextBase::getBamHeaderAndRefIdx() { if (_files[_queryFileIdx]->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { _bamHeaderAndRefIdx = _queryFileIdx; } else { - _bamHeaderAndRefIdx = _databaseFileIdx; + _bamHeaderAndRefIdx = _dbFileIdxs[0]; } return _bamHeaderAndRefIdx; } @@ -492,6 +498,13 @@ bool ContextBase::handle_delim() return true; } +bool ContextBase::handle_sortout() +{ + setSortOutput(true); + markUsed(_i - _skipFirstArgs); + return true; +} + void ContextBase::setColumnOpsMethods(bool val) { if (val && !_hasColumnOpsMethods) { @@ -501,20 +514,24 @@ void ContextBase::setColumnOpsMethods(bool val) _hasColumnOpsMethods = val; } -const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { +const QuickString &ContextBase::getColumnOpsVal(RecordKeyVector &keyList) const { if (!hasColumnOpsMethods()) { return _nullStr; } return _keyListOps->getOpVals(keyList); } -FileRecordMgr *ContextBase::getNewFRM(const QuickString &filename) { - if (!_useMergedIntervals) { - return new FileRecordMgr(filename); - } else { +FileRecordMgr *ContextBase::getNewFRM(const QuickString &filename, int fileIdx) { + + if (_useMergedIntervals) { FileRecordMergeMgr *frm = new FileRecordMergeMgr(filename); frm->setStrandType(_desiredStrand); frm->setMaxDistance(_maxDistance); + frm->setFileIdx(fileIdx); + return frm; + } else { + FileRecordMgr *frm = new FileRecordMgr(filename); + frm->setFileIdx(fileIdx); return frm; } } diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index d28b9e6ade6e4e5fcd5cd3591962ffd3c52eab6a..1c6cecbdbe66ab8dafcb11875bf0a9835b2ba331 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -98,6 +98,15 @@ public: virtual bool getSortedInput() const {return _sortedInput; } virtual void setSortedInput(bool val) { _sortedInput = val; } + virtual bool getSortOutput() const {return _sortOutput; } + virtual void setSortOutput(bool val) { _sortOutput = val; } + + virtual bool getUseDBnameTags() const { return _reportDBnameTags; } + virtual void setUseDBnameTags(bool val) { _reportDBnameTags = val; } + + virtual bool getUseDBfileNames() const { return _reportDBfileNames; } + virtual void setUseDBfileNames(bool val) { _reportDBfileNames = val; } + virtual bool getPrintHeader() const {return _printHeader; } virtual void setPrintHeader(bool val) { _printHeader = val; } @@ -107,24 +116,6 @@ public: virtual bool getUseFullBamTags() const { return _useFullBamTags; } virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; } -// // -// // MERGE METHODS -// // -// virtual bool getReportCount() const { return _reportCount; } -// virtual void setReportCount(bool val) { _reportCount = val; } -// -// virtual int getMaxDistance() const { return _maxDistance; } -// virtual void setMaxDistance(int distance) { _maxDistance = distance; } -// -// virtual bool getReportNames() const { return _reportNames; } -// virtual void setReportNames(bool val) { _reportNames = val; } -// -// virtual bool getReportScores() const { return _reportScores; } -// virtual void setReportScores(bool val) { _reportScores = val; } -// -// virtual const QuickString &getScoreOp() const { return _scoreOp; } -// virtual void setScoreOp(const QuickString &op) { _scoreOp = op; } - // METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER // OF OUTPUT RECORDS. @@ -150,7 +141,7 @@ public: // are available. void setColumnOpsMethods(bool val); virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; } - const QuickString &getColumnOpsVal(RecordKeyList &keyList) const; + const QuickString &getColumnOpsVal(RecordKeyVector &keyList) const; //methods applicable only to column operations. protected: @@ -192,18 +183,19 @@ protected: bool _sameStrand; bool _diffStrand; bool _sortedInput; + bool _sortOutput; + bool _reportDBnameTags; + bool _reportDBfileNames; bool _printHeader; bool _printable; bool _explicitBedOutput; int _queryFileIdx; - int _databaseFileIdx; + vector<int> _dbFileIdxs; + vector<QuickString> _dbNameTags; + map<int, int> _fileIdsToDbIdxs; int _bamHeaderAndRefIdx; int _maxNumDatabaseFields; bool _useFullBamTags; - bool _reportCount; - bool _reportNames; - bool _reportScores; - QuickString _scoreOp; int _numOutputRecords; @@ -227,7 +219,7 @@ protected: bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); bool openFiles(); - virtual FileRecordMgr *getNewFRM(const QuickString &filename); + virtual FileRecordMgr *getNewFRM(const QuickString &filename, int fileIdx); //set cmd line params and counter, i, as members so code //is more readable (as opposed to passing all 3 everywhere). @@ -256,6 +248,7 @@ protected: virtual bool handle_o(); virtual bool handle_null(); virtual bool handle_delim(); + virtual bool handle_sortout(); bool parseIoBufSize(QuickString bufStr); diff --git a/src/utils/Contexts/ContextIntersect.cpp b/src/utils/Contexts/ContextIntersect.cpp index fecf51d115ebde0c87446c9341cb95e1cf4b89ea..1ef4b3d312f4fc6323a1595306828c6fd1eda9f6 100644 --- a/src/utils/Contexts/ContextIntersect.cpp +++ b/src/utils/Contexts/ContextIntersect.cpp @@ -44,6 +44,12 @@ bool ContextIntersect::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { else if (strcmp(_argv[_i], "-b") == 0) { if (!handle_b()) return false; } + else if (strcmp(_argv[_i], "-names") == 0) { + if (!handle_names()) return false; + } + else if (strcmp(_argv[_i], "-filenames") == 0) { + if (!handle_filenames()) return false; + } else if (strcmp(_argv[_i], "-u") == 0) { if (!handle_u()) return false; } @@ -92,7 +98,7 @@ bool ContextIntersect::isValidState() return false; } - if (_queryFileIdx == -1 || _databaseFileIdx == -1) { + if (_queryFileIdx == -1 || _dbFileIdxs.size() == -0) { _errorMsg = "\n***** ERROR: query and database files not specified. *****"; return false; } @@ -113,6 +119,11 @@ bool ContextIntersect::isValidState() return false; } } + if (getUseDBnameTags() && _dbNameTags.size() != _dbFileIdxs.size()) { + _errorMsg = "\n***** ERROR: Number of database name tags given does not match number of databases. *****"; + return false; + + } if (getWriteOverlap()) { if (getWriteA()) { @@ -149,7 +160,7 @@ bool ContextIntersect::isValidState() if (getAnyHit() || getNoHit() || getWriteCount()) { setPrintable(false); } - if (_files.size() != 2 ) { + if (_files.size() < 2 ) { return false; } return true; @@ -161,8 +172,8 @@ bool ContextIntersect::determineOutputType() { } //determine the maximum number of database fields. - for (int i=0; i < (int)_files.size(); i++) { - int numFields = _files[i]->getNumFields(); + for (int i=0; i < getNumDatabaseFiles(); i++) { + int numFields = getDatabaseFile(i)->getNumFields(); if ( numFields > _maxNumDatabaseFields) { _maxNumDatabaseFields = numFields; } @@ -211,19 +222,46 @@ bool ContextIntersect::handle_abam() bool ContextIntersect::handle_b() { if (_argc <= _i+1) { - _errorMsg = "\n***** ERROR: -b option given, but no query file specified. *****"; + _errorMsg = "\n***** ERROR: -b option given, but no database file specified. *****"; return false; } - addInputFile(_argv[_i+1]); - _databaseFileIdx = getNumInputFiles() -1; - markUsed(_i - _skipFirstArgs); - _i++; - markUsed(_i - _skipFirstArgs); + do { + addInputFile(_argv[_i+1]); + int fileId = getNumInputFiles() -1; + _dbFileIdxs.push_back(fileId); + _fileIdsToDbIdxs[fileId] = _dbFileIdxs.size() -1; + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } while (_argc > _i+1 && _argv[_i+1][0] != '-'); return true; } +bool ContextIntersect::handle_names() +{ + if (_argc <= _i+1) { + _errorMsg = "\n***** ERROR: -b option given, but no database names specified. *****"; + return false; + } + + do { + addDatabaseNameTag(_argv[_i+1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + } while (_argc > _i+1 && _argv[_i+1][0] != '-'); + setUseDBnameTags(true); + return true; +} + +bool ContextIntersect::handle_filenames() +{ + markUsed(_i - _skipFirstArgs); + setUseDBfileNames(true); + return true; +} bool ContextIntersect::handle_c() { diff --git a/src/utils/Contexts/ContextIntersect.h b/src/utils/Contexts/ContextIntersect.h index b066e9465eac744b8bed21dcd3c39c935407345c..9bb20464124bbae9e2d7d076158bff5921ac9fd9 100644 --- a/src/utils/Contexts/ContextIntersect.h +++ b/src/utils/Contexts/ContextIntersect.h @@ -22,19 +22,22 @@ public: //NOTE: Query and database files will only be marked as such by either the //parseCmdArgs method, or by explicitly setting them. FileRecordMgr *getQueryFile() { return getFile(_queryFileIdx); } - FileRecordMgr *getDatabaseFile() { return getFile(_databaseFileIdx); } + FileRecordMgr *getDatabaseFile(int idx) { return getFile(_dbFileIdxs[idx]); } int getQueryFileIdx() const { return _queryFileIdx; } void setQueryFileIdx(int idx) { _queryFileIdx = idx; } - int getDatabaseFileIdx() const { return _databaseFileIdx; } - void setDatabaseFileIdx(int idx) { _databaseFileIdx = idx; } + int getNumDatabaseFiles() { return (int)_dbFileIdxs.size(); } + const vector<int> &getDbFileIdxs() const { return _dbFileIdxs; } const QuickString &getQueryFileName() const { return _files[_queryFileIdx]->getFileName(); } - const QuickString &getDatabaseFileName() const { return _files[_databaseFileIdx]->getFileName(); } + const QuickString &getDatabaseFileName(int idx) const { return _files[_dbFileIdxs[idx]]->getFileName(); } ContextFileType getQueryFileType() const { return _files[_queryFileIdx]->getFileType(); } - ContextFileType getDatabaseFileType() const { return _files[_databaseFileIdx]->getFileType(); } + ContextFileType getDatabaseFileType(int idx) const { return _files[_dbFileIdxs[idx]]->getFileType(); } ContextRecordType getQueryRecordType() const { return _files[_queryFileIdx]->getRecordType(); } - ContextRecordType getDatabaseRecordType() const { return _files[_databaseFileIdx]->getRecordType(); } + ContextRecordType getDatabaseRecordType(int idx) const { return _files[_dbFileIdxs[idx]]->getRecordType(); } int getMaxNumDatabaseFields() const { return _maxNumDatabaseFields; } void setMaxNumDatabaseFields(int val) { _maxNumDatabaseFields = val; } + int getDbIdx(int fileId) { return _fileIdsToDbIdxs.find(fileId)->second; } + void addDatabaseNameTag(const QuickString &tag) { _dbNameTags.push_back(tag); } + const QuickString &getDatabaseNameTag(int dbIdx) const { return _dbNameTags[dbIdx]; } bool getAnyHit() const {return _anyHit; } void setAnyHit(bool val) { _anyHit = val; } @@ -83,6 +86,9 @@ private: virtual bool handle_a(); virtual bool handle_abam(); virtual bool handle_b(); + virtual bool handle_names(); + virtual bool handle_filenames(); + virtual bool handle_c(); virtual bool handle_f(); virtual bool handle_loj(); diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp index e6abc519501a1e5320c5aad4103bd04fc083096a..1d79d7a690d750bbdbe1435e42c05bcacb2a5f24 100644 --- a/src/utils/Contexts/ContextMap.cpp +++ b/src/utils/Contexts/ContextMap.cpp @@ -47,3 +47,17 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { } return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); } + +bool ContextMap::isValidState() +{ + if (!ContextIntersect::isValidState()) { + return false; + } + + // Multiple databases are currently not supported + if (getNumDatabaseFiles() > 1) { + _errorMsg = "\n***** ERROR: multiple database files currently not supported for map. *****"; + return false; + } + return true; +} diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index fb23f4d5412fedcd3599f3951fd5e24bd76dbfb0..53240af65f9a7e1cce65fab3ed7a13c1aadbefda 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -16,6 +16,8 @@ public: virtual ~ContextMap(); virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); virtual bool hasIntersectMethods() const { return true; } + virtual bool isValidState(); + private: diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.cpp b/src/utils/FileRecordTools/FileReaders/FileReader.cpp index 967b82a0052c5b1c374efc32b4349e1e84d57768..de495810bf862c8816285643e64824e1662b31b1 100644 --- a/src/utils/FileRecordTools/FileReaders/FileReader.cpp +++ b/src/utils/FileRecordTools/FileReaders/FileReader.cpp @@ -4,7 +4,7 @@ #include "BufferedStreamMgr.h" FileReader::FileReader() -: +: _fileIdx(-1), _bufStreamMgr(NULL), _isFileOpen(false), _currChromId(-1) diff --git a/src/utils/FileRecordTools/FileReaders/FileReader.h b/src/utils/FileRecordTools/FileReaders/FileReader.h index 73a968bace4b39e9b0b5c6536e959834a1cf88d1..48b861ad70876587a89a5927fa4f3edfecb8e6b2 100644 --- a/src/utils/FileRecordTools/FileReaders/FileReader.h +++ b/src/utils/FileRecordTools/FileReaders/FileReader.h @@ -16,6 +16,9 @@ public: FileReader(); virtual ~FileReader(); void setFileName(const string &filename) { _filename = filename; } + virtual int getFileIdx() const { return _fileIdx; } + virtual void setFileIdx(int fileIdx) { _fileIdx = fileIdx; } + void setInputStream(BufferedStreamMgr *bufStreamMgr) { _bufStreamMgr = bufStreamMgr; _isFileOpen = true; @@ -31,6 +34,7 @@ public: virtual const QuickString &getHeader() const =0; virtual int getNumFields() const = 0; protected: + int _fileIdx; string _filename; BufferedStreamMgr *_bufStreamMgr; diff --git a/src/utils/FileRecordTools/FileReaders/Makefile b/src/utils/FileRecordTools/FileReaders/Makefile index 37ff6e89f0cc87f1b4597dbc2f6fba8a6b9b4a8e..20c9c0ed179eec073da20a253e2fe301bc52e310 100644 --- a/src/utils/FileRecordTools/FileReaders/Makefile +++ b/src/utils/FileRecordTools/FileReaders/Makefile @@ -17,10 +17,10 @@ INCLUDES = -I$(UTILITIES_DIR)/BamTools/include/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= FileReader.h FileReader.cpp SingleLineDelimTransferBuffer.h SingleLineDelimTransferBuffer.cpp \ +SOURCES= FileReader.h FileReader.cpp \ SingleLineDelimTextFileReader.h SingleLineDelimTextFileReader.cpp BamFileReader.h BamFileReader.cpp \ BufferedStreamMgr.h BufferedStreamMgr.cpp InputStreamMgr.h InputStreamMgr.cpp -OBJECTS= FileReader.o SingleLineDelimTransferBuffer.o SingleLineDelimTextFileReader.o BamFileReader.o BufferedStreamMgr.o InputStreamMgr.o +OBJECTS= FileReader.o SingleLineDelimTextFileReader.o BamFileReader.o BufferedStreamMgr.o InputStreamMgr.o _EXT_OBJECTS=ParseTools.o QuickString.o CompressionTools.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -35,7 +35,7 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/FileReader.o $(OBJ_DIR)SingleLineDelimTransferBuffer $(OBJ_DIR)/SingleLineDelimTextFileReader.o \ + @rm -f $(OBJ_DIR)/FileReader.o $(OBJ_DIR)/SingleLineDelimTextFileReader.o \ $(OBJ_DIR)/BinaryFileReader.o $(OBJ_DIR)/BamFileReader.o $(OBJ_DIR)/BufferedStreamMgr.o $(OBJ_DIR)/InputStreamMgr.o $(OBJ_DIR)/PushBackGzStream.o .PHONY: clean diff --git a/src/utils/FileRecordTools/FileReaders/Makefile~ b/src/utils/FileRecordTools/FileReaders/Makefile~ new file mode 100644 index 0000000000000000000000000000000000000000..37ff6e89f0cc87f1b4597dbc2f6fba8a6b9b4a8e --- /dev/null +++ b/src/utils/FileRecordTools/FileReaders/Makefile~ @@ -0,0 +1,41 @@ +OBJ_DIR = ../../../../obj/ +BIN_DIR = ../../../../bin/ +UTILITIES_DIR = ../../../utils/ +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/BamTools/include/ \ + -I$(UTILITIES_DIR)/BamTools/src/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/version/ + + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= FileReader.h FileReader.cpp SingleLineDelimTransferBuffer.h SingleLineDelimTransferBuffer.cpp \ + SingleLineDelimTextFileReader.h SingleLineDelimTextFileReader.cpp BamFileReader.h BamFileReader.cpp \ + BufferedStreamMgr.h BufferedStreamMgr.cpp InputStreamMgr.h InputStreamMgr.cpp +OBJECTS= FileReader.o SingleLineDelimTransferBuffer.o SingleLineDelimTextFileReader.o BamFileReader.o BufferedStreamMgr.o InputStreamMgr.o +_EXT_OBJECTS=ParseTools.o QuickString.o CompressionTools.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/FileReader.o $(OBJ_DIR)SingleLineDelimTransferBuffer $(OBJ_DIR)/SingleLineDelimTextFileReader.o \ + $(OBJ_DIR)/BinaryFileReader.o $(OBJ_DIR)/BamFileReader.o $(OBJ_DIR)/BufferedStreamMgr.o $(OBJ_DIR)/InputStreamMgr.o $(OBJ_DIR)/PushBackGzStream.o + +.PHONY: clean diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.cpp b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp index 7c7461fa0f4a98458cbb52d79857b50d78c46faa..40e42b3f3352c483c9cf83ec9baffca513bfb971 100644 --- a/src/utils/FileRecordTools/FileRecordMergeMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp @@ -16,7 +16,7 @@ FileRecordMergeMgr::FileRecordMergeMgr(const QuickString & filename) } //Record *FileRecordMergeMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { -// RecordKeyList recList; +// RecordKeyVector recList; // if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { // return NULL; // } @@ -24,7 +24,7 @@ FileRecordMergeMgr::FileRecordMergeMgr(const QuickString & filename) // return const_cast<Record *>(recList.getKey()); //want key to be non-const //} -Record *FileRecordMergeMgr::getNextRecord(RecordKeyList *recList) +Record *FileRecordMergeMgr::getNextRecord(RecordKeyVector *recList) { //clear the recList if there is one, and if it has records // in it. @@ -191,7 +191,7 @@ Record *FileRecordMergeMgr::tryToTakeFromStorage(Record::strandType strand) { return record; } -void FileRecordMergeMgr::deleteMergedRecord(RecordKeyList &recList) +void FileRecordMergeMgr::deleteMergedRecord(RecordKeyVector &recList) { deleteAllMergedItemsButKey(recList); deleteRecord(recList.getKey()); @@ -203,15 +203,15 @@ bool FileRecordMergeMgr::eof(){ } -void FileRecordMergeMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { +void FileRecordMergeMgr::deleteAllMergedItemsButKey(RecordKeyVector &recList) { //if the key is also in the list, this method won't delete it. - for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { - if (iter->value() == recList.getKey()) { + for (RecordKeyVector::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + if (*iter == recList.getKey()) { continue; } - deleteRecord(iter->value()); + deleteRecord(*iter); } - recList.clearList(); + recList.clearVector(); } diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.h b/src/utils/FileRecordTools/FileRecordMergeMgr.h index aecdeefd856debcbc29dbf352165b1a78c936f15..ce6cfcfa5bc5a2e865b2c4b1b900563757067cbf 100644 --- a/src/utils/FileRecordTools/FileRecordMergeMgr.h +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.h @@ -22,13 +22,13 @@ public: // // This will give a single "meta" record containing "flattened" or merged records. // - // Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, + // Pass an empty RecordKeyVector. When done, will have a pair: 1st is the final merged record, // second is list of constituent Records merged. // /////////////////////////////////////////////////////////////////////////////////// - Record *getNextRecord(RecordKeyList *keyList = NULL); - void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! + Record *getNextRecord(RecordKeyVector *keyList = NULL); + void deleteMergedRecord(RecordKeyVector &recList); // MUST use this method for cleanup! bool eof(); @@ -48,7 +48,7 @@ private: int _maxDistance; StrandQueue _storedRecords; - void deleteAllMergedItemsButKey(RecordKeyList &recList); + void deleteAllMergedItemsButKey(RecordKeyVector &recList); void addToStorage(Record *record); Record *tryToTakeFromStorage(); Record *tryToTakeFromStorage(Record::strandType strand); diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index fc8a986e5dd416344fef8e3b51751aa76136bb81..4386961115812378a65582bc31686383161338e1 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -5,7 +5,7 @@ #include "NewGenomeFile.h" FileRecordMgr::FileRecordMgr(const QuickString &filename) -: +: _fileIdx(-1), _filename(filename), _bufStreamMgr(NULL), _fileReader(NULL), @@ -90,7 +90,7 @@ bool FileRecordMgr::eof(){ return _fileReader->eof(); } -Record *FileRecordMgr::getNextRecord(RecordKeyList *keyList) +Record *FileRecordMgr::getNextRecord(RecordKeyVector *keyList) { if (!_fileReader->isOpen()) { return NULL; @@ -203,7 +203,7 @@ void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } -void FileRecordMgr::deleteRecord(RecordKeyList *keyList) { +void FileRecordMgr::deleteRecord(RecordKeyVector *keyList) { _recordMgr->deleteRecord(keyList->getKey()); } @@ -224,6 +224,7 @@ void FileRecordMgr::allocateFileReader() default: break; } + _fileReader->setFileIdx(_fileIdx); } const BamTools::RefVector & FileRecordMgr::getBamReferences() { diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index 95e459e7add14b299fc45e6043d71f89aeb1d2db..ac3bc1289d98ff403d328c6148fdf21b9cbe9273 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -22,7 +22,7 @@ //record manager and all record classes #include "RecordMgr.h" -#include "RecordKeyList.h" +#include "RecordKeyVector.h" #include "BlockMgr.h" using namespace std; @@ -37,14 +37,19 @@ public: bool open(); void close(); virtual bool eof(); + void setFileIdx(int fileIdx) { _fileIdx = fileIdx; } + int getFileIdx() const { return _fileIdx; } + //This is an all-in-one method to give the user a new record that is initialized with //the next entry in the data file. //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! //Also Note! User must check for NULL returned, meaning we failed to get the next record. - virtual Record *getNextRecord(RecordKeyList *keyList = NULL); + virtual Record *getNextRecord(RecordKeyVector *keyList = NULL); void deleteRecord(const Record *); - virtual void deleteRecord(RecordKeyList *keyList); + virtual void deleteRecord(RecordKeyVector *keyList); + + const QuickString &getFileName() const { return _filename;} bool hasHeader() const { return _fileReader->hasHeader(); } @@ -103,6 +108,7 @@ public: void setIoBufSize(int val) { _ioBufSize = val; } protected: + int _fileIdx; QuickString _filename; BufferedStreamMgr *_bufStreamMgr; diff --git a/src/utils/FileRecordTools/Records/BamRecord.cpp b/src/utils/FileRecordTools/Records/BamRecord.cpp index a55a0615e400d41b2ef3aacd84e78c2b45ae188d..9c3bd5a310e152967f5a364752c2d40d8e84425f 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.cpp +++ b/src/utils/FileRecordTools/Records/BamRecord.cpp @@ -7,7 +7,7 @@ #include "BamRecord.h" #include "BamFileReader.h" -#include "RecordKeyList.h" +#include "RecordKeyVector.h" BamRecord::BamRecord() : _bamChromId(-1) @@ -45,6 +45,7 @@ bool BamRecord::initFromFile(FileReader *fileReader) bool BamRecord::initFromFile(BamFileReader *bamFileReader) { + setFileIdx(bamFileReader->getFileIdx()); _bamAlignment = bamFileReader->getAlignment(); bamFileReader->getChrName(_chrName); @@ -116,19 +117,19 @@ void BamRecord::clear() } -void BamRecord::print(QuickString &outBuf, RecordKeyList *keyList) const +void BamRecord::print(QuickString &outBuf, RecordKeyVector *keyList) const { Bed6Interval::print(outBuf); printRemainingBamFields(outBuf, keyList); } -void BamRecord::print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const +void BamRecord::print(QuickString &outBuf, int start, int end, RecordKeyVector *keyList) const { Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); } -void BamRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const +void BamRecord::print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyVector *keyList) const { Bed6Interval::print(outBuf, start, end); printRemainingBamFields(outBuf, keyList); @@ -140,7 +141,7 @@ void BamRecord::printNull(QuickString &outBuf) const outBuf.append("\t.\t.\t.\t.\t.\t.", 12); } -void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const +void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyVector *keyList) const { outBuf.append('\t'); outBuf.append(_startPosStr); @@ -156,8 +157,8 @@ void BamRecord::printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyL vector<int> blockLengths; vector<int> blockStarts; - for (RecordKeyList::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { - const Record *block = iter->value(); + for (RecordKeyVector::const_iterator_type iter = keyList->begin(); iter != keyList->end(); iter = keyList->next()) { + const Record *block = *iter; blockLengths.push_back(block->getEndPos() - block->getStartPos()); blockStarts.push_back(block->getStartPos() - _startPos); } diff --git a/src/utils/FileRecordTools/Records/BamRecord.h b/src/utils/FileRecordTools/Records/BamRecord.h index 074e5ee1d8e5a0a7c34465d3e3fecdf9b40e3702..f3c2494863fa5e9ad480b40c291c57a4db8835ee 100644 --- a/src/utils/FileRecordTools/Records/BamRecord.h +++ b/src/utils/FileRecordTools/Records/BamRecord.h @@ -15,7 +15,7 @@ class FileReader; class BamFileReader; -class RecordKeyList; +class RecordKeyVector; class BamRecord : public Bed6Interval { public: @@ -42,11 +42,11 @@ public: using Bed6Interval::print; - virtual void print(QuickString &outBuf, int start, int end, RecordKeyList *keyList) const; - virtual void print(QuickString &outBuf, RecordKeyList *keyList) const; - virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyList *keyList) const; + virtual void print(QuickString &outBuf, int start, int end, RecordKeyVector *keyList) const; + virtual void print(QuickString &outBuf, RecordKeyVector *keyList) const; + virtual void print(QuickString &outBuf, const QuickString & start, const QuickString & end, RecordKeyVector *keyList) const; virtual void printNull(QuickString &outBuf) const; - virtual void printRemainingBamFields(QuickString &outBuf, RecordKeyList *keyList) const; + virtual void printRemainingBamFields(QuickString &outBuf, RecordKeyVector *keyList) const; virtual void printUnmapped(QuickString &outBuf) const; virtual FileRecordTypeChecker::RECORD_TYPE getType() const { return FileRecordTypeChecker::BAM_RECORD_TYPE; } diff --git a/src/utils/FileRecordTools/Records/Bed3Interval.cpp b/src/utils/FileRecordTools/Records/Bed3Interval.cpp index e31e43eafa5e5bb2a1b7eeadc9940a5332fe17b5..adc2c6e12f4058226f404d2a947345e46d0033a9 100644 --- a/src/utils/FileRecordTools/Records/Bed3Interval.cpp +++ b/src/utils/FileRecordTools/Records/Bed3Interval.cpp @@ -23,6 +23,7 @@ bool Bed3Interval::initFromFile(FileReader *fileReader) bool Bed3Interval::initFromFile(SingleLineDelimTextFileReader *fileReader) { + setFileIdx(fileReader->getFileIdx()); fileReader->getField(0, _chrName); fileReader->getField(1, _startPosStr); fileReader->getField(2, _endPosStr); diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp index 0ef322a9c31f85c8dad221e5e2ac15878b620186..e9465527d687820ad859d003616fb182946a0b07 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.cpp +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -30,7 +30,7 @@ BlockMgr::~BlockMgr() delete _blockRecordsMgr; } -void BlockMgr::getBlocks(RecordKeyList &keyList, bool &mustDelete) +void BlockMgr::getBlocks(RecordKeyVector &keyList, bool &mustDelete) { switch (keyList.getKey()->getType()) { case FileRecordTypeChecker::BED12_RECORD_TYPE: @@ -50,7 +50,7 @@ void BlockMgr::getBlocks(RecordKeyList &keyList, bool &mustDelete) -void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) +void BlockMgr::getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete) { const Bed12Interval *keyRecord = static_cast<const Bed12Interval *>(keyList.getKey()); int blockCount = keyRecord->getBlockCount(); @@ -78,7 +78,7 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) mustDelete = true; } -void BlockMgr::getBlocksFromBam(RecordKeyList &keyList, bool &mustDelete) +void BlockMgr::getBlocksFromBam(RecordKeyVector &keyList, bool &mustDelete) { const BamRecord *keyRecord = static_cast<const BamRecord *>(keyList.getKey()); const vector<BamTools::CigarOp> &cigarData = keyRecord->getCigarData(); @@ -135,26 +135,26 @@ Record *BlockMgr::allocateAndAssignRecord(const Record *keyRecord, int startPos, return record; } -int BlockMgr::getTotalBlockLength(RecordKeyList &keyList) { +int BlockMgr::getTotalBlockLength(RecordKeyVector &keyList) { int sum = 0; - for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - const Record *record = iter->value(); + for (RecordKeyVector::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + const Record *record = *iter; sum += record->getEndPos() - record->getStartPos(); } return sum; } -void BlockMgr::deleteBlocks(RecordKeyList &keyList) +void BlockMgr::deleteBlocks(RecordKeyVector &keyList) { - for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - _blockRecordsMgr->deleteRecord(iter->value()); + for (RecordKeyVector::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + _blockRecordsMgr->deleteRecord(*iter); } - keyList.clearList(); + keyList.clearVector(); } -int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList) +int BlockMgr::findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList) { bool deleteKeyBlocks = false; if (keyList.empty()) { @@ -164,8 +164,8 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); //Loop through every database record the query intersected with - for (RecordKeyList::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { - RecordKeyList hitBlocks(hitListIter->value()); + for (RecordKeyVector::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { + RecordKeyVector hitBlocks(*hitListIter); bool deleteHitBlocks = false; getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total lentgh of the bocks for the hitRecord. @@ -173,11 +173,11 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList bool hitHasOverlap = false; //loop through every block of the database record. - for (RecordKeyList::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { + for (RecordKeyVector::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { //loop through every block of the query record. - for (RecordKeyList::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { - const Record *keyBlock = keyListIter->value(); - const Record *hitBlock = hitBlockIter->value(); + for (RecordKeyVector::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { + const Record *keyBlock = *keyListIter; + const Record *hitBlock = *hitBlockIter; int maxStart = max(keyBlock->getStartPos(), hitBlock->getStartPos()); int minEnd = min(keyBlock->getEndPos(), hitBlock->getEndPos()); @@ -194,9 +194,9 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList if ((float) totalHitOverlap / (float)keyBlocksSumLength > _overlapFraction) { if (_hasReciprocal && ((float)totalHitOverlap / (float)hitBlockSumLength > _overlapFraction)) { - resultList.push_back(hitListIter->value()); + resultList.push_back(*hitListIter); } else if (!_hasReciprocal) { - resultList.push_back(hitListIter->value()); + resultList.push_back(*hitListIter); } } } diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h index 13b76fa87b299a4325d9143df012365d11ae4d64..c503df634f54a219ca4a722263f149122736654a 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.h +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -14,7 +14,7 @@ //Produce and manage seperate records for the sub-intervals inside the #include "FileRecordTypeChecker.h" -#include "RecordKeyList.h" +#include "RecordKeyVector.h" using namespace std; @@ -26,18 +26,18 @@ public: ~BlockMgr(); // Return value is the number of blocks this main record has been split into. - void getBlocks(RecordKeyList &keyList, bool &mustDelete); - void deleteBlocks(RecordKeyList &keyList); + void getBlocks(RecordKeyVector &keyList, bool &mustDelete); + void deleteBlocks(RecordKeyVector &keyList); // Get the sum of the lengths of all blocks for a record. - int getTotalBlockLength(RecordKeyList &keyList); + int getTotalBlockLength(RecordKeyVector &keyList); // Determine which hits in the hitList intersect the hits in the keyList by comparing all blocks in each // and checking that their total intersection meets any overlapFraction and reciprocal criteria compared to // the total block lengths of the hitList and keyList. All hits that pass will be in the resultList. // Return value is the number of hits in the result set. - int findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList); + int findBlockedOverlaps(RecordKeyVector &keyList, RecordKeyVector &hitList, RecordKeyVector &resultList); //these are setting options for splitting BAM records void setBreakOnDeletionOps(bool val) { _breakOnDeletionOps = val; } @@ -57,8 +57,8 @@ private: // For now, all records will be split into Bed6 records. const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; - void getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete); - void getBlocksFromBam(RecordKeyList &keyList, bool &mustDelete); + void getBlocksFromBed12(RecordKeyVector &keyList, bool &mustDelete); + void getBlocksFromBam(RecordKeyVector &keyList, bool &mustDelete); Record *allocateAndAssignRecord(const Record *keyRecord, int startPos, int endPos); diff --git a/src/utils/FileRecordTools/Records/GffRecord.cpp b/src/utils/FileRecordTools/Records/GffRecord.cpp index 21cea1dab87ea07e05e5b6331ecca4f9abe56cb5..b80d2d19b82d747670bae2955f28c2c1510fc388 100644 --- a/src/utils/FileRecordTools/Records/GffRecord.cpp +++ b/src/utils/FileRecordTools/Records/GffRecord.cpp @@ -23,6 +23,7 @@ void GffRecord::clear() bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) { + setFileIdx(fileReader->getFileIdx()); fileReader->getField(0, _chrName); fileReader->getField(3, _startPosStr); _startPos = str2chrPos(_startPosStr); diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile index f471e666f110fa39f3cc4dde99b879ec580e07f1..14e03c60810eb2460e294aba562e1f454e26302d 100644 --- a/src/utils/FileRecordTools/Records/Makefile +++ b/src/utils/FileRecordTools/Records/Makefile @@ -17,13 +17,14 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= RecordMgr.cpp RecordMgr.h Record.h Record.cpp EmptyRecord.h EmptyRecord.cpp Bed3Interval.h Bed3Interval.cpp \ +SOURCES = Record.h Record.cpp EmptyRecord.h EmptyRecord.cpp Bed3Interval.h Bed3Interval.cpp \ Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \ Bed6Interval.h Bed6Interval.cpp \ BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \ - GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp -OBJECTS= RecordMgr.o Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ - VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o StrandQueue.o + GffRecord.h GffRecord.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp \ + RecordMgr.cpp RecordMgr.h RecordList.h RecordList.cpp RecordKeyList.h RecordKeyList.cpp RecordKeyVector.h RecordKeyVector.cpp +OBJECTS= Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ + VcfRecord.o GffRecord.o BlockMgr.o StrandQueue.o RecordMgr.o RecordList.o RecordKeyList.o RecordKeyVector.o _EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -38,8 +39,9 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ + @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/RecordList.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ $(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \ - $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o + $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o \ + $(OBJ_DIR)/RecordKeyList.o $(OBJ_DIR)/RecordKeyVector.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 89544ed21f23e91d7db73ab1e6e2158edc695ae9..2499ce28cad9cbdc3fc25a7fb7c11853ba95a45d 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -3,7 +3,8 @@ #include <cstdio> Record::Record() -: _chrId(-1), +: _fileIdx(-1), + _chrId(-1), _startPos(-1), _endPos(-1), _strandVal(UNKNOWN), @@ -18,6 +19,7 @@ Record::~Record() { const Record &Record::operator=(const Record &other) { + _fileIdx = other._fileIdx; _chrName = other._chrName; _chrId = other._chrId; _startPos = other._startPos; @@ -29,6 +31,7 @@ const Record &Record::operator=(const Record &other) } void Record::clear() { + _fileIdx = -1; _chrName.clear(); _chrId = -1; _startPos = -1; @@ -72,6 +75,35 @@ bool Record::operator > (const Record &other) const { return false; } +bool Record::lessThan(const Record *other) const +{ + if (!sameChrom(other)) { + return chromBefore(other); + } + if (_startPos != other->_startPos) { + return _startPos < other->_startPos; + } + if (_endPos != other->_endPos) { + return _endPos < other->_endPos; + } + return false; +} + +bool Record::greaterThan(const Record *other) const +{ + if (!sameChrom(other)) { + return chromAfter(other); + } + if (_startPos != other->_startPos) { + return _startPos > other->_startPos; + } + if (_endPos != other->_endPos) { + return _endPos > other->_endPos; + } + return false; + +} + bool Record::sameChrom(const Record *other) const { return (_chrId == -1 || other->_chrId == -1) ? ( _chrName == other->_chrName) : (_chrId == other->_chrId); } diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index 5adbc787035c551c9d8910cb1687453469b830ab..75b7dd655de451ed0997117c00c230bc3aab26f9 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -43,6 +43,8 @@ public: virtual void setChrName(const string &chr) { _chrName = chr; } virtual void setChrName(const char *chr) { _chrName = chr; } + virtual int getFileIdx() const { return _fileIdx; } + virtual void setFileIdx(int fileIdx) { _fileIdx = fileIdx; } virtual int getChromId() const { return _chrId; } virtual void setChromId(int id) { _chrId = id; } @@ -110,7 +112,8 @@ public: virtual bool operator < (const Record &other) const; virtual bool operator > (const Record &other) const; - + virtual bool lessThan(const Record *other) const; + virtual bool greaterThan(const Record *other) const; //is this on the same chromosome as another record? bool sameChrom(const Record *other) const; @@ -136,6 +139,7 @@ public: protected: virtual ~Record(); //by making the destructor protected, only the friend class(es) can actually delete Record objects, or objects derived from Record. + int _fileIdx; //associated file the record came from QuickString _chrName; int _chrId; int _startPos; @@ -153,9 +157,13 @@ protected: bool _isMateUnmapped; }; -class RecordPtrSortFunctor { +class RecordPtrSortAscFunctor { public: - bool operator()(const Record *rec1, const Record *rec2) const { return *rec1 > *rec2; } + bool operator()(const Record *rec1, const Record *rec2) const { return *rec1 < *rec2; } }; +class RecordPtrSortDescFunctor { +public: + bool operator()(const Record *rec1, const Record *rec2) const { return *rec1 > *rec2; } +}; #endif /* RECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/RecordKeyList.h b/src/utils/FileRecordTools/Records/RecordKeyList.h index 948f677b0f6e427c3559ca60ae9e7732d7ed85a3..6c0a3fcaeff1b7a561e4ecc2efe3fe0139c2e48f 100644 --- a/src/utils/FileRecordTools/Records/RecordKeyList.h +++ b/src/utils/FileRecordTools/Records/RecordKeyList.h @@ -9,13 +9,13 @@ #define KEYLIST_H_ #include "Record.h" -#include "BTlist.h" +#include "RecordList.h" class RecordKeyList { public: typedef const Record * elemType; - typedef BTlist<elemType> listType; - typedef const BTlistNode<elemType> *const_iterator_type; + typedef RecordList listType; + typedef const RecordListNode *const_iterator_type; RecordKeyList(); RecordKeyList(elemType item); RecordKeyList(elemType item, const listType &list); @@ -45,6 +45,8 @@ public: } bool allClear() { return (_key == NULL && empty()); } + void sort() { _list.sort(); } + private: elemType _key; diff --git a/src/utils/FileRecordTools/Records/RecordKeyVector.cpp b/src/utils/FileRecordTools/Records/RecordKeyVector.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1110ffc3678806bcd2c8a91ed8e9324a055d305 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordKeyVector.cpp @@ -0,0 +1,88 @@ +/* + * RecordKeyVector.cpp + * + * Created on: Aug 1, 2014 + * Author: nek3d + */ + + +#include "RecordKeyVector.h" +#include <algorithm> + +RecordKeyVector::RecordKeyVector() +: _key(NULL), + _currPos(0) +{ +} + +RecordKeyVector::RecordKeyVector(const Record * item) +: _key(item), + _currPos(0) +{ +} + +RecordKeyVector::RecordKeyVector(const Record * item, const vecType &vec) +: _key(item), + _currPos(0) +{ + _recVec = vec; +} + +RecordKeyVector::~RecordKeyVector() { +} + +const RecordKeyVector &RecordKeyVector::operator=(const RecordKeyVector &other) +{ + setKey(other._key); + _recVec = other._recVec; + return *this; +} + +const RecordKeyVector::const_iterator_type RecordKeyVector::begin() { + _currPos = 0; + return _recVec.begin(); +} + +const RecordKeyVector::const_iterator_type RecordKeyVector::next() { + _currPos++; + return _recVec.begin() + _currPos; +} + + +const RecordKeyVector::const_iterator_type RecordKeyVector::end() { + return _recVec.end(); +} + +size_t RecordKeyVector::size() const { + return _recVec.size(); +} + +bool RecordKeyVector::empty() const { + return _recVec.empty(); +} + +void RecordKeyVector::push_back(elemType item) { + _recVec.push_back(item); +} + +const Record *RecordKeyVector::getKey() const { + return _key; +} + +void RecordKeyVector::setKey(elemType key) { + _key = key; +} + +void RecordKeyVector::setVector(const vecType &vec) { + _recVec = vec; +} + +void RecordKeyVector::clearVector() { + _recVec.clear(); +} + +void RecordKeyVector::sortVector() { + std::sort(_recVec.begin(), _recVec.end(), RecordPtrSortAscFunctor()); +} + + diff --git a/src/utils/FileRecordTools/Records/RecordKeyVector.h b/src/utils/FileRecordTools/Records/RecordKeyVector.h new file mode 100644 index 0000000000000000000000000000000000000000..a7389d52cf9c1503171cd80f898172117e2e2bbd --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordKeyVector.h @@ -0,0 +1,63 @@ +/* + * RecordKeyVector.h + * + * Created on: Dec 14, 2012 + * Author: nek3d + */ + +#ifndef KEYVECTOR_H_ +#define KEYVECTOR_H_ + + + + +using namespace std; + +#include "Record.h" +#include <vector> + +class RecordKeyVector { +public: + typedef const Record * elemType; + typedef vector<const Record *> vecType; + typedef vecType::const_iterator const_iterator_type; + RecordKeyVector(); + RecordKeyVector(elemType item); + RecordKeyVector(elemType item, const vecType &vector); + ~RecordKeyVector(); + + const RecordKeyVector &operator=(const RecordKeyVector &other); + const const_iterator_type begin(); + const const_iterator_type next(); + const const_iterator_type end(); + size_t size() const ; + bool empty() const ; //only checks whether list is empty. Doesn't check key. + void push_back(elemType item); + elemType getKey() const ; + void setKey(elemType key); + + //setVectorNoCopy will make our list share the nodes of another + //list, not copy them. + void setVector(const vecType &list); + + // WARNING! clearVector will NOT delete records pointed to by list nodes. Caller must do that separately, since the RecordKeyVector + // does not have it's own RecordMgr member. + void clearVector(); + + void clearAll() { + setKey(NULL); + clearVector(); + } + bool allClear() { return (_key == NULL && empty()); } + + void sortVector(); + + +private: + elemType _key; + vecType _recVec; + int _currPos; +}; + + +#endif /* RECORDKEYLIST_H_ */ diff --git a/src/utils/FileRecordTools/Records/RecordList.cpp b/src/utils/FileRecordTools/Records/RecordList.cpp new file mode 100644 index 0000000000000000000000000000000000000000..142706fd312ece5fa6b2641ba9488a929915be1f --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordList.cpp @@ -0,0 +1,202 @@ +/* + * RecordList.cpp + * + * Created on: Jun 19, 2014 + * Author: nek3d + */ + +#include "RecordList.h" + +RecordList::RecordList() : + _begin(NULL), + _currEnd(NULL), + _prevCursor(NULL), + _size(0), + _dontDelete(false) +{ +} + +void RecordList::pop_front() { + if (empty()) { + return; + } + RecordListNode *killNode = _begin; + if (_begin->_next == NULL) { //this is the only item. List size is 1. + delete killNode; + _begin = NULL; + _currEnd = NULL; + _prevCursor = NULL; + _size = 0; + return; + } + if (_prevCursor == _begin) { + _prevCursor = _begin->_next; + } + _begin = _begin->next(); + delete killNode; + _size--; +} + +const RecordListNode *RecordList::next() { + if (_prevCursor == NULL) { + _prevCursor = _begin; + return _begin->_next; + } + _prevCursor = _prevCursor->_next; + return _prevCursor->_next; +} + + +RecordListNode *RecordList::deleteCurrent() { + //delete what the current cursor is pointing to, + //meaning whatever the prevCursor's next member points to. + RecordListNode *returnNode = NULL; + RecordListNode *killNode = NULL; + if (_prevCursor == NULL) { + //deleting first item in list. + killNode = _begin; + _begin = _begin->_next; + returnNode = _begin; + + } else { + killNode = _prevCursor->_next; + _prevCursor->_next = _prevCursor->_next->_next; + returnNode = _prevCursor->_next; + } + if (_currEnd == killNode) { + //deleting last item in list + _currEnd = _prevCursor; //back up the current end. + } + delete killNode; + _size--; + + return returnNode; +} + +void RecordList::push_back(const Record * &val) { + RecordListNode *newNode = new RecordListNode(val); + if (empty()) { + _begin = newNode; + _currEnd = newNode; + _prevCursor = NULL; + } else { + _currEnd->_next = newNode; + _prevCursor = _currEnd; + _currEnd = newNode; + } + _size++; +} + + +void RecordList::clear() { + if (_dontDelete) { + return; + } + //delete all nodes, set cursors to NULL, set size to 0. + RecordListNode *killNode = _begin; + while (killNode != NULL) { + RecordListNode *nextNode = killNode->_next; + delete killNode; + killNode = nextNode; + } + _begin = NULL; + _currEnd = NULL; + _prevCursor = NULL; + _size = 0; +} + +const RecordList &RecordList::operator=(const RecordList &other) { + //need to make copies of all nodes and assign values + clear(); + + RecordListNode *node = other._begin; + while (node != NULL) { + push_back(node->_val); + node = node->_next; + } + return *this; +} +void RecordList::assignNoCopy(RecordList &other) { + _begin = other._begin; + _size = other._size; + _currEnd = other._currEnd; + _prevCursor = other._prevCursor; + _dontDelete = true; +} + + +void RecordList::mergeSort(RecordListNode **headRef) { + RecordListNode *head = *headRef; + RecordListNode *a; + RecordListNode *b; + + /* Base case -- length 0 or 1 */ + if ((head == NULL) || (head->_next == NULL)) { + return; + } + + /* Split head into 'a' and 'b' sublists */ + frontBackSplit(head, &a, &b); + + /* Recursively sort the sublists */ + mergeSort(&a); + mergeSort(&b); + + /* answer = merge the two sorted lists together */ + *headRef = sortedMerge(a, b); +} + +RecordListNode* RecordList::sortedMerge(RecordListNode * a, RecordListNode * b) { + RecordListNode *result = NULL; + + /* Base cases */ + if (a == NULL) return(b); + else if (b==NULL) return(a); + + /* Pick either a or b, and recur */ + if (a->_val->lessThan(b->_val)) { + result = a; + result->_next = sortedMerge(a->_next, b); + } else { + result = b; + result->_next = sortedMerge(a, b->_next); + } + return(result); +} + +/* URecord *ILIRecord *Y FUNCRecord *IONS */ +/* Split the nodes of the given list into front and back halves, + and return the two lists using the reference parameters. + If the length is odd, the extra node should go in the front list. + Uses the fast/slow pointer strategy. */ +void RecordList::frontBackSplit(RecordListNode* source, + RecordListNode** frontRef, RecordListNode** backRef) { + RecordListNode *fast; + RecordListNode *slow; + if (source==NULL || source->_next==NULL) { + /* length < 2 cases */ + *frontRef = source; + *backRef = NULL; + } else { + slow = source; + fast = source->_next; + + /* Advance 'fast' two nodes, and advance 'slow' one node */ + while (fast != NULL) { + fast = fast->_next; + if (fast != NULL) { + slow = slow->_next; + fast = fast->_next; + } + } + + /* 'slow' is before the midpoint in the list, so split it in two + at that point. */ + *frontRef = source; + *backRef = slow->_next; + slow->_next = NULL; + } +} + + + diff --git a/src/utils/FileRecordTools/Records/RecordList.h b/src/utils/FileRecordTools/Records/RecordList.h new file mode 100644 index 0000000000000000000000000000000000000000..e1b24b562ec08797baeeaa664eec0749b539c223 --- /dev/null +++ b/src/utils/FileRecordTools/Records/RecordList.h @@ -0,0 +1,117 @@ +/* + * RecordListPtr.h + * + * Created on: Jun 19, 2014 + * Author: nek3d + */ + +#ifndef RECORD_LIST_H_ +#define RECORD_LIST_H_ + + +// Writing our own singly linked list class because: +// +// 1) SRecord *L doesn't currently support one that I can find without +// C++0x support or SGI extensions. +// +// 2) A normal singly linked list doesn't support constant time +// insertion or deletion at the end. It also doesn't track +// your current position or let you delete at that position, +// but this does. +// +// 3) A normal SRecord *L list is doubly-linked, which would do all +// these things, but more slowly and with more memory. +// +// 4) I can. + + +#include "FreeList.h" +#include "QuickString.h" +#include <cstring> //for memset +#include "Record.h" + +class RecordListNode { +public: + friend class RecordList; + + RecordListNode() : _next(NULL){} + RecordListNode(const Record * val) : _val(val), _next(NULL) {} + const Record * value() const { return _val; } + const RecordListNode *next() const { return _next; } + RecordListNode *next() { return _next; } + bool hasNext() const { return _next != NULL; } + void clear() { + _next = NULL; + _val = NULL; // WARNING! Since _val is a pointer, + // the user is responsible for any needed deallocation. + } + +private: + + const Record * _val; + RecordListNode *_next; +}; + +class RecordList { +public: + RecordList(); + + ~RecordList() { clear(); } + + typedef const RecordListNode * const_iterator_type; + + bool empty() const { return _begin == NULL; } + size_t size() const { return _size; } + + const RecordListNode *front() const { return _begin;} + RecordListNode *back() const { return _currEnd; } + void pop_front(); + const RecordListNode *begin() { + _prevCursor = NULL; + return _begin; + } + + const RecordListNode *begin() const { return _begin; } + const RecordListNode *next(); + const RecordListNode *end() const { return NULL; } + + RecordListNode * deleteCurrent(); + void push_back(const Record * &val); + + void clear(); + const RecordList &operator=(const RecordList &other); + void assignNoCopy(RecordList &other); + + void sort() { mergeSort(&_begin); } + +private: + RecordListNode *_begin; + //keep a cursor to the last element for constant time push_back and pop_back functions. + RecordListNode *_currEnd; + RecordListNode *_prevCursor; + size_t _size; + + bool _dontDelete; //this will usally be false, but set to true when + //calling assignNoCopy, as our list will actually just be a pointer + //to another list. + + + + // The rest of this is just helper methods for sorting. + // We'll use a mergeSort for optimal time / space efficiency. + // The following methods are copied almost verbatim from + // http://www.geeksforgeeks.org/merge-sort-for-linked-list/ + + void mergeSort(RecordListNode **headRef); + RecordListNode* sortedMerge(RecordListNode * a, RecordListNode * b); + + /* Split the nodes of the given list into front and back halves, + and return the two lists using the reference parameters. + If the length is odd, the extra node should go in the front list. + Uses the fast/slow pointer strategy. */ + void frontBackSplit(RecordListNode* source, + RecordListNode** frontRef, RecordListNode** backRef); +}; + + +#endif /* RECORD_LIST_H_ */ diff --git a/src/utils/FileRecordTools/Records/StrandQueue.h b/src/utils/FileRecordTools/Records/StrandQueue.h index bd02b4090fc4577bc85425f27bcba8019b0f8cf5..f20e1a1c1454e3928076330d92e27f80ddfdcc33 100644 --- a/src/utils/FileRecordTools/Records/StrandQueue.h +++ b/src/utils/FileRecordTools/Records/StrandQueue.h @@ -30,7 +30,7 @@ public: private: // static RecordPtrSortFunctor _recSortFunctor; - typedef priority_queue<Record *, vector<const Record *>, RecordPtrSortFunctor > queueType; + typedef priority_queue<Record *, vector<const Record *>, RecordPtrSortDescFunctor > queueType; vector<queueType *> _queues; static const int NUM_QUEUES = 3; diff --git a/src/utils/FileRecordTools/Records/VcfRecord.cpp b/src/utils/FileRecordTools/Records/VcfRecord.cpp index 20cbb6f7cf881f8af7a3f468cb73ce96e21210dc..b3ac4666a89fde61db2e1904512638734450ab2d 100644 --- a/src/utils/FileRecordTools/Records/VcfRecord.cpp +++ b/src/utils/FileRecordTools/Records/VcfRecord.cpp @@ -10,6 +10,7 @@ bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) { + setFileIdx(fileReader->getFileIdx()); fileReader->getField(0, _chrName); _chrId = fileReader->getCurrChromdId(); fileReader->getField(1, _startPosStr); @@ -32,8 +33,8 @@ bool VcfRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) void VcfRecord::clear() { BedPlusInterval::clear(); - _varAlt.clear(); - _varRef.clear(); + _varAlt.release(); + _varRef.release(); } void VcfRecord::print(QuickString &outBuf) const { diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp index 84e44cbed97dcdbd7c0fdb760dc02edca634bad8..8d7af4d88abf514f6e0570d1210ebda7cb007e5c 100644 --- a/src/utils/GenomeFile/NewGenomeFile.cpp +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -23,12 +23,20 @@ NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) NewGenomeFile::NewGenomeFile(const BamTools::RefVector &refVector) : _maxId(-1) { - for (size_t i = 0; i < refVector.size(); ++i) { + size_t i = 0; + for (; i < refVector.size(); ++i) { QuickString chrom = refVector[i].RefName; CHRPOS length = refVector[i].RefLength; _maxId++; _chromSizeIds[chrom] = pair<CHRPOS, CHRPOS>(length, _maxId); + _chromList.push_back(chrom); } + // Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string. + // Add in an empty chrom so these don't error. + _maxId++; + _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId); + _chromList.push_back(""); + } // Destructor @@ -69,6 +77,13 @@ void NewGenomeFile::loadGenomeFileIntoMap() { cerr << "Error: The genome file " << _genomeFileName << " has no valid entries. Exiting." << endl; exit(1); } + // Special: BAM files can have unmapped reads, which show as no chromosome, or an empty chrom string. + // Add in an empty chrom so these don't error. + _maxId++; + _chromSizeIds[""] = pair<CHRPOS, int>(0, _maxId); + _chromList.push_back(""); + + _startOffsets.push_back(_genomeLength); //insert the final length as the last element //to help with the lower_bound call in the projectOnGenome method. genFile.close(); diff --git a/src/utils/GenomeFile/NewGenomeFile.h b/src/utils/GenomeFile/NewGenomeFile.h index 99a97fab46d9f63d7548ecb8efecb9c56b1dad72..7b804a6c51f06daef36bfc8e135d542df1e8a483 100644 --- a/src/utils/GenomeFile/NewGenomeFile.h +++ b/src/utils/GenomeFile/NewGenomeFile.h @@ -38,7 +38,7 @@ public: CHRPOS getChromSize(const QuickString &chrom); // return the size of a chromosome CHRPOS getChromId(const QuickString &chrom); // return chromosome's sort order const vector<QuickString> &getChromList() const { return _chromList; } // return a list of chrom names - CHRPOS getNumberOfChroms() const { return _chromList.size(); } + CHRPOS getNumberOfChroms() const { return _chromList.size() -1; }//the -1 excludes the blank chrom added for unmapped reads const QuickString &getGenomeFileName() const { return _genomeFileName; } diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp index 15ec3fda48f3b2579721d40a5520e0bbad97c2bf..531654896363d82a5909f62edf0d5928f03e8b5b 100644 --- a/src/utils/KeyListOps/KeyListOps.cpp +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -154,7 +154,7 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { return true; } -const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) +const QuickString & KeyListOps::getOpVals(RecordKeyVector &hits) { //loop through all requested columns, and for each one, call the method needed //for the operation specified. diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h index b2137d87d8f1e80b4389e7291169d50823f2838d..8262677cd133fe4122be1481c7bb1916e4034e85 100644 --- a/src/utils/KeyListOps/KeyListOps.h +++ b/src/utils/KeyListOps/KeyListOps.h @@ -40,7 +40,7 @@ public: const QuickString &getNullValue() { return _methods.getNullValue(); } const QuickString &getDelimStr() { return _methods.getDelimStr(); } - void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); } + void setKeyList(RecordKeyVector *keyList) { _methods.setKeyList(keyList); } typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, DISTINCT_ONLY, COLLAPSE, CONCAT, FREQ_ASC, FREQ_DESC, FIRST, LAST, INVALID } OP_TYPES; @@ -48,7 +48,7 @@ public: void setDBfileType(FileRecordTypeChecker::FILE_TYPE type) { _dbFileType = type; } bool isValidColumnOps(FileRecordMgr *dbFile); - const QuickString &getOpVals(RecordKeyList &hits); + const QuickString &getOpVals(RecordKeyVector &hits); private: void init(); diff --git a/src/utils/KeyListOps/KeyListOpsMethods.cpp b/src/utils/KeyListOps/KeyListOpsMethods.cpp index cd649faae463f7d8ad5d49a86444ee3205241484..028471ae791ce2ed29e2b334de5098ec51f72b77 100644 --- a/src/utils/KeyListOps/KeyListOpsMethods.cpp +++ b/src/utils/KeyListOps/KeyListOpsMethods.cpp @@ -22,7 +22,7 @@ KeyListOpsMethods::KeyListOpsMethods() { } -KeyListOpsMethods::KeyListOpsMethods(RecordKeyList *keyList, int column) +KeyListOpsMethods::KeyListOpsMethods(RecordKeyVector *keyList, int column) : _keyList(keyList), _column(column), _nullVal("."), @@ -313,13 +313,13 @@ const QuickString &KeyListOpsMethods::getLast() { } const QuickString &KeyListOpsMethods::getColVal() { - const QuickString &retVal = _iter->value()->getField(_column); + const QuickString &retVal = (*_iter)->getField(_column); if (_isBam && retVal.empty()) return _nullVal; return retVal; } double KeyListOpsMethods::getColValNum() { - const QuickString &strVal = _iter->value()->getField(_column); + const QuickString &strVal = (*_iter)->getField(_column); if (!isNumeric(strVal)) { _nonNumErrFlag = true; _errMsg = " ***** WARNING: Non numeric value "; diff --git a/src/utils/KeyListOps/KeyListOpsMethods.h b/src/utils/KeyListOps/KeyListOpsMethods.h index 16796858841eea0524015dd9c0073e641c5ecab6..fbf5101214a4c7ae749f2a4298e6981c332be738 100644 --- a/src/utils/KeyListOps/KeyListOpsMethods.h +++ b/src/utils/KeyListOps/KeyListOpsMethods.h @@ -12,18 +12,18 @@ #include <utility> //for pair #include "QuickString.h" #include <stdint.h> -#include "RecordKeyList.h" +#include "RecordKeyVector.h" using namespace std; class KeyListOpsMethods { public: KeyListOpsMethods(); - KeyListOpsMethods(RecordKeyList *keyList, int column = 1); + KeyListOpsMethods(RecordKeyVector *keyList, int column = 1); ~KeyListOpsMethods(); void setIsBam(bool isBam) { _isBam = isBam; } - void setKeyList(RecordKeyList *keyList) { _keyList = keyList; } + void setKeyList(RecordKeyVector *keyList) { _keyList = keyList; } void setColumn(int col) { _column = col; } void setNullValue(const QuickString & nullVal) { _nullVal = nullVal; } const QuickString &getNullValue() const { return _nullVal; } @@ -81,14 +81,14 @@ public: } private: - RecordKeyList *_keyList; + RecordKeyVector *_keyList; int _column; QuickString _nullVal; QuickString _delimStr; QuickString _retStr; - RecordKeyList _nullKeyList; //this has to exist just so we can initialize _iter, below. - RecordKeyList::const_iterator_type _iter; + RecordKeyVector _nullKeyList; //this has to exist just so we can initialize _iter, below. + RecordKeyVector::const_iterator_type _iter; // Some methods need to put values into a vector, mostly for sorting. vector<double> _numArray; diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp index 34647d678c448ccdfefbe32adcd856a81476b58b..f7561e8c139087b72a1755efe2abf7373088179a 100644 --- a/src/utils/NewChromsweep/NewChromsweep.cpp +++ b/src/utils/NewChromsweep/NewChromsweep.cpp @@ -18,32 +18,35 @@ NewChromSweep::NewChromSweep(ContextIntersect *context, bool useMergedIntervals) : _context(context), _queryFRM(NULL), - _databaseFRM(NULL), - _useMergedIntervals(false), + _numDBs(_context->getNumDatabaseFiles()), + _useMergedIntervals(false), _queryRecordsTotalLength(0), _databaseRecordsTotalLength(0), _wasInitialized(false), _currQueryRec(NULL), - _currDatabaseRec(NULL), _runToQueryEnd(false) - { } bool NewChromSweep::init() { - //Create new FileRecordMgrs for the two input files. + //Create new FileRecordMgrs for the input files. //Open them, and get the first record from each. - //if any of that goes wrong, return false; //otherwise, return true. _queryFRM = _context->getFile(_context->getQueryFileIdx()); - _databaseFRM = _context->getFile(_context->getDatabaseFileIdx()); - nextRecord(false); -// if (_currDatabaseRec == NULL) { -// return false; -// } + _dbFRMs.resize(_numDBs, NULL); + for (int i=0; i < _numDBs; i++) { + _dbFRMs[i] = _context->getDatabaseFile(i); + } + + _currDbRecs.resize(_numDBs, NULL); + for (int i=0; i < _numDBs; i++) { + nextRecord(false, i); + } + + _caches.resize(_numDBs); //determine whether to stop when the database end is hit, or keep going until the //end of the query file is hit as well. @@ -59,9 +62,12 @@ void NewChromSweep::closeOut() { while (!_queryFRM->eof()) { nextRecord(true); } - while (!_databaseFRM->eof()) { - nextRecord(false); - } + + for (int i=0; i < _numDBs; i++) { + while (!_dbFRMs[i]->eof()) { + nextRecord(false, i); + } + } } NewChromSweep::~NewChromSweep(void) { @@ -71,136 +77,147 @@ NewChromSweep::~NewChromSweep(void) { _queryFRM->deleteRecord(_currQueryRec); _currQueryRec = NULL; - _databaseFRM->deleteRecord(_currDatabaseRec); - _currDatabaseRec = NULL; + + for (int i=0; i < _numDBs; i++) { + _dbFRMs[i]->deleteRecord(_currDbRecs[i]); + _currDbRecs[i] = NULL; + } _queryFRM->close(); - _databaseFRM->close(); + + for (int i=0; i < _numDBs; i++) { + _dbFRMs[i]->close(); + } } -void NewChromSweep::scanCache() { - recListIterType cacheIter = _cache.begin(); - while (cacheIter != _cache.end()) +void NewChromSweep::scanCache(int dbIdx, RecordKeyVector &retList) { + recListIterType cacheIter = _caches[dbIdx].begin(); + while (cacheIter != _caches[dbIdx].end()) { const Record *cacheRec = cacheIter->value(); if (_currQueryRec->sameChrom(cacheRec) && !(_currQueryRec->after(cacheRec))) { if (intersects(_currQueryRec, cacheRec)) { - _hits.push_back(cacheRec); - } - cacheIter = _cache.next(); + retList.push_back(cacheRec); + } else break; // cacheRec is after the query rec, stop scanning. + cacheIter = _caches[dbIdx].next(); } else { - cacheIter = _cache.deleteCurrent(); - _databaseFRM->deleteRecord(cacheRec); + cacheIter = _caches[dbIdx].deleteCurrent(); + _dbFRMs[dbIdx]->deleteRecord(cacheRec); } } } -void NewChromSweep::clearCache() +void NewChromSweep::clearCache(int dbIdx) { //delete all objects pointed to by cache - for (recListIterType iter = _cache.begin(); iter != _cache.end(); iter = _cache.next()) { - _databaseFRM->deleteRecord(iter->value()); + recListType &cache = _caches[dbIdx]; + for (recListIterType iter = cache.begin(); iter != cache.end(); iter = cache.next()) { + _dbFRMs[dbIdx]->deleteRecord(iter->value()); } - _cache.clear(); + cache.clear(); } -bool NewChromSweep::chromChange() +void NewChromSweep::masterScan(RecordKeyVector &retList) { + for (int i=0; i < _numDBs; i++) { + if (dbFinished(i) || chromChange(i, retList)) { + continue; + } else { + + // scan the database cache for hits + scanCache(i, retList); + //skip if we hit the end of the DB + // advance the db until we are ahead of the query. update hits and cache as necessary + while (_currDbRecs[i] != NULL && + _currQueryRec->sameChrom(_currDbRecs[i]) && + !(_currDbRecs[i]->after(_currQueryRec))) { + if (intersects(_currQueryRec, _currDbRecs[i])) { + retList.push_back(_currDbRecs[i]); + } + if (_currQueryRec->after(_currDbRecs[i])) { + _dbFRMs[i]->deleteRecord(_currDbRecs[i]); + _currDbRecs[i] = NULL; + } else { + _caches[i].push_back(_currDbRecs[i]); + _currDbRecs[i] = NULL; + } + nextRecord(false, i); + } + } + } +} + +bool NewChromSweep::chromChange(int dbIdx, RecordKeyVector &retList) { // the files are on the same chrom - if (_currDatabaseRec == NULL || _currQueryRec->sameChrom(_currDatabaseRec)) { + if (_currDbRecs[dbIdx] == NULL || _currQueryRec->sameChrom(_currDbRecs[dbIdx])) { return false; } // the query is ahead of the database. fast-forward the database to catch-up. - if (_currQueryRec->chromAfter(_currDatabaseRec)) { + if (_currQueryRec->chromAfter(_currDbRecs[dbIdx])) { - while (_currDatabaseRec != NULL && - _currQueryRec->chromAfter(_currDatabaseRec)) { - _databaseFRM->deleteRecord(_currDatabaseRec); - nextRecord(false); + while (_currDbRecs[dbIdx] != NULL && + _currQueryRec->chromAfter(_currDbRecs[dbIdx])) { + _dbFRMs[dbIdx]->deleteRecord(_currDbRecs[dbIdx]); + nextRecord(false, dbIdx); } - clearCache(); + clearCache(dbIdx); return false; } // the database is ahead of the query. else { // 1. scan the cache for remaining hits on the query's current chrom. - if (_currQueryRec->getChrName() == _currChromName) - { - scanCache(); - } - // 2. fast-forward until we catch up and report 0 hits until we do. - else if (_currQueryRec->chromBefore(_currDatabaseRec)) - { - clearCache(); - } + scanCache(dbIdx, retList); return true; } + + //control can't reach here, but compiler still wants a return statement. + return true; } -bool NewChromSweep::next(RecordKeyList &next) { +bool NewChromSweep::next(RecordKeyVector &retList) { + retList.clearVector(); if (_currQueryRec != NULL) { _queryFRM->deleteRecord(_currQueryRec); } - nextRecord(true); - if (_currQueryRec == NULL) { //eof hit! - return false; - } - if (_currDatabaseRec == NULL && _cache.empty() && !_runToQueryEnd) { + if (!nextRecord(true)) return false; // query EOF hit + + + if (allCurrDBrecsNull() && allCachesEmpty() && !_runToQueryEnd) { return false; } - _hits.clear(); _currChromName = _currQueryRec->getChrName(); - // have we changed chromosomes? - if (!chromChange()) { - // scan the database cache for hits - scanCache(); - //skip if we hit the end of the DB - // advance the db until we are ahead of the query. update hits and cache as necessary - while (_currDatabaseRec != NULL && - _currQueryRec->sameChrom(_currDatabaseRec) && - !(_currDatabaseRec->after(_currQueryRec))) { - if (intersects(_currQueryRec, _currDatabaseRec)) { - _hits.push_back(_currDatabaseRec); - } - if (_currQueryRec->after(_currDatabaseRec)) { - _databaseFRM->deleteRecord(_currDatabaseRec); - _currDatabaseRec = NULL; - } else { - _cache.push_back(_currDatabaseRec); - _currDatabaseRec = NULL; - } - nextRecord(false); - } + + masterScan(retList); + + if (_context->getSortOutput()) { + retList.sortVector(); } - next.setKey(_currQueryRec); - next.setListNoCopy(_hits); + + retList.setKey(_currQueryRec); return true; } -void NewChromSweep::nextRecord(bool query) { +bool NewChromSweep::nextRecord(bool query, int dbIdx) { if (query) { -// if (!_context->getUseMergedIntervals()) { - _currQueryRec = _queryFRM->getNextRecord(); -// } else { -// _currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); -// } + _currQueryRec = _queryFRM->getNextRecord(); if (_currQueryRec != NULL) { _queryRecordsTotalLength += (unsigned long)(_currQueryRec->getEndPos() - _currQueryRec->getStartPos()); + return true; } + return false; } else { //database -// if (!_context->getUseMergedIntervals()) { - _currDatabaseRec = _databaseFRM->getNextRecord(); -// } else { -// _currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); -// } - if (_currDatabaseRec != NULL) { - _databaseRecordsTotalLength += (unsigned long)(_currDatabaseRec->getEndPos() - _currDatabaseRec->getStartPos()); + Record *rec = _dbFRMs[dbIdx]->getNextRecord(); + _currDbRecs[dbIdx] = rec; + if (rec != NULL) { + _databaseRecordsTotalLength += (unsigned long)(rec->getEndPos() - rec->getStartPos()); + return true; } + return false; } } @@ -209,3 +226,29 @@ bool NewChromSweep::intersects(const Record *rec1, const Record *rec2) const return rec1->sameChromIntersects(rec2, _context->getSameStrand(), _context->getDiffStrand(), _context->getOverlapFraction(), _context->getReciprocal()); } + + +bool NewChromSweep::allCachesEmpty() { + for (int i=0; i < _numDBs; i++) { + if (!_caches[i].empty()) { + return false; + } + } + return true; +} + +bool NewChromSweep::allCurrDBrecsNull() { + for (int i=0; i < _numDBs; i++) { + if (_currDbRecs[i] != NULL) { + return false; + } + } + return true; +} + +bool NewChromSweep::dbFinished(int dbIdx) { + if (_currDbRecs[dbIdx] == NULL && _caches[dbIdx].empty()) { + return true; + } + return false; +} diff --git a/src/utils/NewChromsweep/NewChromsweep.h b/src/utils/NewChromsweep/NewChromsweep.h index 480b399ce0af0fecbf4c7cbe00662ac8d425aafe..1dfe5b7bc543aa14eb42ad2803e943df8fd0862c 100644 --- a/src/utils/NewChromsweep/NewChromsweep.h +++ b/src/utils/NewChromsweep/NewChromsweep.h @@ -1,5 +1,5 @@ /***************************************************************************** - NewChromsweepBed.h + NewChromsweep.h (c) 2009 - Aaron Quinlan Hall Laboratory @@ -15,6 +15,7 @@ #include <string> #include "BTlist.h" #include "RecordKeyList.h" +#include "RecordKeyVector.h" #include <queue> #include <iostream> #include <fstream> @@ -34,61 +35,67 @@ public: ~NewChromSweep(void); bool init(); - typedef BTlist<const Record *> recListType; - typedef const BTlistNode<const Record *> * recListIterType; + typedef RecordList recListType; + typedef const RecordListNode *recListIterType; // loads next (a pair) with the current query and it's overlaps - bool next(RecordKeyList &next); + bool next(RecordKeyVector &next); - //MUST call this method after sweep if you want the - //getTotalRecordLength methods to return the whole length of the - //record files, rather than just what was used by sweep. + // NOTE! You MUST call this method after sweep if you want the + // getTotalRecordLength methods to return the whole length of the + // record files, rather than just what was used by sweep. void closeOut(); + unsigned long getQueryTotalRecordLength() { return _queryRecordsTotalLength; } unsigned long getDatabaseTotalRecordLength() { return _databaseRecordsTotalLength; } - // Usage: - // NewChromSweep sweep = NewChromSweep(queryFileName, databaseFileName); - // RecordKeyList hits; - // while (sweep.next(hits)) - // { - // processHits(hits); - // } - // getQueryTotalRecordLength() - // getDatabaseTotalRecordLength() - private: ContextIntersect *_context; FileRecordMgr *_queryFRM; - FileRecordMgr *_databaseFRM; + int _numDBs; //don't really need this stored, but here for code brevity. + vector<FileRecordMgr *> _dbFRMs; bool _useMergedIntervals; unsigned long _queryRecordsTotalLength; + vector<unsigned long> _dbFileRecordsLength; //each value in this vector have the + //length of all records in the corresponding db file. + unsigned long _databaseRecordsTotalLength; bool _wasInitialized; // a cache of still active features from the database file - recListType _cache; +// typedef enum { BEFORE_QUERY, NEAR_QUERY, AFTER_QUERY, EOF } cacheStatusType; +// vector <pair<cacheStatusType, recListType> >_caches; + + vector <recListType>_caches; // the set of hits in the database for the current query - recListType _hits; +// recListType _hits; // the current query and db features. - Record * _currQueryRec; - Record *_currDatabaseRec; + const Record * _currQueryRec; + vector<const Record *> _currDbRecs; + // a cache of the current chrom from the query. used to handle chrom changes. QuickString _currChromName; bool _runToQueryEnd; - void nextRecord(bool query); //true fetches next query record, false fetches next db record. - void nextDatabase(); + void masterScan(RecordKeyVector &retList); + + bool nextRecord(bool query, int dbIdx = -1); //true fetches next query record, false fetches next db record. - void scanCache(); - void clearCache(); - bool chromChange(); + void scanCache(int dbIdx, RecordKeyVector &retList); + void clearCache(int dbIdx); + bool chromChange(int dbIdx, RecordKeyVector &retList); + + bool dbFinished(int dbIdx); + bool intersects(const Record *rec1, const Record *rec2) const; + bool allCachesEmpty(); + bool allCurrDBrecsNull(); + }; #endif /* NewChromSweep_H */ diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp index b7b2f9a456878783864c85b97bcbd3c667c27536..f4d80741c09bab8e7b7d6db455dd59ffcd3307d6 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp @@ -75,7 +75,7 @@ void RecordOutputMgr::init(ContextBase *context) { } } -bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { +bool RecordOutputMgr::printKeyAndTerminate(RecordKeyVector &keyList) { if (_context->getProgram() == ContextBase::MERGE) { //when printing merged records, we want to force the printing into //bed3 format, which is surprisingly difficult to do. Had to use the following: @@ -95,7 +95,7 @@ bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { } -RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &keyList, bool bamOutputOnly) +RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyVector &keyList, bool bamOutputOnly) { const Record *record = keyList.getKey(); if (record->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { @@ -118,7 +118,7 @@ RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &key void RecordOutputMgr::printRecord(const Record *record) { - RecordKeyList keyList(record); + RecordKeyVector keyList(record); printRecord(keyList); } @@ -137,9 +137,9 @@ void RecordOutputMgr::printRecord(const Record *record, const QuickString & valu } } -void RecordOutputMgr::printRecord(RecordKeyList &keyList) { +void RecordOutputMgr::printRecord(RecordKeyVector &keyList) { if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { - RecordKeyList blockList(keyList.getKey()); + RecordKeyVector blockList(keyList.getKey()); bool deleteBlocks = false; _bamBlockMgr->getBlocks(blockList, deleteBlocks); printRecord(keyList, &blockList); @@ -152,7 +152,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList) { } -void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockList) +void RecordOutputMgr::printRecord(RecordKeyVector &keyList, RecordKeyVector *blockList) { if (needsFlush()) { flush(); @@ -198,8 +198,8 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi return; } int hitIdx = 0; - for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - reportOverlapDetail(keyList.getKey(), iter->value(), hitIdx); + for (RecordKeyVector::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { + reportOverlapDetail(keyList.getKey(), *iter, hitIdx); hitIdx++; } } @@ -306,6 +306,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record (static_cast<ContextIntersect *>(_context))->getWriteB()) || (static_cast<ContextIntersect *>(_context))->getLeftJoin()) { printKey(keyRecord); tab(); + addDbFileId(hitRecord->getFileIdx()); hitRecord->print(_outBuf); newline(); if (needsFlush()) flush(); @@ -318,6 +319,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record else if ((static_cast<ContextIntersect *>(_context))->getWriteB()) { printKey(keyRecord, *startStr, *endStr); tab(); + addDbFileId(hitRecord->getFileIdx()); hitRecord->print(_outBuf); newline(); if (needsFlush()) flush(); @@ -331,6 +333,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record } printKey(keyRecord); tab(); + addDbFileId(hitRecord->getFileIdx()); hitRecord->print(_outBuf); tab(); int2str(printOverlapBases, _outBuf, true); @@ -339,7 +342,7 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record } } -void RecordOutputMgr::reportOverlapSummary(RecordKeyList &keyList) +void RecordOutputMgr::reportOverlapSummary(RecordKeyVector &keyList) { int numOverlapsFound = (int)keyList.size(); if ((static_cast<ContextIntersect *>(_context))->getAnyHit() && numOverlapsFound > 0) { @@ -365,6 +368,17 @@ void RecordOutputMgr::reportOverlapSummary(RecordKeyList &keyList) } } +void RecordOutputMgr::addDbFileId(int fileId) { + if ((static_cast<ContextIntersect *>(_context))->getNumDatabaseFiles() == 1) return; + if (!_context->getUseDBnameTags() && (!_context->getUseDBfileNames())) { + _outBuf.append(fileId); + } else if (_context->getUseDBnameTags()){ + _outBuf.append((static_cast<ContextIntersect *>(_context))->getDatabaseNameTag((static_cast<ContextIntersect *>(_context))->getDbIdx(fileId))); + } else { + _outBuf.append(_context->getInputFileName(fileId)); + } + tab(); +} void RecordOutputMgr::null(bool queryType, bool dbType) { @@ -373,7 +387,7 @@ void RecordOutputMgr::null(bool queryType, bool dbType) if (queryType) { recordType = (static_cast<ContextIntersect *>(_context))->getQueryRecordType(); } else if (dbType) { - recordType = (static_cast<ContextIntersect *>(_context))->getDatabaseRecordType(); + recordType = (static_cast<ContextIntersect *>(_context))->getDatabaseRecordType(0); } } else { recordType = _context->getFile(0)->getRecordType(); diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.h b/src/utils/RecordOutputMgr/RecordOutputMgr.h index 1a7fe609835e2e87fcbbf0e13e662362430e68d7..33ad8a11bcff7920f67601a00dffd9b07fafa509 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.h +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.h @@ -9,7 +9,7 @@ #define RECORDOUTPUTMGR_H_ #include "ContextBase.h" -#include "RecordKeyList.h" +#include "RecordKeyVector.h" #include "api/BamWriter.h" class BlockMgr; @@ -23,7 +23,7 @@ public: void init(ContextBase *context); void printRecord(const Record *record); - void printRecord(RecordKeyList &keyList); + void printRecord(RecordKeyVector &keyList); void printRecord(const Record *record, const QuickString & value); //where necessary, pass additional information about splits through the blockMgr. @@ -35,7 +35,7 @@ private: ContextBase *_context; bool _printable; BamTools::BamWriter *_bamWriter; - RecordKeyList *_currBamBlockList; + RecordKeyVector *_currBamBlockList; QuickString _outBuf; @@ -48,14 +48,15 @@ private: void newline() { _outBuf.append('\n'); } void null(bool queryType, bool dbType); - void printRecord(RecordKeyList &keyList, RecordKeyList *blockList); + void printRecord(RecordKeyVector &keyList, RecordKeyVector *blockList); void printKey(const Record *key); void printKey(const Record *key, const QuickString & start, const QuickString & end); - bool printKeyAndTerminate(RecordKeyList &keyList); - printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); + void addDbFileId(int fileId); + bool printKeyAndTerminate(RecordKeyVector &keyList); + printBamType printBamRecord(RecordKeyVector &keyList, bool bamOutputOnly = false); void checkForHeader(); void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx = 0); - void reportOverlapSummary(RecordKeyList &keyList); + void reportOverlapSummary(RecordKeyVector &keyList); static const unsigned int MAX_OUTBUF_SIZE = 16384; //16 K diff --git a/src/utils/general/CommonHelp.cpp b/src/utils/general/CommonHelp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a0efc473dc915ca64667e415e7245675f5082817 --- /dev/null +++ b/src/utils/general/CommonHelp.cpp @@ -0,0 +1,17 @@ +/* + * CommonHelp.cpp + * + * Created on: Jul 2, 2014 + * Author: nek3d + */ +#include "CommonHelp.h" + +void CommonHelp() { + cerr << "\t-iobuf\t" << "Follow with desired integer size of read buffer." << endl; + cerr << "\t\t" << "Optional suffixes K/M/G supported." << endl; + cerr << "\t\t" << "Note: currently has no effect with compressed files." << endl << endl; +} + + + + diff --git a/src/utils/general/CommonHelp.h b/src/utils/general/CommonHelp.h new file mode 100644 index 0000000000000000000000000000000000000000..c54c4556509a2d07be6abdd695fa6225bf75e742 --- /dev/null +++ b/src/utils/general/CommonHelp.h @@ -0,0 +1,19 @@ +/* + * CommonHelpFile.h + * + * Created on: Jul 2, 2014 + * Author: nek3d + */ + +#ifndef COMMONHELPFILE_H_ +#define COMMONHELPFILE_H_ + +using namespace std; + +#include <iostream> + +extern void CommonHelp(); + + + +#endif /* COMMONHELPFILE_H_ */ diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile index 20d7aeead018248fc8d954d4eb982c66c4009701..33f94ecef59b113dca056a1c9bbfdda3bfe793e9 100644 --- a/src/utils/general/Makefile +++ b/src/utils/general/Makefile @@ -9,8 +9,9 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp Tokenizer.h Tokenizer.h -OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o +SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp \ + Tokenizer.h Tokenizer.h CommonHelp.h CommonHelp.cpp +OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o CommonHelp.o BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) all: $(BUILT_OBJECTS) @@ -23,6 +24,6 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." - @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o + @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o $(OBJ_DIR)/CommonHelp.o .PHONY: clean diff --git a/src/utils/general/PushBackStreamBuf.h b/src/utils/general/PushBackStreamBuf.h index d2538eaa3209ef77477bbdd8ffd8f63606806401..1b2e2301e16f933fc5df95c643ea5c6ef9a15d22 100644 --- a/src/utils/general/PushBackStreamBuf.h +++ b/src/utils/general/PushBackStreamBuf.h @@ -31,8 +31,6 @@ protected: private: streambuf* _primary_stream; BTlist<int> _buffer; - static const int SCAN_BUFFER_SIZE = 8192; //8 KB buffer - static const int MAIN_BUFFER_SIZE = 128 * 1024; //128K buffer }; diff --git a/src/utils/general/QuickString.cpp b/src/utils/general/QuickString.cpp index 9e061866f62969778b829c247bad8d32345aad58..5f536156914546e08ea3dfdcddef46ddcb89bd3b 100644 --- a/src/utils/general/QuickString.cpp +++ b/src/utils/general/QuickString.cpp @@ -67,6 +67,11 @@ void QuickString::clear() { _currSize = 0; } +void QuickString::release() { + free(_buffer); + _currCapacity = DEFAULT_CAPACITY; + build(); +} QuickString &QuickString::operator = (const char *inBuf){ set(inBuf, strlen(inBuf)); diff --git a/src/utils/general/QuickString.h b/src/utils/general/QuickString.h index 1024a25f4449149bf19a68c23112228ec8e72a89..93f34bbc52bc24001e8182455cccb0794c970b0d 100644 --- a/src/utils/general/QuickString.h +++ b/src/utils/general/QuickString.h @@ -31,6 +31,7 @@ public: bool empty() const { return _currSize == 0; } void clear(); //only clears buffer, doesn't delete it. + void release(); //will deallocate current buffer, reallocate it at default size. QuickString &operator = (const string &); QuickString &operator = (const char *); QuickString &operator = (const QuickString &); diff --git a/test/intersect/multi_intersect/d1.bed b/test/intersect/multi_intersect/d1.bed new file mode 100644 index 0000000000000000000000000000000000000000..91a43d13300eb670f065b5b31097f8c3019951d0 --- /dev/null +++ b/test/intersect/multi_intersect/d1.bed @@ -0,0 +1,9 @@ +chr1 5 25 +chr1 65 75 +chr1 95 100 +chr2 5 25 +chr2 65 75 +chr2 95 100 +chr3 5 25 +chr3 65 75 +chr3 95 100 diff --git a/test/intersect/multi_intersect/d2.bed b/test/intersect/multi_intersect/d2.bed new file mode 100644 index 0000000000000000000000000000000000000000..c2b4e526c3278797ef2bfd4fb85625ac452e1128 --- /dev/null +++ b/test/intersect/multi_intersect/d2.bed @@ -0,0 +1,6 @@ +chr1 40 50 +chr1 110 125 +chr2 40 50 +chr2 110 125 +chr3 40 50 +chr3 110 125 diff --git a/test/intersect/multi_intersect/d3.bed b/test/intersect/multi_intersect/d3.bed new file mode 100644 index 0000000000000000000000000000000000000000..46862402f8e7f825026125905fb9653389c976ff --- /dev/null +++ b/test/intersect/multi_intersect/d3.bed @@ -0,0 +1,3 @@ +chr1 85 115 +chr2 85 115 +chr3 85 115 diff --git a/test/intersect/multi_intersect/d4.bed b/test/intersect/multi_intersect/d4.bed new file mode 100644 index 0000000000000000000000000000000000000000..34f4e61afba204358755a9382c3ebc7ac3cd111c --- /dev/null +++ b/test/intersect/multi_intersect/d4.bed @@ -0,0 +1,9 @@ +chr1 5 25 +chr1 65 75 +chr1 95 100 +chr2 5 25 +chr2 65 75 +chr2 95 100 +chr10 5 25 +chr10 65 75 +chr10 95 100 diff --git a/test/intersect/multi_intersect/d5.bed b/test/intersect/multi_intersect/d5.bed new file mode 100644 index 0000000000000000000000000000000000000000..8f03c32596c3ccd083e56e933783ca706196e41f --- /dev/null +++ b/test/intersect/multi_intersect/d5.bed @@ -0,0 +1,6 @@ +chr1 40 50 +chr1 110 125 +chr2 40 50 +chr2 110 125 +chr10 40 50 +chr10 110 125 diff --git a/test/intersect/multi_intersect/d6.bed b/test/intersect/multi_intersect/d6.bed new file mode 100644 index 0000000000000000000000000000000000000000..e1026a74057e9daa78ddf979b963d1cb0c2a8485 --- /dev/null +++ b/test/intersect/multi_intersect/d6.bed @@ -0,0 +1,3 @@ +chr1 85 115 +chr2 85 115 +chr10 85 115 diff --git a/test/intersect/multi_intersect/d7.bed b/test/intersect/multi_intersect/d7.bed new file mode 100644 index 0000000000000000000000000000000000000000..c1220eb3d63ba894fe7d3ef30861de340688bc9c --- /dev/null +++ b/test/intersect/multi_intersect/d7.bed @@ -0,0 +1,10 @@ +# DB 7 Header line. +chr1 5 25 d7_1 100 + +chr1 65 75 d7_2 100 - +chr1 95 100 d7_3 100 + +chr2 5 25 d7_4 100 - +chr2 65 75 d7_5 100 . +chr2 95 100 d7_6 100 - +chr3 5 25 d7_7 100 + +chr3 65 75 d7_8 100 - +chr3 95 100 d7_9 100 . diff --git a/test/intersect/multi_intersect/d8.bed b/test/intersect/multi_intersect/d8.bed new file mode 100644 index 0000000000000000000000000000000000000000..565eb9808623e2c636a1b8905a6077b7f9d70cb1 --- /dev/null +++ b/test/intersect/multi_intersect/d8.bed @@ -0,0 +1,7 @@ +# DB 8 Header Line. +chr1 40 50 d8_1 100 - +chr1 110 125 d8_2 100 + +chr2 40 50 d8_3 100 - +chr2 110 125 d8_4 100 . +chr3 40 50 d8_5 100 - +chr3 110 125 d8_6 100 + diff --git a/test/intersect/multi_intersect/d9.bed b/test/intersect/multi_intersect/d9.bed new file mode 100644 index 0000000000000000000000000000000000000000..188c7b4894a1cc4b0e068c2c50525ecb335a429f --- /dev/null +++ b/test/intersect/multi_intersect/d9.bed @@ -0,0 +1,4 @@ +# DB 9 Header Line. +chr1 85 115 d9_1 100 + +chr2 85 115 d9_1 100 - +chr3 85 115 d9_1 100 + diff --git a/test/intersect/multi_intersect/g.bed b/test/intersect/multi_intersect/g.bed new file mode 100644 index 0000000000000000000000000000000000000000..c340e1413e15f4f8d94159d24e4bf29aded418d3 --- /dev/null +++ b/test/intersect/multi_intersect/g.bed @@ -0,0 +1,3 @@ +chr1 200 +chr2 200 +chr10 200 diff --git a/test/intersect/multi_intersect/query.bed b/test/intersect/multi_intersect/query.bed new file mode 100644 index 0000000000000000000000000000000000000000..b778252d69bb06b4345b309041303f84818c23fc --- /dev/null +++ b/test/intersect/multi_intersect/query.bed @@ -0,0 +1,12 @@ +chr1 1 20 +chr1 40 45 +chr1 70 90 +chr1 105 120 +chr2 1 20 +chr2 40 45 +chr2 70 90 +chr2 105 120 +chr3 1 20 +chr3 40 45 +chr3 70 90 +chr3 105 120 diff --git a/test/intersect/multi_intersect/query2.bed b/test/intersect/multi_intersect/query2.bed new file mode 100644 index 0000000000000000000000000000000000000000..dd6000e6666c58be5442b54cdc0f1cfe7dfad1a4 --- /dev/null +++ b/test/intersect/multi_intersect/query2.bed @@ -0,0 +1,13 @@ +# Query 2 Header Line. +chr1 1 20 +chr1 40 45 +chr1 70 90 +chr1 105 120 +chr2 1 20 +chr2 40 45 +chr2 70 90 +chr2 105 120 +chr10 1 20 +chr10 40 45 +chr10 70 90 +chr10 105 120 diff --git a/test/intersect/multi_intersect/query3.bed b/test/intersect/multi_intersect/query3.bed new file mode 100644 index 0000000000000000000000000000000000000000..0c875c8bb2162d069efc891b15f42fd66657b955 --- /dev/null +++ b/test/intersect/multi_intersect/query3.bed @@ -0,0 +1,13 @@ +# Query 3 Header Line. +chr1 1 20 q1 100 + +chr1 40 45 q2 100 + +chr1 70 90 q3 100 + +chr1 105 120 q4 100 + +chr2 1 20 q5 100 + +chr2 40 45 q6 100 + +chr2 70 90 q7 100 + +chr2 105 120 q8 100 + +chr3 1 20 q9 100 + +chr3 40 45 q10 100 + +chr3 70 90 q11 100 + +chr3 105 120 q12 100 + diff --git a/test/intersect/multi_intersect/test-multi_intersect.sh b/test/intersect/multi_intersect/test-multi_intersect.sh new file mode 100644 index 0000000000000000000000000000000000000000..ed3b4ae80b44e3fc833d372e7f583361b634e8f6 --- /dev/null +++ b/test/intersect/multi_intersect/test-multi_intersect.sh @@ -0,0 +1,404 @@ +echo -e \ +"\n########################################################### +# +# MULTIPLE DATABASE INTERSECTION +# +###########################################################\n" + + +BT=${BT-../../../bin/bedtools} + +check() +{ + if diff $1 $2; then + echo ok + else + echo fail + fi +} + +########################################################### +# Test the intersection of a query with 3 dbs against +# the merged and sorted result of 3 +# seperate intersections of the query with each db. +# +# First make sure the 3 seperate intersections are correct. +# +############################################################ +echo " intersect.t01...\c" +echo \ +"chr1 5 20 +chr1 70 75 +chr2 5 20 +chr2 70 75 +chr3 5 20 +chr3 70 75" > exp1 +$BT intersect -a query.bed -b d1.bed > obs +check obs exp1 +rm obs + +echo " intersect.t02...\c" +echo \ +"chr1 40 45 +chr1 110 120 +chr2 40 45 +chr2 110 120 +chr3 40 45 +chr3 110 120" > exp2 +$BT intersect -a query.bed -b d2.bed > obs +check obs exp2 +rm obs + +echo " intersect.t03...\c" +echo \ +"chr1 85 90 +chr1 105 115 +chr2 85 90 +chr2 105 115 +chr3 85 90 +chr3 105 115" > exp3 +$BT intersect -a query.bed -b d3.bed > obs +check obs exp3 +rm obs + +########################################################### +# Now test that the R-tree (unsorted input) with sorted +# output will give the correct result +########################################################### +echo " intersect.t04...\c" +cat exp1 exp2 exp3 | sort -k1,1 -k2,2n > exp +$BT intersect -a query.bed -b d1.bed d2.bed d3.bed -sortout > obs +check exp obs +rm obs exp1 exp2 exp3 + + +########################################################### +# And then test that sorted input with sorted +# output will give the correct result +########################################################### +echo " intersect.t05...\c" +$BT intersect -a query.bed -b d1.bed d2.bed d3.bed -sorted -sortout > obs +check exp obs +rm obs exp + + +########################################################### +# +# Now repeat the above with chroms in a non-standard order +# +########################################################### + +echo " intersect.t06...\c" +echo \ +"chr1 5 20 +chr1 70 75 +chr2 5 20 +chr2 70 75 +chr10 5 20 +chr10 70 75" > exp1 +$BT intersect -a query2.bed -b d4.bed > obs +check obs exp1 +rm obs + +echo " intersect.t07...\c" +echo \ +"chr1 40 45 +chr1 110 120 +chr2 40 45 +chr2 110 120 +chr10 40 45 +chr10 110 120" > exp2 +$BT intersect -a query2.bed -b d5.bed > obs +check obs exp2 +rm obs + +echo " intersect.t08...\c" +echo \ +"chr1 85 90 +chr1 105 115 +chr2 85 90 +chr2 105 115 +chr10 85 90 +chr10 105 115" > exp3 +$BT intersect -a query2.bed -b d6.bed > obs +check obs exp3 +rm obs + +########################################################### +# Now test that the R-tree (unsorted input) with sorted +# output will give the correct result +########################################################### +echo " intersect.t09...\c" +cat exp1 exp2 exp3 | sort -k1,1V -k2,2n > exp +$BT intersect -a query2.bed -b d4.bed d5.bed d6.bed -g g.bed -sortout > obs +check exp obs +rm obs exp1 exp2 exp3 + + +########################################################### +# And then test that sorted input with sorted +# output will give the correct result +########################################################### +echo " intersect.t10...\c" +$BT intersect -a query2.bed -b d4.bed d5.bed d6.bed -sorted -g g.bed -sortout > obs +check exp obs +rm obs exp + + + +########################################################### +# +# TEST VARIOUS CMD LINE OUTPUT OPTIONS +# +########################################################### + + +########################################################### +# Test -c option +########################################################### +echo " intersect.t11...\c" +echo \ +"chr1 1 20 1 +chr1 40 45 1 +chr1 70 90 2 +chr1 105 120 2 +chr2 1 20 1 +chr2 40 45 1 +chr2 70 90 2 +chr2 105 120 2 +chr3 1 20 1 +chr3 40 45 1 +chr3 70 90 2 +chr3 105 120 2" > exp +$BT intersect -a query.bed -b d1.bed d2.bed d3.bed -c -sorted -sortout > obs +check exp obs +rm exp obs + + +########################################################### +# Test -f option +########################################################### +echo " intersect.t12...\c" +echo \ +"chr1 5 20 +chr1 40 45 +chr1 110 120 +chr1 105 115 +chr2 5 20 +chr2 40 45 +chr2 110 120 +chr2 105 115 +chr3 5 20 +chr3 40 45 +chr3 110 120 +chr3 105 115" > exp +$BT intersect -a query.bed -b d1.bed d2.bed d3.bed -f .6 > obs +check exp obs + + +########################################################### +# Test -s option +########################################################### +echo " intersect.t13...\c" +echo \ +"chr1 5 20 q1 100 + +chr1 85 90 q3 100 + +chr1 110 120 q4 100 + +chr1 105 115 q4 100 + +chr3 5 20 q9 100 + +chr3 85 90 q11 100 + +chr3 110 120 q12 100 + +chr3 105 115 q12 100 +" > exp +$BT intersect -a query3.bed -b d7.bed d8.bed d9.bed -s > obs +check exp obs +rm exp obs + + +########################################################### +# Test -s option +########################################################### +echo " intersect.t13...\c" +echo \ +"chr1 40 45 q2 100 + +chr1 70 75 q3 100 + +chr2 5 20 q5 100 + +chr2 40 45 q6 100 + +chr2 85 90 q7 100 + +chr2 105 115 q8 100 + +chr3 40 45 q10 100 + +chr3 70 75 q11 100 +" > exp +$BT intersect -a query3.bed -b d7.bed d8.bed d9.bed -S > obs +check exp obs +rm exp obs + + +########################################################### +# Test -wo option +########################################################### +echo " intersect.t14...\c" +echo \ +"chr1 1 20 1 chr1 5 25 15 +chr1 40 45 2 chr1 40 50 5 +chr1 70 90 1 chr1 65 75 5 +chr1 70 90 3 chr1 85 115 5 +chr1 105 120 2 chr1 110 125 10 +chr1 105 120 3 chr1 85 115 10 +chr2 1 20 1 chr2 5 25 15 +chr2 40 45 2 chr2 40 50 5 +chr2 70 90 1 chr2 65 75 5 +chr2 70 90 3 chr2 85 115 5 +chr2 105 120 2 chr2 110 125 10 +chr2 105 120 3 chr2 85 115 10 +chr3 1 20 1 chr3 5 25 15 +chr3 40 45 2 chr3 40 50 5 +chr3 70 90 1 chr3 65 75 5 +chr3 70 90 3 chr3 85 115 5 +chr3 105 120 2 chr3 110 125 10 +chr3 105 120 3 chr3 85 115 10" > exp +$BT intersect -a query.bed -b d1.bed d2.bed d3.bed -wo > obs +check exp obs +rm exp obs + + + +########################################################### +# Test -wa -wb -header option +########################################################### +echo " intersect.t15...\c" +echo \ +"# Query 3 Header Line. +chr1 1 20 q1 100 + 1 chr1 5 25 d7_1 100 + +chr1 40 45 q2 100 + 2 chr1 40 50 d8_1 100 - +chr1 70 90 q3 100 + 1 chr1 65 75 d7_2 100 - +chr1 70 90 q3 100 + 3 chr1 85 115 d9_1 100 + +chr1 105 120 q4 100 + 2 chr1 110 125 d8_2 100 + +chr1 105 120 q4 100 + 3 chr1 85 115 d9_1 100 + +chr2 1 20 q5 100 + 1 chr2 5 25 d7_4 100 - +chr2 40 45 q6 100 + 2 chr2 40 50 d8_3 100 - +chr2 70 90 q7 100 + 1 chr2 65 75 d7_5 100 . +chr2 70 90 q7 100 + 3 chr2 85 115 d9_1 100 - +chr2 105 120 q8 100 + 2 chr2 110 125 d8_4 100 . +chr2 105 120 q8 100 + 3 chr2 85 115 d9_1 100 - +chr3 1 20 q9 100 + 1 chr3 5 25 d7_7 100 + +chr3 40 45 q10 100 + 2 chr3 40 50 d8_5 100 - +chr3 70 90 q11 100 + 1 chr3 65 75 d7_8 100 - +chr3 70 90 q11 100 + 3 chr3 85 115 d9_1 100 + +chr3 105 120 q12 100 + 2 chr3 110 125 d8_6 100 + +chr3 105 120 q12 100 + 3 chr3 85 115 d9_1 100 +" > exp +$BT intersect -a query3.bed -b d7.bed d8.bed d9.bed -wa -wb -header > obs +check exp obs +rm exp obs + + + +########################################################### +# Test the -filenames option, before db listing +########################################################### +echo " intersect.t16...\c" +echo \ +"chr1 1 20 q1 100 + d7.bed chr1 5 25 d7_1 100 + +chr1 40 45 q2 100 + d8.bed chr1 40 50 d8_1 100 - +chr1 70 90 q3 100 + d7.bed chr1 65 75 d7_2 100 - +chr1 70 90 q3 100 + d9.bed chr1 85 115 d9_1 100 + +chr1 105 120 q4 100 + d8.bed chr1 110 125 d8_2 100 + +chr1 105 120 q4 100 + d9.bed chr1 85 115 d9_1 100 + +chr2 1 20 q5 100 + d7.bed chr2 5 25 d7_4 100 - +chr2 40 45 q6 100 + d8.bed chr2 40 50 d8_3 100 - +chr2 70 90 q7 100 + d7.bed chr2 65 75 d7_5 100 . +chr2 70 90 q7 100 + d9.bed chr2 85 115 d9_1 100 - +chr2 105 120 q8 100 + d8.bed chr2 110 125 d8_4 100 . +chr2 105 120 q8 100 + d9.bed chr2 85 115 d9_1 100 - +chr3 1 20 q9 100 + d7.bed chr3 5 25 d7_7 100 + +chr3 40 45 q10 100 + d8.bed chr3 40 50 d8_5 100 - +chr3 70 90 q11 100 + d7.bed chr3 65 75 d7_8 100 - +chr3 70 90 q11 100 + d9.bed chr3 85 115 d9_1 100 + +chr3 105 120 q12 100 + d8.bed chr3 110 125 d8_6 100 + +chr3 105 120 q12 100 + d9.bed chr3 85 115 d9_1 100 +" > exp +$BT intersect -a query3.bed -filenames -b d7.bed d8.bed d9.bed -wa -wb > obs +check exp obs +rm exp obs + +########################################################### +# Test the -filenames option, after db listing +########################################################### +echo " intersect.t17...\c" +echo \ +"chr1 1 20 q1 100 + d7.bed chr1 5 25 d7_1 100 + +chr1 40 45 q2 100 + d8.bed chr1 40 50 d8_1 100 - +chr1 70 90 q3 100 + d7.bed chr1 65 75 d7_2 100 - +chr1 70 90 q3 100 + d9.bed chr1 85 115 d9_1 100 + +chr1 105 120 q4 100 + d8.bed chr1 110 125 d8_2 100 + +chr1 105 120 q4 100 + d9.bed chr1 85 115 d9_1 100 + +chr2 1 20 q5 100 + d7.bed chr2 5 25 d7_4 100 - +chr2 40 45 q6 100 + d8.bed chr2 40 50 d8_3 100 - +chr2 70 90 q7 100 + d7.bed chr2 65 75 d7_5 100 . +chr2 70 90 q7 100 + d9.bed chr2 85 115 d9_1 100 - +chr2 105 120 q8 100 + d8.bed chr2 110 125 d8_4 100 . +chr2 105 120 q8 100 + d9.bed chr2 85 115 d9_1 100 - +chr3 1 20 q9 100 + d7.bed chr3 5 25 d7_7 100 + +chr3 40 45 q10 100 + d8.bed chr3 40 50 d8_5 100 - +chr3 70 90 q11 100 + d7.bed chr3 65 75 d7_8 100 - +chr3 70 90 q11 100 + d9.bed chr3 85 115 d9_1 100 + +chr3 105 120 q12 100 + d8.bed chr3 110 125 d8_6 100 + +chr3 105 120 q12 100 + d9.bed chr3 85 115 d9_1 100 +" > exp +$BT intersect -a query3.bed -b d7.bed d8.bed d9.bed -filenames -wa -wb > obs +check exp obs +rm exp obs + + +########################################################### +# Test the -names option, before db listing +########################################################### +echo " intersect.t18...\c" +echo \ +"chr1 1 20 q1 100 + blue chr1 5 25 d7_1 100 + +chr1 40 45 q2 100 + red chr1 40 50 d8_1 100 - +chr1 70 90 q3 100 + blue chr1 65 75 d7_2 100 - +chr1 70 90 q3 100 + green chr1 85 115 d9_1 100 + +chr1 105 120 q4 100 + red chr1 110 125 d8_2 100 + +chr1 105 120 q4 100 + green chr1 85 115 d9_1 100 + +chr2 1 20 q5 100 + blue chr2 5 25 d7_4 100 - +chr2 40 45 q6 100 + red chr2 40 50 d8_3 100 - +chr2 70 90 q7 100 + blue chr2 65 75 d7_5 100 . +chr2 70 90 q7 100 + green chr2 85 115 d9_1 100 - +chr2 105 120 q8 100 + red chr2 110 125 d8_4 100 . +chr2 105 120 q8 100 + green chr2 85 115 d9_1 100 - +chr3 1 20 q9 100 + blue chr3 5 25 d7_7 100 + +chr3 40 45 q10 100 + red chr3 40 50 d8_5 100 - +chr3 70 90 q11 100 + blue chr3 65 75 d7_8 100 - +chr3 70 90 q11 100 + green chr3 85 115 d9_1 100 + +chr3 105 120 q12 100 + red chr3 110 125 d8_6 100 + +chr3 105 120 q12 100 + green chr3 85 115 d9_1 100 +" > exp +$BT intersect -a query3.bed -names blue red green -b d7.bed d8.bed d9.bed -wa -wb > obs +check exp obs +rm exp obs + +########################################################### +# Test the -names option, after db listing +########################################################### +echo " intersect.t19...\c" +echo \ +"chr1 1 20 q1 100 + blue chr1 5 25 d7_1 100 + +chr1 40 45 q2 100 + red chr1 40 50 d8_1 100 - +chr1 70 90 q3 100 + blue chr1 65 75 d7_2 100 - +chr1 70 90 q3 100 + green chr1 85 115 d9_1 100 + +chr1 105 120 q4 100 + red chr1 110 125 d8_2 100 + +chr1 105 120 q4 100 + green chr1 85 115 d9_1 100 + +chr2 1 20 q5 100 + blue chr2 5 25 d7_4 100 - +chr2 40 45 q6 100 + red chr2 40 50 d8_3 100 - +chr2 70 90 q7 100 + blue chr2 65 75 d7_5 100 . +chr2 70 90 q7 100 + green chr2 85 115 d9_1 100 - +chr2 105 120 q8 100 + red chr2 110 125 d8_4 100 . +chr2 105 120 q8 100 + green chr2 85 115 d9_1 100 - +chr3 1 20 q9 100 + blue chr3 5 25 d7_7 100 + +chr3 40 45 q10 100 + red chr3 40 50 d8_5 100 - +chr3 70 90 q11 100 + blue chr3 65 75 d7_8 100 - +chr3 70 90 q11 100 + green chr3 85 115 d9_1 100 + +chr3 105 120 q12 100 + red chr3 110 125 d8_6 100 + +chr3 105 120 q12 100 + green chr3 85 115 d9_1 100 +" > exp +$BT intersect -a query3.bed -b d7.bed d8.bed d9.bed -names blue red green -wa -wb > obs +check exp obs +rm exp obs + + + diff --git a/test/intersect/new_test-intersect.sh~ b/test/intersect/new_test-intersect.sh~ new file mode 100755 index 0000000000000000000000000000000000000000..55b01ed54d89aeedd7bdb34a4563011a9fd18321 --- /dev/null +++ b/test/intersect/new_test-intersect.sh~ @@ -0,0 +1,766 @@ +BT=${BT-../../bin/bedtools} + +check() +{ + if diff $1 $2; then + echo ok + else + echo fail + fi +} + +########################################################### +# Test intersection of a as bed from file vs b as bed from file +############################################################ +echo " intersect.new.t01...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a.bed -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bed from redirect vs b as bed from file +############################################################ +echo " intersect.new.t02...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bed from pipe vs b as bed from file +############################################################ +echo " intersect.new.t03...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a.bed | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bed from fifo vs b as bed from file +############################################################ +echo " intersect.new.t04...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a.bed) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as gzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t05...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_gzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as gzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t06...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_gzipped.bed.gz > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as gzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t07...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_gzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as gzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t08...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_gzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bgzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t09...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_bgzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bgzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t10...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_bgzipped.bed.gz > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bgzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t11...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_bgzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bgzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t12...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_bgzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a as bam from file vs b as bed from file +############################################################ +echo " intersect.new.t13...\c" +$BT intersect -a a.bam -b b.bed> obs +check obs aVSb.bam +rm obs + + +########################################################### +# Test intersection of a as bam from redirect vs b as bed from file +############################################################ +echo " intersect.new.t14...\c" +$BT intersect -a - -b b.bed < a.bam> obs +check obs aVSb.bam +rm obs + + +########################################################### +# Test intersection of a as bam from pipe vs b as bed from file +############################################################ +echo " intersect.new.t15...\c" +cat a.bam | $BT intersect -a - -b b.bed> obs +check obs aVSb.bam +rm obs + + +########################################################### +# Test intersection of a as bam from fifo vs b as bed from file +############################################################ +echo " intersect.new.t16...\c" +$BT intersect -a <(cat a.bam) -b b.bed > obs +check obs aVSb.bam +rm obs + + +########################################################### +# Test intersection of bam file containing both good reads +# and those where both read and mate are unmapped vs b file +# as bed. +############################################################ +echo " intersect.new.t17...\c" +echo \ +"chr1 100 101 a2 255 - 100 200 0,0,0 1 100, 0, +chr1 100 110 a2 255 - 100 200 0,0,0 1 100, 0," > exp +$BT intersect -a a_with_bothUnmapped.bam -b b.bed -bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of bam file containing both good reads +# and those where both read and mate are unmapped vs b file +# as bed, with noHit (-v) option. +############################################################ +echo " intersect.new.t18...\c" +echo \ +"chr1 10 20 a1 255 + 10 20 0,0,0 1 10, 0, +. -1 -1 FCC1MK2ACXX:1:1101:5780:51632#/1 0 . -1 -1 -1 0,0,0 0 . . +. -1 -1 FCC1MK2ACXX:1:1101:5780:51632#/2 0 . -1 -1 -1 0,0,0 0 . . +. -1 -1 FCC1MK2ACXX:1:1101:8137:99409#/1 0 . -1 -1 -1 0,0,0 0 . . +. -1 -1 FCC1MK2ACXX:1:1101:8137:99409#/2 0 . -1 -1 -1 0,0,0 0 . . +. -1 -1 FCC1MK2ACXX:1:1102:6799:2633#/1 0 . -1 -1 -1 0,0,0 0 . . +. -1 -1 FCC1MK2ACXX:1:1102:6799:2633#/2 0 . -1 -1 -1 0,0,0 0 . ." > exp +$BT intersect -a a_with_bothUnmapped.bam -b b.bed -bed -v > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of bam file containing read where query +# is mapped and mate is not. +############################################################ +echo " intersect.new.t19...\c" +echo \ +"chr1 98650 98704 FCC1MK2ACXX:1:1212:13841:9775#/1 0 + 98604 98704 0,0,0 1 100, 0," > exp +$BT intersect -a oneUnmapped.bam -b j1.bed -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test intersection of bam file containing read where query +# is mapped and mate is not, with noHit (-v) option. +############################################################ +echo " intersect.new.t20...\c" +echo \ +"chr1 -1 -1 FCC1MK2ACXX:1:1212:13841:9775#/2 0 . -1 -1 -1 0,0,0 0 . ." > exp +$BT intersect -a oneUnmapped.bam -b j1.bed -bed -v > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of bam file containing read where query +# is unmapped but mate is mapped. +############################################################ +echo " intersect.new.t20.b...\c" +touch exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test intersection of bam file containing read where one +# mate is mapped and one is not, with noHit (-v) option. +############################################################ +echo " intersect.new.t20.c...\c" +echo ". -1 -1 TTTACCTTT:4FSQ5P1:286:D2GA7ACXX:6:2316:20858:89646 0 . -1 -1 -1 0,0,0 0 . ." > exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed -v > obs +check obs exp +rm obs exp + + + + + +########################################################### +# Test intersection with -sorted, see that order specified +# in genome file is enforced. +############################################################ +echo " intersect.new.t21...\c" +echo \ +"Error: Sorted input specified, but the file chromOrderA.bed has the following record with a different sort order than the genomeFile human.hg19.genome +chr10 10 20 a3 100 +" > exp +$BT intersect -a chromOrderA.bed -b chromOrderB.bed -sorted -g human.hg19.genome 2>obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection with -sorted, see that hits are missed +# With no genome file +############################################################ +echo " intersect.new.t22...\c" +echo \ +"chr1 15 20 a1 100 + +chr2 15 20 a2 100 + +chrX 15 20 a5 100 +" > exp +$BT intersect -a chromOrderA.bed -b chromOrderB.bed -sorted > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection with -sorted, see that hits are correct +# when sort order of files matches genome file sort order +############################################################ +echo " intersect.new.t23...\c" +echo \ +"chr1 15 20 a1 100 + +chr2 15 20 a2 100 + +chr10 15 20 a3 100 + +chr11 15 20 a4 100 + +chrX 15 20 a5 100 + +chrM 15 20 a6 100 +" > exp +$BT intersect -a chromOrderA.bed -b chromOrderB.bed -sorted -g human.hg19.vSort.genome > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from file vs b as bed from file +############################################################ +echo " intersect.new.t24...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader.bed -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from redirect vs b as bed from file +############################################################ +echo " intersect.new.t25...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from pipe vs b as bed from file +############################################################ +echo " intersect.new.t26...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader.bed | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from fifo vs b as bed from file +############################################################ +echo " intersect.new.t27...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader.bed) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t28...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader_gzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t29...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader_gzipped.bed.gz > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t30...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader_gzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as gzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t31...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader_gzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from file vs b as bed from file +############################################################ +echo " intersect.new.t32...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a a_withLargeHeader_bgzipped.bed.gz -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from redirect vs b as bed from file +############################################################ +echo " intersect.new.t33...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a - -b b.bed < a_withLargeHeader_bgzipped.bed.gz > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from pipe vs b as bed from file +############################################################ +echo " intersect.new.t34...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +cat a_withLargeHeader_bgzipped.bed.gz | $BT intersect -a - -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bgzipped from fifo vs b as bed from file +############################################################ +echo " intersect.new.t35...\c" +echo \ +"chr1 100 101 a2 2 - +chr1 100 110 a2 2 -" > exp +$BT intersect -a <(cat a_withLargeHeader_bgzipped.bed.gz) -b b.bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test intersection of a with large header as bed from file +# vs b as bed from file, and print header +############################################################ +echo " intersect.new.t36...\c" +$BT intersect -a a_withLargeHeader.bed -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bed from +# redirect vs b as bed from file, and print header +############################################################ +echo " intersect.new.t37...\c" +$BT intersect -a - -b b.bed < a_withLargeHeader.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bed from pipe +# vs b as bed from file, and print header +############################################################ +echo " intersect.new.t38...\c" +cat a_withLargeHeader.bed | $BT intersect -a - -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bed from fifo +# vs b as bed from file, and print header +############################################################ +echo " intersect.new.t39...\c" +$BT intersect -a <(cat a_withLargeHeader.bed) -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as gzipped from +# file vs b as bed from file, and print header +############################################################ +echo " intersect.new.t40...\c" +$BT intersect -a a_withLargeHeader_gzipped.bed.gz -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as gzipped from +# redirect vs b as bed from file, and print header +############################################################ +echo " intersect.new.t41...\c" +$BT intersect -a - -b b.bed < a_withLargeHeader_gzipped.bed.gz -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as gzipped from +# pipe vs b as bed from file, and print header +############################################################ +echo " intersect.new.t42...\c" +cat a_withLargeHeader_gzipped.bed.gz | $BT intersect -a - -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as gzipped from +# fifo vs b as bed from file, and print header +############################################################ +echo " intersect.new.t43...\c" +$BT intersect -a <(cat a_withLargeHeader_gzipped.bed.gz) -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bgzipped from +# file vs b as bed from file, and print header +############################################################ +echo " intersect.new.t44...\c" +$BT intersect -a a_withLargeHeader_bgzipped.bed.gz -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bgzipped from +# redirect vs b as bed from file, and print header +############################################################ +echo " intersect.new.t45...\c" +$BT intersect -a - -b b.bed < a_withLargeHeader_bgzipped.bed.gz -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bgzipped from +# pipe vs b as bed from file, and print header +############################################################ +echo " intersect.new.t46...\c" +cat a_withLargeHeader_bgzipped.bed.gz | $BT intersect -a - -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test intersection of a with large header as bgzipped from +# fifo vs b as bed from file, and print header +############################################################ +echo " intersect.new.t47...\c" +$BT intersect -a <(cat a_withLargeHeader_bgzipped.bed.gz) -b b.bed -header > obs +check obs aWithHeaderVsB.txt +rm obs + + +########################################################### +# Test enforcement of sort order when using -sorted option. +# Show that detect chrom "jumping", i.e. chr1, chr2, then +# back to chr1 +############################################################ +echo " intersect.new.t48...\c" +echo "Error: Sorted input specified" > exp +$BT intersect -a chromsOutOfOrder.bed -b b.bed -sorted 2>&1 > /dev/null | grep "Error: Sorted input specified, " | cut -f1 -d ',' > obs +check obs exp +rm obs exp + +########################################################### +# Test enforcement of sort order when using -sorted option. +# Show that detect chrom "jumping", i.e. chr1, chr2, then +# back to chr1 +############################################################ +echo " intersect.new.t49...\c" +echo "Error: Sorted input specified" > exp +$BT intersect -a recordsOutOfOrder.bed -b b.bed -sorted 2>&1 > /dev/null | grep "Error: Sorted input specified, " | cut -f1 -d ',' > obs +check obs exp +rm obs exp + +########################################################### +# Test that we throw an error for non-existant files +############################################################ +echo " intersect.new.t50...\c" +echo "Error: Unable to open file nonExistantFile.bed. Exiting." > exp +$BT intersect -a nonExistantFile.bed -b b.bed -sorted 2>&1 > /dev/null | cat - >obs +check obs exp +rm obs exp + +########################################################### +# Test that we allow empty query files +############################################################ +echo " intersect.new.t51...\c" +touch exp +touch dummy.txt +$BT intersect -a dummy.txt -b b.bed 2>&1 > /dev/null | cat - > obs +check obs exp +rm obs exp dummy.txt + + +########################################################### +# Test that we throw an error for unrecognized arguments +############################################################ +echo " intersect.new.t52...\c" +echo "***** ERROR: Unrecognized parameter: -wrongArg *****" > exp +$BT intersect -a a.bed -b b.bed -wrongArg 2>&1 > /dev/null | head -2 | tail -1 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that we can process a Bam file with no text in +# it's header. +############################################################ +echo " intersect.new.t53...\c" +$BT intersect -a gdc.bam -b gdc.bam -bed > obs +check obs gdc_exp +rm obs + +########################################################### +# Test that if the query is BAM, and bed output is not +# explicit, and they asked for an output option that is not +# valid with BAM output, an error is thrown. +############################################################ +echo " intersect.new.t54...\c" +echo "***** ERROR: writeAllOverlap option is not valid with BAM query input, unless bed output is specified with -bed option. *****" > exp +$BT intersect -a a.bam -b b.bed -wao 2>&1 > /dev/null | head -2 | tail -1 > obs +check obs exp +rm obs exp + + +########################################################### +# Test that gff files work correctly +############################################################ +echo " intersect.new.t55...\c" +echo \ +"chr2L . UTR 50 70 0 + . ID=mRNA:xs2:UTR:41-70;Parent=mRNA:xs2; +chr2L . CDS 71 100 0 + . ID=mRNA:xs2:CDS:71-130;Parent=mRNA:xs2; +chr2L . exon 50 100 0 + . ID=mRNA:xs2:exon:41-130;Parent=mRNA:xs2; +chr2L . mRNA 50 100 0 + . ID=mRNA:xs2;Parent=g2; +chr2L . gene 50 100 0 + . ID=g2;" > exp +$BT intersect -a gdc.gff -b gdc_one.bed > obs +check obs exp +rm obs exp + +########################################################### +# Test that we allow empty database files, unsorted +############################################################ +echo " intersect.new.t56...\c" +touch exp +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt 2>&1 > /dev/null | cat - > obs +check obs exp +rm obs exp dummy.txt + +########################################################### +# Test that we allow empty database files, sorted +############################################################ +echo " intersect.new.t57...\c" +touch exp +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt -sorted 2>&1 > /dev/null | cat - > obs +check obs exp +rm obs exp dummy.txt + + +########################################################### +# Test that we allow empty database files, unsorted, with +# -v (noHit) option +############################################################ +echo " intersect.new.t58...\c" +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt -v > obs +check obs a.bed +rm obs dummy.txt + + +########################################################### +# Test that an empty query with header run with -header +# option will print header +############################################################ +echo " intersect.new.t59...\c" +echo "#Random Header" >dummy.txt +$BT intersect -a dummy.txt -b a.bed -header > obs +check obs dummy.txt +rm obs dummy.txt + + +########################################################### +# Test that an empty query with header, gzipped, that +# runs with -header option will print header +############################################################ +echo " intersect.new.t60...\c" +echo "#Random Header" >dummy.txt +gzip dummy.txt +echo "#Random Header" >exp +$BT intersect -a dummy.txt.gz -b a.bed -header > obs +check obs exp +rm obs dummy.txt.gz exp + +########################################################### +# Test that an empty query with header, bgzipped, that +# runs with -header option will print header +############################################################ +echo " intersect.new.t61...\c" +echo "#Random Header" >dummy.txt +bgzip dummy.txt +echo "#Random Header" >exp +$BT intersect -a dummy.txt.gz -b a.bed -header > obs +check obs exp +rm obs dummy.txt.gz exp + + +########################################################### +# Test that an empty VCF query with header that +# runs with -header option will print header +############################################################ +echo " intersect.new.t62...\c" +$BT intersect -a headerOnly.vcf -b a.bed -header > obs +check obs headerOnly.vcf +rm obs + + +########################################################### +# Test that files with DOS newline characters, '\r', +# and/or extra tabs at end of line are handled +############################################################ +echo " intersect.new.t63...\c" +echo \ +"chr1 11323785 11617177 +chr1 12645605 13926923 +chr1 14750216 15119039" >exp +~/mergeBugSpace/bt2-merge-debug/bin/bedtools intersect -a dosLineChar_a.bed -b dosLineCharWithExtraTab_b.bed -v > obs +check exp obs +rm exp obs + +########################################################### +# Test that files with no newlines at all are handled +############################################################ +echo " intersect.new.t64...\c" +echo "chr17 7577068 7577157" > exp +$BT intersect -a oneRecordNoNewline.bed -b oneRecordNoNewline.bed > obs +check obs exp +rm obs exp + diff --git a/test/intersect/performanceTest.sh b/test/intersect/performanceTest.sh index 6e9109343ee1044af7bdc66a90ae2353ae0c1390..0d63763f6a18fd64611408df356324ec5d02cc21 100755 --- a/test/intersect/performanceTest.sh +++ b/test/intersect/performanceTest.sh @@ -23,7 +23,7 @@ if true; then echo "generating data..." mkdir perfData cd perfData - ../$BT random -l 1000 -n 20000000 -g ../human.hg19.genome | sort -k1,1 -k2,2n > a10M.bed + ../$BT random -l 1000 -n 10000000 -g ../human.hg19.genome | sort -k1,1 -k2,2n > a10M.bed ../$BT random -l 1000 -n 10000000 -g ../human.hg19.genome | sort -k1,1 -k2,2n > b10M.bed cp a10M.bed a10M_gzipped.bed gzip a10M_gzipped.bed diff --git a/test/intersect/test-intersect.sh b/test/intersect/test-intersect.sh index e6c2c670e1e78fd2ab66d6f6f25bbbca8129f6a0..f29ac8406fafb56731c40c84dfff7a02ae1d3cd6 100644 --- a/test/intersect/test-intersect.sh +++ b/test/intersect/test-intersect.sh @@ -458,4 +458,9 @@ chr1 100 200 a2 2 -" > exp $BT intersect -a a.bed -b a.bam > obs -rm one_block.bam two_blocks.bam three_blocks.bam \ No newline at end of file +rm one_block.bam two_blocks.bam three_blocks.bam + + +cd multi_intersect +bash test-multi_intersect.sh +cd .. \ No newline at end of file