diff --git a/Makefile b/Makefile index 1890fa2653ecfe15b4f2b33c8b2b7b3a951d5ab1..eb2f97ae396678f8489b416ccc0327409939bec5 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ $(SRC_DIR)/mapFile \ - $(SRC_DIR)/mergeBed \ + $(SRC_DIR)/mergeFile \ $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/multiIntersectBed \ $(SRC_DIR)/nekSandbox1 \ diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 928d1d9a4f7e8f985cc9edd2ca44b1dbdcf8a517..e61f4d23165c587cee7c01453964ac56ff9fbfd9 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -81,7 +81,7 @@ bool FileIntersect::processUnsortedFiles() while (!queryFRM->eof()) { - Record *queryRecord = queryFRM->allocateAndGetNextRecord(); + Record *queryRecord = queryFRM->getNextRecord(); if (queryRecord == NULL) { continue; } diff --git a/src/mergeBed/Makefile b/src/mergeBed/Makefile deleted file mode 100644 index 0eaaa645efeab15081c7b84cbb024a4a33826897..0000000000000000000000000000000000000000 --- a/src/mergeBed/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ - -I$(UTILITIES_DIR)/lineFileUtilities/ \ - -I$(UTILITIES_DIR)/gzstream/ \ - -I$(UTILITIES_DIR)/fileType/ \ - -I$(UTILITIES_DIR)/VectorOps/ \ - -I$(UTILITIES_DIR)/version/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= mergeMain.cpp mergeBed.cpp mergeBed.h -OBJECTS= mergeMain.o mergeBed.o -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) - - -all: $(BUILT_OBJECTS) - -.PHONY: all - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -clean: - @echo "Cleaning up." - @rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeBed.o - -.PHONY: clean \ No newline at end of file diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp deleted file mode 100644 index dae9887ba3ee46e8930b995e9be6bf3d2f8e4004..0000000000000000000000000000000000000000 --- a/src/mergeBed/mergeBed.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/***************************************************************************** - mergeBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "lineFileUtilities.h" -#include "mergeBed.h" - - - -void BedMerge::ReportMergedNames(const vector<string> &names) { - if (names.size() > 0) { - printf("\t"); - vector<string>::const_iterator nameItr = names.begin(); - vector<string>::const_iterator nameEnd = names.end(); - for (; nameItr != nameEnd; ++nameItr) { - if (nameItr < (nameEnd - 1)) - cout << *nameItr << _delimiter; - else - cout << *nameItr; - } - } - else { - cerr << endl - << "*****" << endl - << "*****ERROR: " - << "No names found to report for the -names option. Exiting." - << endl - << "*****" << endl; - exit(1); - } -} - - -void BedMerge::ReportMergedScores(const vector<string> &scores) { - - // setup a VectorOps instances for the list of scores. - // VectorOps methods used for each possible operation. - VectorOps vo(scores); - std::stringstream buffer; - if (scores.size() > 0) { - if (_scoreOp == "sum") - buffer << setprecision (PRECISION) << vo.GetSum(); - else if (_scoreOp == "min") - buffer << setprecision (PRECISION) << vo.GetMin(); - else if (_scoreOp == "max") - buffer << setprecision (PRECISION) << vo.GetMax(); - else if (_scoreOp == "mean") - buffer << setprecision (PRECISION) << vo.GetMean(); - else if (_scoreOp == "median") - buffer << setprecision (PRECISION) << vo.GetMedian(); - else if (_scoreOp == "mode") - buffer << setprecision (PRECISION) << vo.GetMode(); - else if (_scoreOp == "antimode") - buffer << setprecision (PRECISION) << vo.GetAntiMode(); - else if (_scoreOp == "collapse") - buffer << setprecision (PRECISION) << vo.GetCollapse(_delimiter); - cout << "\t" << buffer.str(); - } - else { - cerr << endl - << "*****" << endl - << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl - << "*****" << endl; - exit(1); - } -} - -// =============== -// = Constructor = -// =============== -BedMerge::BedMerge(string &bedFile, - bool numEntries, - int maxDistance, - bool forceStrand, - bool reportNames, - bool reportScores, - const string &scoreOp, - const string &delimiter) : - _bedFile(bedFile), - _numEntries(numEntries), - _forceStrand(forceStrand), - _reportNames(reportNames), - _reportScores(reportScores), - _scoreOp(scoreOp), - _maxDistance(maxDistance), - _delimiter(delimiter) -{ - _bed = new BedFile(bedFile); - - if (_forceStrand == false) - MergeBed(); - else - MergeBedStranded(); -} - - -// ================= -// = Destructor = -// ================= -BedMerge::~BedMerge(void) { -} - - -// =============================================== -// Convenience method for reporting merged blocks -// ================================================ -void BedMerge::Report(string chrom, int start, - int end, const vector<string> &names, - const vector<string> &scores, int mergeCount) -{ - // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type - //if (_bed->isZeroBased == false) {start++;} - - printf("%s\t%d\t%d", chrom.c_str(), start, end); - // just the merged intervals - if (_numEntries == false && _reportNames == false && - _reportScores == false) { - printf("\n"); - } - // merged intervals and counts - else if (_numEntries == true && _reportNames == false && - _reportScores == false) { - printf("\t%d\n", mergeCount); - } - // merged intervals, counts, and scores - else if (_numEntries == true && _reportNames == false && - _reportScores == true) { - printf("\t%d", mergeCount); - ReportMergedScores(scores); - printf("\n"); - } - // merged intervals, counts, and names - else if (_numEntries == true && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t%d\n", mergeCount); - } - // merged intervals, counts, names, and scores - else if (_numEntries == true && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%d\n", mergeCount); - } - // merged intervals and names - else if (_numEntries == false && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\n"); - } - // merged intervals and scores - else if (_numEntries == false && _reportNames == false && - _reportScores == true) { - ReportMergedScores(scores); - printf("\n"); - } - // merged intervals, names, and scores - else if (_numEntries == false && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\n"); - } -} - - -// ========================================================= -// Convenience method for reporting merged blocks by strand -// ========================================================= -void BedMerge::ReportStranded(string chrom, int start, - int end, const vector<string> &names, - const vector<string> &scores, int mergeCount, - string strand) -{ - // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type - //if (_bed->isZeroBased == false) {start++;} - - printf("%s\t%d\t%d", chrom.c_str(), start, end); - // just the merged intervals - if (_numEntries == false && _reportNames == false && - _reportScores == false) { - printf("\t\t\t%s\n", strand.c_str()); - } - // merged intervals and counts - else if (_numEntries == true && _reportNames == false && - _reportScores == false) { - printf("\t\t%d\t%s\n", mergeCount, strand.c_str()); - } - // merged intervals, counts, and scores - else if (_numEntries == true && _reportNames == false && - _reportScores == true) { - printf("\t%d", mergeCount); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } - // merged intervals, counts, and names - else if (_numEntries == true && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t%d\t%s", mergeCount, strand.c_str()); - printf("\n"); - } - // merged intervals, counts, names, and scores - else if (_numEntries == true && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%s\t%d", strand.c_str(), mergeCount); - printf("\n"); - } - // merged intervals and names - else if (_numEntries == false && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t.\t%s\n", strand.c_str()); - } - // merged intervals and scores - else if (_numEntries == false && _reportNames == false && - _reportScores == true) { - printf("\t"); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } - // merged intervals, names, and scores - else if (_numEntries == false && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } -} - - -// ===================================================== -// = Merge overlapping BED entries into a single entry = -// ===================================================== -void BedMerge::MergeBed() { - int mergeCount = 1; - vector<string> names; - vector<string> scores; - int start = -1; - int end = -1; - BED prev, curr; - - _bed->Open(); - while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals - if (_bed->_status != BED_VALID) - continue; - // new block, no overlap - if ( (((int) curr.start - end) > _maxDistance) || - (curr.chrom != prev.chrom)) - { - if (start >= 0) { - Report(prev.chrom, start, end, names, scores, mergeCount); - // reset - mergeCount = 1; - names.clear(); - scores.clear(); - } - start = curr.start; - end = curr.end; - if (!curr.name.empty()) - names.push_back(curr.name); - if (!curr.score.empty()) - scores.push_back(curr.score); - } - // same block, overlaps - else { - if ((int) curr.end > end) - end = curr.end; - if (!curr.name.empty()) - names.push_back(curr.name); - if (!curr.score.empty()) - scores.push_back(curr.score); - mergeCount++; - } - prev = curr; - } - if (start >= 0) { - Report(prev.chrom, start, end, names, scores, mergeCount); - } -} - - -// =============================================================================== -// = Merge overlapping BED entries into a single entry, accounting for strandedness = -// ================================================================================ -void BedMerge::MergeBedStranded() { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - // loop through each chromosome and merge their BED entries - masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); - masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); - for (; m != mEnd; ++m) { - - // bedList is already sorted by start position. - string chrom = m->first; - - // make a list of the two strands to merge separately. - vector<string> strands(2); - strands[0] = "+"; - strands[1] = "-"; - // do two passes, one for each strand. - for (unsigned int s = 0; s < strands.size(); s++) { - int mergeCount = 1; - int numOnStrand = 0; - vector<string> names; - vector<string> scores; - - // merge overlapping features for this chromosome. - int start = -1; - int end = -1; - vector<BED>::const_iterator bedItr = m->second.begin(); - vector<BED>::const_iterator bedEnd = m->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // if forcing strandedness, move on if the hit - // is not on the current strand. - if (bedItr->strand != strands[s]) { continue; } - else { numOnStrand++; } - if ( (((int) bedItr->start - end) > _maxDistance) || - (end < 0)) - { - if (start >= 0) { - ReportStranded(chrom, start, end, names, - scores, mergeCount, strands[s]); - // reset - mergeCount = 1; - names.clear(); - scores.clear(); - } - start = bedItr->start; - end = bedItr->end; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); - } - else { - if ((int) bedItr-> end > end) end = bedItr->end; - mergeCount++; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); - } - } - if (start >= 0) { - ReportStranded(chrom, start, end, names, - scores, mergeCount, strands[s]); - } - } - } -} diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h deleted file mode 100644 index d9b0c1438cb33aae8db1a40ad0ca92635e0eee20..0000000000000000000000000000000000000000 --- a/src/mergeBed/mergeBed.h +++ /dev/null @@ -1,72 +0,0 @@ -/***************************************************************************** - mergeBed.h - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "bedFile.h" -#include <vector> -#include <algorithm> -#include <numeric> -#include <iostream> -#include <iomanip> -#include <fstream> -#include <limits.h> -#include <stdlib.h> -#include "VectorOps.h" - -using namespace std; - -const int PRECISION = 21; - -//************************************************ -// Class methods and elements -//************************************************ -class BedMerge { - -public: - - // constructor - BedMerge(string &bedFile, bool numEntries, - int maxDistance, bool forceStrand, - bool reportNames, bool reportScores, - const string &scoreOp, const string &delimiter); - - // destructor - ~BedMerge(void); - - void MergeBed(); - void MergeBedStranded(); - -private: - - string _bedFile; - bool _numEntries; - bool _forceStrand; - bool _reportNames; - bool _reportScores; - string _scoreOp; - int _maxDistance; - string _delimiter; - // instance of a bed file class. - BedFile *_bed; - - void Report(string chrom, int start, int end, - const vector<string> &names, - const vector<string> &scores, - int mergeCount); - - void ReportStranded(string chrom, int start, int end, - const vector<string> &names, - const vector<string> &scores, - int mergeCount, - string strand); - void ReportMergedNames(const vector<string> &names); - void ReportMergedScores(const vector<string> &scores); - -}; diff --git a/src/mergeFile/Makefile b/src/mergeFile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..50f4004250b34cb4c1ab59481edc907a2f1d27ed --- /dev/null +++ b/src/mergeFile/Makefile @@ -0,0 +1,46 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ + -I$(UTILITIES_DIR)/RecordOutputMgr/ \ + -I$(UTILITIES_DIR)/NewChromsweep \ + -I$(UTILITIES_DIR)/BinTree \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= mergeMain.cpp mergeFile.cpp mergeFile.h +OBJECTS= mergeMain.o mergeFile.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeFile.o + +.PHONY: clean \ No newline at end of file diff --git a/src/mergeFile/mergeFile.cpp b/src/mergeFile/mergeFile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f772629c440db0fae1ab9e7535491fc6d2edf4eb --- /dev/null +++ b/src/mergeFile/mergeFile.cpp @@ -0,0 +1,39 @@ +/***************************************************************************** + mergeFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "mergeFile.h" + + +MergeFile::MergeFile(ContextMerge *context) +: _context(context), + _recordOutputMgr(NULL) +{ + _recordOutputMgr = new RecordOutputMgr(); + _recordOutputMgr->init(_context); +} + +MergeFile::~MergeFile() +{ + delete _recordOutputMgr; + _recordOutputMgr = NULL; +} + +bool MergeFile::merge() +{ + RecordKeyList hitSet; + FileRecordMgr *frm = _context->getFile(0); + while (!frm->eof()) { + Record *key = frm->getNextRecord(&hitSet); + if (key == NULL) continue; + _recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet)); + } + return true; +} diff --git a/src/mergeFile/mergeFile.h b/src/mergeFile/mergeFile.h new file mode 100644 index 0000000000000000000000000000000000000000..0d876c67ef260a0935efc0bd00b4d194a13b1b22 --- /dev/null +++ b/src/mergeFile/mergeFile.h @@ -0,0 +1,37 @@ +/***************************************************************************** + mergeFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ + +#ifndef MERGE_FILE_H_ +#define MERGE_FILE_H_ + +//************************************************ +// Class methods and elements +//************************************************ + +#include "ContextMerge.h" +#include "RecordOutputMgr.h" + +class MergeFile { + +public: + MergeFile(ContextMerge *context); + ~MergeFile(); + + bool merge(); + +private: + ContextMerge *_context; + RecordOutputMgr *_recordOutputMgr; + +}; + +#endif diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeFile/mergeMain.cpp similarity index 52% rename from src/mergeBed/mergeMain.cpp rename to src/mergeFile/mergeMain.cpp index 28b869af4cb54bbe0ec9eeaf7251d23ab985564d..3f443e27ba21b39b9ed28a4f91f3121017abf1ff 100644 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeFile/mergeMain.cpp @@ -9,7 +9,7 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ -#include "mergeBed.h" +#include "mergeFile.h" #include "version.h" using namespace std; @@ -26,113 +26,21 @@ void merge_help(void); int merge_main(int argc, char* argv[]) { - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - int maxDistance = 0; - string scoreOp = ""; - - // input arguments - bool haveBed = true; - bool numEntries = false; - bool haveMaxDistance = false; - bool forceStrand = false; - bool reportNames = false; - bool reportScores = false; - string delimiter = ","; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; + ContextMerge *context = new ContextMerge(); + if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { + if (!context->getErrorMsg().empty()) { + cerr << context->getErrorMsg() << endl; } - } - - if(showHelp) merge_help(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-n", 2, parameterLength)) { - numEntries = true; - } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - if ((i+1) < argc) { - haveMaxDistance = true; - maxDistance = atoi(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-nms", 4, parameterLength)) { - reportNames = true; - } - else if (PARAMETER_CHECK("-scores", 7, parameterLength)) { - reportScores = true; - if ((i+1) < argc) { - scoreOp = argv[i + 1]; - i++; - } - } - else if (PARAMETER_CHECK("-delim", 6, parameterLength)) { - if ((i+1) < argc) { - delimiter = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; - } - if ((reportScores == true) && (scoreOp != "sum") - && (scoreOp != "max") && (scoreOp != "min") - && (scoreOp != "mean") && (scoreOp != "mode") - && (scoreOp != "median") && (scoreOp != "antimode") - && (scoreOp != "collapse")) - { - cerr << endl - << "*****" - << endl - << "*****ERROR: Invalid scoreOp selection \"" - << scoreOp - << endl - << "\" *****" - << endl; - showHelp = true; - } - - if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, - maxDistance, forceStrand, - reportNames, reportScores, - scoreOp, delimiter); - delete bm; - } - else { merge_help(); + delete context; + return 0; } - return 0; + MergeFile *mergeFile = new MergeFile(context); + + bool retVal = mergeFile->merge(); + delete mergeFile; + delete context; + return retVal ? 0 : 1; } void merge_help(void) { diff --git a/src/nekSandbox1/nekSandboxMain.cpp b/src/nekSandbox1/nekSandboxMain.cpp index ac3fb3196b49d602e98a2e79ae369ef907573603..6e895b2ec4aad16dd81efba9e7be970c0a636f45 100644 --- a/src/nekSandbox1/nekSandboxMain.cpp +++ b/src/nekSandbox1/nekSandboxMain.cpp @@ -6,7 +6,7 @@ using namespace std; #include <cstdio> #include "RecordKeyList.h" #include "NewChromsweep.h" -#include "DualQueue.h" +//#include "DualQueue.h" #include "ParseTools.h" #include <sstream> #include <iomanip> @@ -145,7 +145,7 @@ int nek_sandbox1_main(int argc,char** argv) // bool headerFound = false; // QuickString outbuf; // while (!frm.eof()) { -// Record *record = frm.allocateAndGetNextRecord(); +// Record *record = frm.getNextRecord(); // if (!headerFound && frm.hasHeader()) { // cout << frm.getHeader() << endl; // headerFound = true; @@ -287,7 +287,7 @@ void testDualQueue(Context *context) { printf("Original record order is:\n"); while (!frm.eof()) { - Record *record = frm.allocateAndGetNextRecord(); + Record *record = frm.getNextRecord(); if (record == NULL) { continue; } diff --git a/src/sampleFile/SampleFile.cpp b/src/sampleFile/SampleFile.cpp index 8291824cef1cb5a67d7827c9bf0e867b7df4f48f..096327e2012d111f2fcfab398a08e7b75855073b 100644 --- a/src/sampleFile/SampleFile.cpp +++ b/src/sampleFile/SampleFile.cpp @@ -51,7 +51,7 @@ bool SampleFile::takeSample() while (!_inputFile->eof()) { - Record *record = _inputFile->allocateAndGetNextRecord(); + Record *record = _inputFile->getNextRecord(); if (record == NULL) { continue; } diff --git a/src/utils/BinTree/BinTree.cpp b/src/utils/BinTree/BinTree.cpp index 54696f6928e7667683d993c658f7771d8e88ff2f..f5cddd83fa5feef95ac47780770ddb6751266040 100644 --- a/src/utils/BinTree/BinTree.cpp +++ b/src/utils/BinTree/BinTree.cpp @@ -74,7 +74,7 @@ void BinTree::loadDB() Record *record = NULL; while (!_databaseFile->eof()) { - record = _databaseFile->allocateAndGetNextRecord(); + record = _databaseFile->getNextRecord(); //In addition to NULL records, we also don't want to add unmapped reads. if (record == NULL || record->isUnmapped()) { continue; diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 16d2402c080b16d498fe3f767e9423ac7e61beda..c76d29b45978d8fdc3e3d506cb33eedade6a72cc 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -13,7 +13,6 @@ ContextBase::ContextBase() : _program(UNSPECIFIED_PROGRAM), _allFilesOpened(false), - _useMergedIntervals(false), _genomeFile(NULL), _outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), _outputTypeDetermined(false), @@ -45,7 +44,6 @@ ContextBase::ContextBase() _maxNumDatabaseFields(0), _useFullBamTags(false), _reportCount(false), - _maxDistance(0), _reportNames(false), _reportScores(false), _numOutputRecords(0), @@ -53,11 +51,16 @@ ContextBase::ContextBase() _seed(0), _forwardOnly(false), _reverseOnly(false), - _hasColumnOpsMethods(false) + _hasColumnOpsMethods(false), + _desiredStrand(FileRecordMergeMgr::ANY_STRAND), + _maxDistance(0), + _useMergedIntervals(false) + { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; _programNames["map"] = MAP; + _programNames["merge"] = MERGE; if (hasColumnOpsMethods()) { _keyListOps = new KeyListOps(); @@ -233,11 +236,12 @@ bool ContextBase::openFiles() { _files.resize(_fileNames.size()); for (int i = 0; i < (int)_fileNames.size(); i++) { - FileRecordMgr *frm = new FileRecordMgr(_fileNames[i], _sortedInput); + FileRecordMgr *frm = getNewFRM(_fileNames[i]); if (hasGenomeFile()) { frm->setGenomeFile(_genomeFile); } frm->setFullBamFlags(_useFullBamTags); + frm->setIsSorted(_sortedInput); if (!frm->open()) { return false; } @@ -391,8 +395,9 @@ bool ContextBase::handle_c() markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); + return true; } - return true; + return false; } @@ -412,7 +417,7 @@ bool ContextBase::handle_o() } -// for col ops, -null is a NULL vakue assigned +// for col ops, -null is a NULL value assigned // when no overlaps are detected. bool ContextBase::handle_null() { @@ -424,8 +429,9 @@ bool ContextBase::handle_null() markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); + return true; } - return true; + return false; } //for col ops, delimStr will appear between each item in @@ -459,3 +465,13 @@ const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { return _keyListOps->getOpVals(keyList); } +FileRecordMgr *ContextBase::getNewFRM(const QuickString &filename) { + if (!_useMergedIntervals) { + return new FileRecordMgr(filename); + } else { + FileRecordMergeMgr *frm = new FileRecordMergeMgr(filename); + frm->setStrandType(_desiredStrand); + frm->setMaxDistance(_maxDistance); + return frm; + } +} diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index b4bf12279f71db6a934a4e42a8970771a4a528e8..2c194a68497571188dd5962fa179526f7fd44b22 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -20,7 +20,7 @@ #include "version.h" #include "BedtoolsTypes.h" #include "FileRecordTypeChecker.h" -#include "FileRecordMgr.h" +#include "FileRecordMergeMgr.h" #include "NewGenomeFile.h" #include "api/BamReader.h" #include "api/BamAux.h" @@ -59,6 +59,7 @@ public: bool getUseMergedIntervals() const { return _useMergedIntervals; } void setUseMergedIntervals(bool val) { _useMergedIntervals = val; } + FileRecordMergeMgr::WANTED_STRAND_TYPE getDesiredStrand() const { return _desiredStrand; } void openGenomeFile(const QuickString &genomeFilename); void openGenomeFile(const BamTools::RefVector &refVector); @@ -106,23 +107,23 @@ public: virtual bool getUseFullBamTags() const { return _useFullBamTags; } virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; } - // - // MERGE METHODS - // - virtual bool getReportCount() const { return _reportCount; } - virtual void setReportCount(bool val) { _reportCount = val; } - - virtual int getMaxDistance() const { return _maxDistance; } - virtual void setMaxDistance(int distance) { _maxDistance = distance; } - - virtual bool getReportNames() const { return _reportNames; } - virtual void setReportNames(bool val) { _reportNames = val; } - - virtual bool getReportScores() const { return _reportScores; } - virtual void setReportScores(bool val) { _reportScores = val; } - - virtual const QuickString &getScoreOp() const { return _scoreOp; } - virtual void setScoreOp(const QuickString &op) { _scoreOp = op; } +// // +// // MERGE METHODS +// // +// virtual bool getReportCount() const { return _reportCount; } +// virtual void setReportCount(bool val) { _reportCount = val; } +// +// virtual int getMaxDistance() const { return _maxDistance; } +// virtual void setMaxDistance(int distance) { _maxDistance = distance; } +// +// virtual bool getReportNames() const { return _reportNames; } +// virtual void setReportNames(bool val) { _reportNames = val; } +// +// virtual bool getReportScores() const { return _reportScores; } +// virtual void setReportScores(bool val) { _reportScores = val; } +// +// virtual const QuickString &getScoreOp() const { return _scoreOp; } +// virtual void setScoreOp(const QuickString &op) { _scoreOp = op; } // METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER @@ -160,7 +161,6 @@ protected: bool _allFilesOpened; map<QuickString, PROGRAM_TYPE> _programNames; - bool _useMergedIntervals; NewGenomeFile *_genomeFile; ContextFileType _outputFileType; @@ -200,7 +200,6 @@ protected: int _maxNumDatabaseFields; bool _useFullBamTags; bool _reportCount; - int _maxDistance; bool _reportNames; bool _reportScores; QuickString _scoreOp; @@ -212,14 +211,22 @@ protected: bool _forwardOnly; bool _reverseOnly; + //Members for column operations bool _hasColumnOpsMethods; KeyListOps *_keyListOps; QuickString _nullStr; //placeholder return value when col ops aren't valid. + //Members for merged records + FileRecordMergeMgr::WANTED_STRAND_TYPE _desiredStrand; + int _maxDistance; + bool _useMergedIntervals; + + void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); bool openFiles(); + virtual FileRecordMgr *getNewFRM(const QuickString &filename); //set cmd line params and counter, i, as members so code //is more readable (as opposed to passing all 3 everywhere). diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp index e3f82417cda79b1ab81e89d72847613b6c151c53..e6abc519501a1e5320c5aad4103bd04fc083096a 100644 --- a/src/utils/Contexts/ContextMap.cpp +++ b/src/utils/Contexts/ContextMap.cpp @@ -47,13 +47,3 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { } return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); } -// -// -//bool ContextMap::isValidState() -//{ -// if (!ContextIntersect::isValidState()) { -// return false; -// } -//} -// -// diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index 9b7280e5425991a3f47588c7c3c78cf9b9fd8745..b5bf5959cbfea429ee30c4dcd41ed2f2e4e005e4 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -15,12 +15,8 @@ class ContextMap : public ContextIntersect { public: ContextMap(); virtual ~ContextMap(); -// virtual bool isValidState(); -// virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); -// virtual bool hasIntersectMethods() const { return true; } -// private: diff --git a/src/utils/Contexts/ContextMerge.cpp b/src/utils/Contexts/ContextMerge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d6c58aac3cb051a8eb485c89be8bb91a033f96b --- /dev/null +++ b/src/utils/Contexts/ContextMerge.cpp @@ -0,0 +1,164 @@ +/* + * ContextMerge.cpp + * + * Created on: Mar 26, 2014 + * Author: nek3d + */ + + +#include "ContextMerge.h" + +ContextMerge::ContextMerge() +{ + setUseMergedIntervals(true); + setColumnOpsMethods(true); + + //merge has no default columnOps the way map does, so we'll need to clear those. + _keyListOps->setColumns(""); + _keyListOps->setOperations(""); + +} + +ContextMerge::~ContextMerge() +{ + +} + + +bool ContextMerge::parseCmdArgs(int argc, char **argv, int skipFirstArgs) +{ + _argc = argc; + _argv = argv; + _skipFirstArgs = skipFirstArgs; + if (_argc < 2) { + setShowHelp(true); + return false; + } + + setProgram(_programNames[argv[0]]); + + _argsProcessed.resize(_argc - _skipFirstArgs, false); + + for (_i=_skipFirstArgs; _i < argc; _i++) { + if (isUsed(_i - _skipFirstArgs)) { + continue; + } + else if (strcmp(_argv[_i], "-n") == 0) { + if (!handle_n()) return false; + } + else if (strcmp(_argv[_i], "-nms") == 0) { + if (!handle_nms()) return false; + } + else if (strcmp(_argv[_i], "-scores") == 0) { + if (!handle_scores()) return false; + } + else if (strcmp(_argv[_i], "-delim") == 0) { + if (!handle_delim()) return false; + } + else if (strcmp(_argv[_i], "-d") == 0) { + if (!handle_d()) return false; + } + else if (strcmp(_argv[_i], "-s") == 0) { + if (!handle_s()) return false; + } + else if (strcmp(_argv[_i], "-S") == 0) { + if (!handle_S()) return false; + } + } + return ContextBase::parseCmdArgs(argc, argv, _skipFirstArgs); +} + +bool ContextMerge::isValidState() +{ + // Special: The merge program does not have default + //column operations, so if none were entered, disable column ops. + if (_keyListOps->getColumns().empty() && _keyListOps->getOperations().empty()) { + setColumnOpsMethods(false); + delete _keyListOps; + _keyListOps = NULL; + } + if (!ContextBase::isValidState()) { + return false; + } + if (_files.size() != 1) { + _errorMsg = "\n***** ERROR: input file not specified. *****"; + // Allow one and only input file for now + return false; + } + return true; +} + + +bool ContextMerge::handle_d() { + if ((_i+1) < _argc) { + if (isNumeric(_argv[_i+1])) { + int dist = str2chrPos(_argv[_i+1]); + if (dist >=0 ) { + _maxDistance = dist; + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + return true; + } + } + } + _errorMsg = "\n***** ERROR: -d option must be followed by an integer value *****"; + return false; +} + +bool ContextMerge::handle_n() +{ + //This is the same as telling map "-c any -o count" + _keyListOps->addColumns("1"); //doesn't really matter which column, but the default column + //for keyListOps is score, which not every record necessarily has. + _keyListOps->addOperations("count"); + markUsed(_i - _skipFirstArgs); + return true; +} + +bool ContextMerge::handle_nms() +{ + //This is the same as telling map "-c 4 -o collapse" + _keyListOps->addColumns("4"); + _keyListOps->addOperations("collapse"); + markUsed(_i - _skipFirstArgs); + return true; +} + + +bool ContextMerge::handle_scores() +{ + if ((_i+1) < _argc) { + _keyListOps->addColumns("5"); + _keyListOps->addOperations(_argv[_i+1]); + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + return true; + } + _errorMsg = "\n***** ERROR: -scores option given, but no operation specified. *****"; + + return false; +} + +bool ContextMerge::handle_s() { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_EITHER; + markUsed(_i - _skipFirstArgs); + return true; +} + +bool ContextMerge::handle_S() { + if ((_i+1) < _argc) { + if (_argv[_i+1][0] == '+') { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_FORWARD; + } else if (_argv[_i+1][0] == '-') { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_REVERSE; + } + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + return true; + } + _errorMsg = "\n***** ERROR: -S option must be followed by + or -. *****"; + return false; +} diff --git a/src/utils/Contexts/ContextMerge.h b/src/utils/Contexts/ContextMerge.h new file mode 100644 index 0000000000000000000000000000000000000000..f2690833b79156fe05dfeb0c95df05d60adb9cc1 --- /dev/null +++ b/src/utils/Contexts/ContextMerge.h @@ -0,0 +1,31 @@ +/* + * ContextMerge.h + * + * Created on: Mar 26, 2014 + * Author: nek3d + */ + +#ifndef CONTEXTMERGE_H_ +#define CONTEXTMERGE_H_ + +#include "ContextBase.h" +#include "FileRecordMergeMgr.h" + +class ContextMerge: public ContextBase { +public: + ContextMerge(); + ~ContextMerge(); + virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); + virtual bool isValidState(); + +protected: + bool handle_d(); + bool handle_n(); + bool handle_nms(); + bool handle_scores(); + bool handle_s(); + bool handle_S(); +}; + + +#endif /* CONTEXTMERGE_H_ */ diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index 4b2ed4291d7bec5341c11eb7e82602dd7882de9e..1856b82b9184201244d6df72e833a5173d31e3af 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -19,8 +19,9 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp ContextMap.h ContextSample.cpp ContextSample.h -OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o +SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp \ + ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp +OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o _EXT_OBJECTS=ParseTools.o QuickString.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -38,6 +39,7 @@ clean: @rm -f $(OBJ_DIR)/ContextBase.o \ $(OBJ_DIR)/ContextIntersect.o \ $(OBJ_DIR)/ContextMap.o \ - $(OBJ_DIR)/ContextSample.o + $(OBJ_DIR)/ContextSample.o \ + $(OBJ_DIR)/ContextMerge.o \ .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.cpp b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c82206b3c34e91e7b301857a195a94bbee208e4d --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp @@ -0,0 +1,204 @@ +/* + * FileRecordMergeMgr.cpp + * + * Created on: Mar 19, 2014 + * Author: nek3d + */ + + +#include "FileRecordMergeMgr.h" + +FileRecordMergeMgr::FileRecordMergeMgr(const QuickString & filename) +: FileRecordMgr(filename), + _desiredStrand(ANY_STRAND), + _maxDistance(0) +{ +} + +//Record *FileRecordMergeMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { +// RecordKeyList recList; +// if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { +// return NULL; +// } +// deleteAllMergedItemsButKey(recList); +// return const_cast<Record *>(recList.getKey()); //want key to be non-const +//} + +Record *FileRecordMergeMgr::getNextRecord(RecordKeyList *recList) +{ + if (!recList->allClear()) { + deleteMergedRecord(*recList); + } + + _mustBeForward = _desiredStrand == SAME_STRAND_FORWARD; + _mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE; + + Record *startRecord = tryToTakeFromStorage(); + + // if we couldn't use a previously stored record for starters, + //then begin with a new one that matches strand criteria. + while (startRecord == NULL) { + startRecord = FileRecordMgr::getNextRecord(); + if (startRecord == NULL) { //hit EOF!! + return NULL; + } + + if ((_mustBeForward && (startRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (startRecord->getStrandVal() != Record::REVERSE))) { + //record is reverse, only want forward, OR record is forward, wanted reverse + deleteRecord(startRecord); + startRecord = NULL; + } + if (startRecord->getStrandVal() == Record::UNKNOWN && _desiredStrand != ANY_STRAND) { + //there is an unknown strand, but the user specified strandedness. + deleteRecord(startRecord); + startRecord = NULL; + } + } + + // OK!! We have a start record! Re-evaluate strand requirements for next recored. + + _mustBeForward = _desiredStrand == SAME_STRAND_FORWARD || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::FORWARD)); + _mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::REVERSE)); + bool mustKeepOpposite = (_desiredStrand == SAME_STRAND_EITHER); + + const QuickString &currChrom = startRecord->getChrName(); + _foundChroms.insert(currChrom); + + bool madeComposite = false; + recList->push_back(startRecord); + recList->setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. + + Record::strandType currStrand = startRecord->getStrandVal(); + bool mustMatchStrand = _desiredStrand != ANY_STRAND; + + int currEnd = startRecord->getEndPos(); + //now look for more records to merge with this one. + //stop when they're out of range, not on the same chromosome, or we hit EOF. + //ignore if they don't comply with strand. + Record *nextRecord = NULL; + while (nextRecord == NULL) { + bool takenFromStorage = false; + nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); + if (nextRecord == NULL) { + nextRecord = FileRecordMgr::getNextRecord(); + } else { + takenFromStorage = true; + } + if (nextRecord == NULL) { // EOF hit + break; + } + //delete any record from file with an unknown strand if we are doing stranded merge, but first check + //that it's chrom was the same and it's not out of range. If either is true, stop scanning. + bool mustDelete = (mustMatchStrand && nextRecord->getStrandVal() == Record::UNKNOWN); + + //check that we are still on the same chromosome. + const QuickString &newChrom = nextRecord->getChrName(); + if (newChrom != currChrom) { //hit a different chromosome. + if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { + //haven't seen this chromosome before, sort order is already enforced in the base class method. + if (!mustDelete) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + break; + } + } + + //check whether it's in range + int nextStart = nextRecord->getStartPos(); + if (nextStart > currEnd + _maxDistance) { + //no, it's out of range. + if (!mustDelete) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + break; + } + + // NOW, going back, we can delete any unknown strand records. But don't stop scanning. + if (mustDelete) { + deleteRecord(nextRecord); + nextRecord = NULL; + continue; + } + //if taken from file, and wrong strand, store or delete. + if (!takenFromStorage && ((_mustBeForward && (nextRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (nextRecord->getStrandVal() != Record::REVERSE)))) { + if (mustKeepOpposite) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + continue; //get the next record + } + //ok, they're on the same chrom and in range, and the strand is good. Do a merge. + recList->push_back(nextRecord); + madeComposite = true; + int nextEnd = nextRecord->getEndPos(); + if (nextEnd > currEnd) { + currEnd = nextEnd; + } + nextRecord = NULL; + } + if (madeComposite) { + Record *newKey = _recordMgr->allocateRecord(); + (*newKey) = (*startRecord); + newKey->setEndPos(currEnd); + recList->setKey(newKey); + } + _totalMergedRecordLength += (unsigned long)(recList->getKey()->getEndPos() - recList->getKey()->getStartPos()); + return const_cast<Record *>(recList->getKey()); +} + +void FileRecordMergeMgr::addToStorage(Record *record) { + //if the strand requirements are strict, and the record doesn't match, + //store in the "round file". + + if ((_desiredStrand == SAME_STRAND_FORWARD && record->getStrandVal() != Record::FORWARD) || + (_desiredStrand == SAME_STRAND_REVERSE && record->getStrandVal() != Record::REVERSE) || + (_desiredStrand != ANY_STRAND && record->getStrandVal() == Record::UNKNOWN)) { + deleteRecord(record); + return; + } + _storedRecords.push(record); +} + +Record *FileRecordMergeMgr::tryToTakeFromStorage() { + Record *record = _storedRecords.top(); + if (record != NULL) { + _storedRecords.pop(); + } + return record; +} + +Record *FileRecordMergeMgr::tryToTakeFromStorage(Record::strandType strand) { + Record *record = _storedRecords.top(strand); + if (record != NULL) { + _storedRecords.pop(strand); + } + return record; +} + +void FileRecordMergeMgr::deleteMergedRecord(RecordKeyList &recList) +{ + deleteAllMergedItemsButKey(recList); + deleteRecord(recList.getKey()); + recList.setKey(NULL); +} + +void FileRecordMergeMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { + //if the key is also in the list, this method won't delete it. + for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + if (iter->value() == recList.getKey()) { + continue; + } + deleteRecord(iter->value()); + } + recList.clearList(); +} + + diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.h b/src/utils/FileRecordTools/FileRecordMergeMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..5bbfb529b45ba2830220356b74ff712d350644cd --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.h @@ -0,0 +1,55 @@ +/* + * FileRecordMergeMgr.h + * + * Created on: Mar 19, 2014 + * Author: nek3d + */ + +#ifndef FILERECORDMERGEMGR_H_ +#define FILERECORDMERGEMGR_H_ + +#include "FileRecordMgr.h" +#include "StrandQueue.h" + +class FileRecordMergeMgr : public FileRecordMgr { + +public: + FileRecordMergeMgr(const QuickString & filename); + + ////////////////////////////////////////////////////////////////////////////////// + // + // MERGED RECORDS + // + // This will give a single "meta" record containing "flattened" or merged records. + // + // Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, + // second is list of constituent Records merged. + // + /////////////////////////////////////////////////////////////////////////////////// + + Record *getNextRecord(RecordKeyList *keyList = NULL); + void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! + + typedef enum { SAME_STRAND_FORWARD, //must all be forward strand + SAME_STRAND_REVERSE, //must all be reverse strand + SAME_STRAND_EITHER, //must be same strand, but can be either forward or reverse + ANY_STRAND } //do no care about strand (Default value) + WANTED_STRAND_TYPE; + + void setStrandType(WANTED_STRAND_TYPE strand) { _desiredStrand = strand; } + void setMaxDistance(int maxDistance) { _maxDistance = maxDistance; } + +private: + + WANTED_STRAND_TYPE _desiredStrand; + int _maxDistance; + StrandQueue _storedRecords; + + void deleteAllMergedItemsButKey(RecordKeyList &recList); + void addToStorage(Record *record); + Record *tryToTakeFromStorage(); + Record *tryToTakeFromStorage(Record::strandType strand); +}; + + +#endif /* FILERECORDMERGEMGR_H_ */ diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 38fc056b483236611748943ebcce4a7228f0a95f..0edf73a687a0a8feb0b77ae150f552b94ce610f9 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -4,7 +4,7 @@ #include "Record.h" #include "NewGenomeFile.h" -FileRecordMgr::FileRecordMgr(const QuickString &filename, bool isSorted) +FileRecordMgr::FileRecordMgr(const QuickString &filename) : _filename(filename), _bufStreamMgr(NULL), @@ -12,7 +12,7 @@ FileRecordMgr::FileRecordMgr(const QuickString &filename, bool isSorted) _fileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), _recordType(FileRecordTypeChecker::UNKNOWN_RECORD_TYPE), _recordMgr(NULL), - _isSortedInput(isSorted), + _isSortedInput(false), _freeListBlockSize(512), _useFullBamTags(false), _prevStart(INT_MAX), @@ -88,7 +88,7 @@ bool FileRecordMgr::eof(){ return _fileReader->eof(); } -Record *FileRecordMgr::allocateAndGetNextRecord() +Record *FileRecordMgr::getNextRecord(RecordKeyList *keyList) { if (!_fileReader->isOpen()) { return NULL; @@ -120,6 +120,9 @@ Record *FileRecordMgr::allocateAndGetNextRecord() } assignChromId(record); _totalRecordLength += (unsigned long)(record->getEndPos() - record->getStartPos()); + if (keyList != NULL) { + keyList->setKey(record); + } return record; } @@ -198,6 +201,10 @@ void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } +void FileRecordMgr::deleteRecord(RecordKeyList *keyList) { + _recordMgr->deleteRecord(keyList->getKey()); +} + void FileRecordMgr::allocateFileReader() { switch (_fileType) { @@ -224,175 +231,3 @@ const BamTools::RefVector & FileRecordMgr::getBamReferences() { } return static_cast<BamFileReader *>(_fileReader)->getReferences(); } - -#ifdef false - -Record *FileRecordMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { - RecordKeyList recList; - if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { - return NULL; - } - deleteAllMergedItemsButKey(recList); - return const_cast<Record *>(recList.getKey()); //want key to be non-const -} - -bool FileRecordMgr::allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand, int maxDistance) -{ - if (!recList.allClear()) { - deleteMergedRecord(recList); - } - - _mustBeForward = desiredStrand == SAME_STRAND_FORWARD; - _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE; - - Record *startRecord = tryToTakeFromStorage(); - - // if we couldn't use a previously stored record for starters, - //then begin with a new one that matches strand criteria. - while (startRecord == NULL) { - startRecord = allocateAndGetNextRecord(); - if (startRecord == NULL) { //hit EOF!! - return false; - } - - if (_mustBeForward && !startRecord->getStrand()) { - //record is reverse, wanted forward. - addToStorage(startRecord); - startRecord = NULL; - } else if (_mustBeReverse && startRecord->getStrand()) { - //record is forward, wanted reverse - addToStorage(startRecord); - startRecord = NULL; - } - } - - // OK!! We have a start record! - - _mustBeForward = desiredStrand == SAME_STRAND_FORWARD || (desiredStrand == SAME_STRAND_EITHER && startRecord->getStrand()); - _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE || (desiredStrand == SAME_STRAND_EITHER && !startRecord->getStrand()); - - const QuickString &currChrom = startRecord->getChrName(); - _foundChroms.insert(currChrom); - - bool madeComposite = false; - recList.push_back(startRecord); - recList.setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. - - bool currStrand = startRecord->getStrand(); - bool mustMatchStrand = desiredStrand != ANY_STRAND; - - int currEnd = startRecord->getEndPos(); - //now look for more records to merge with this one. - //stop when they're out of range, not on the same chromosome, or we hit EOF. - //ignore if they don't comply with strand. - Record *nextRecord = NULL; - while (nextRecord == NULL) { - bool takenFromStorage = false; - nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); - if (nextRecord == NULL) { - nextRecord = allocateAndGetNextRecord(); - } else { - takenFromStorage = true; - } - if (nextRecord == NULL) { // EOF hit - break; - } - const QuickString &newChrom = nextRecord->getChrName(); - if (newChrom != currChrom) { //hit a different chromosome. - if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { - //haven't seen this chromosome before. - addToStorage(nextRecord); - break; - } else { - //different strand, but we've already seen this chrom. File is not sorted. - fprintf(stderr, "ERROR: Input file %s is not sorted by chromosome, startPos.\n", _context->getInputFileName(_contextFileIdx).c_str()); - deleteRecord(nextRecord); - deleteMergedRecord(recList); - exit(1); - } - } - int nextStart = nextRecord->getStartPos(); - //is the record out of range? - if (nextStart > currEnd + maxDistance) { - //yes, it's out of range. - addToStorage(nextRecord); - break; - } - - //ok, they're on the same chrom and in range. Are we happy with the strand? - if (mustMatchStrand && nextRecord->getStrand() != currStrand) { - //no, we're not. - addToStorage(nextRecord); - nextRecord = NULL; - continue; - } - //everything's good! do a merge. - recList.push_back(nextRecord); - madeComposite = true; - int nextEnd = nextRecord->getEndPos(); - if (nextEnd > currEnd) { - currEnd = nextEnd; - } - nextRecord = NULL; - } - if (madeComposite) { - Record *newKey = _recordMgr->allocateRecord(); - (*newKey) = (*startRecord); - newKey->setEndPos(currEnd); - recList.setKey(newKey); - } - _totalMergedRecordLength += (unsigned long)(recList.getKey()->getEndPos() - recList.getKey()->getStartPos()); - return true; -} - -void FileRecordMgr::addToStorage(Record *record) { - _storedRecords.push(record); -} - -Record *FileRecordMgr::tryToTakeFromStorage() { - Record *record = _storedRecords.empty() ? NULL : const_cast<Record *>(_storedRecords.top()); - if (record != NULL) { - _storedRecords.pop(); - } - return record; -} - -Record *FileRecordMgr::tryToTakeFromStorage(bool strand) { - Record *record = NULL; - if(strand) { - if (_storedRecords.emptyForward()) { - return NULL; - } else { - record = const_cast<Record *>(_storedRecords.topForward()); - _storedRecords.popForward(); - return record; - } - } else { - if (_storedRecords.emptyReverse()) { - return NULL; - } else { - record = const_cast<Record *>(_storedRecords.topReverse()); - _storedRecords.popReverse(); - return record; - } - } -} - -void FileRecordMgr::deleteMergedRecord(RecordKeyList &recList) -{ - deleteAllMergedItemsButKey(recList); - deleteRecord(recList.getKey()); - recList.setKey(NULL); -} - -void FileRecordMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { - //if the key is also in the list, this method won't delete it. - for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { - if (iter->value() == recList.getKey()) { - continue; - } - deleteRecord(iter->value()); - } - recList.clearList(); -} -#endif diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index b4e76e92f1e7e1618d67d28dcedba1581862024c..0994a7c7ea78b9cce3a78afb8332ca03aedea22c 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -32,12 +32,20 @@ class NewGenomeFile; class FileRecordMgr { public: - FileRecordMgr(const QuickString & filename, bool isSorted = false); - ~FileRecordMgr(); + FileRecordMgr(const QuickString & filename); + virtual ~FileRecordMgr(); bool open(); void close(); bool eof(); + //This is an all-in-one method to give the user a new record that is initialized with + //the next entry in the data file. + //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! + //Also Note! User must check for NULL returned, meaning we failed to get the next record. + virtual Record *getNextRecord(RecordKeyList *keyList = NULL); + void deleteRecord(const Record *); + virtual void deleteRecord(RecordKeyList *keyList); + const QuickString &getFileName() const { return _filename;} bool hasHeader() const { return _fileReader->hasHeader(); } const QuickString &getHeader() const { return _fileReader->getHeader(); } @@ -69,55 +77,6 @@ public: const BamTools::RefVector &getBamReferences(); int getNumFields() const { return _fileReader->getNumFields(); } - //This is an all-in-one method to give the user a new record that is initialized with - //the next entry in the data file. - //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! - //Also Note! User must check for NULL returned, meaning we failed to get the next record. - Record *allocateAndGetNextRecord(); - void deleteRecord(const Record *); - -#ifdef false - ////////////////////////////////////////////////////////////////////////////////// - // - // MERGED RECORDS - // - //this will give a single "meta" record containing "flattened" or merged records. - // - // 1st ARG: Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, - // second is list of constituent Records merged. - // ** NOTE ** If the RecordKeyList is not empty, this method will empty it for you and delete all contents! - // - // 2nd ARG: Choose from WANT_STRAND_TYPE, defined below below - // - // 3rd ARG: allows for nearby records, i.e. maxDistance 100 will merge records <= 100 bases apart. Default 0 means only - // merge records that actually intersect. - // - // Return value: true if any records found. False if eof hit before records matching requested parameters found. - - typedef enum { SAME_STRAND_FORWARD, //must all be forward strand - SAME_STRAND_REVERSE, //must all be reverse strand - SAME_STRAND_EITHER, //must be same strand, but can be either forward or reverse - ANY_STRAND } //do no care about strand (Default value) - WANT_STRAND_TYPE; - - // - // WARNING!! Specifying a strand will keep all records on the other strand in memory!! - // This is done so that requests for records on that other strand can still be met. - // For now, use this method at any time to purge the kept records from memory, such as - // when changing chromosomes, for example. - void purgeKeepList(); - bool allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); - void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! - - //this method will allocate a new record of merged records, but the returned record should only be passed to the deleteRecord method - //for cleanup, not to the delete mmerged record. - Record *allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); - - // - // END MERGED RECORDS - // - ////////////////////////////////////////////////////////////////////////////////// -#endif //File statistics unsigned long getTotalRecordLength() const { return _totalRecordLength; } //sum of length of all returned records @@ -140,7 +99,9 @@ public: _hasGenomeFile = true; } -private: + void setIsSorted(bool val) { _isSortedInput = val; } + +protected: QuickString _filename; BufferedStreamMgr *_bufStreamMgr; @@ -158,8 +119,6 @@ private: int _prevStart; int _prevChromId; - //members for handling merged records -// DualQueue<Record *, DualQueueAscending > _storedRecords; bool _mustBeForward; bool _mustBeReverse; @@ -177,16 +136,6 @@ private: void testInputSortOrder(Record *record); void assignChromId(Record *); void sortError(const Record *record, bool genomeFileError); - - -#ifdef false - void deleteAllMergedItemsButKey(RecordKeyList &recList); - void addToStorage(Record *record); - Record *tryToTakeFromStorage(); - Record *tryToTakeFromStorage(bool strand); -#endif - - }; diff --git a/src/utils/FileRecordTools/Makefile b/src/utils/FileRecordTools/Makefile index 9d021179772c29991565b157bb9aba7d0063e40f..fce099b8dc482a1db78459adb2c51d946ad47b2a 100644 --- a/src/utils/FileRecordTools/Makefile +++ b/src/utils/FileRecordTools/Makefile @@ -21,8 +21,8 @@ SUBDIRS = ./FileReaders \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= FileRecordMgr.cpp FileRecordMgr.h -OBJECTS= FileRecordMgr.o RecordOutputMgr.o +SOURCES= FileRecordMgr.cpp FileRecordMgr.h FileRecordMergeMgr.cpp FileRecordMergeMgr.h +OBJECTS= FileRecordMgr.o FileRecordMergeMgr.o _EXT_OBJECTS=SingleLineDelimTextFileReader.o BamFileReader.o Bed3Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ SingleLineDelimTransferBuffer.o FileRecordTypeChecker.o QuickString.o ParseTools.o RecordKeyList.o BufferedStreamMgr.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) @@ -31,6 +31,8 @@ BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) $(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS) @echo " * compiling FileRecordMgr.cpp" @$(CXX) -c -o $(OBJ_DIR)/FileRecordMgr.o FileRecordMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + @echo " * compiling FileRecordMergeMgr.cpp" + @$(CXX) -c -o $(OBJ_DIR)/FileRecordMergeMgr.o FileRecordMergeMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) @@ -42,10 +44,8 @@ $(SUBDIRS): $(OBJ_DIR) clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/FileRecordMgr.o - @rm -f $(OBJ_DIR)/RecordMgr.o @rm -f $(OBJ_DIR)/FileRecordTypeChecker.o - @rm -f $(OBJ_DIR)/SingleLineDelimTextFileReader.o - @rm -f $(OBJ_DIR)/SingleLineDelimTransferBuffer.o + @rm -f $(OBJ_DIR)/FileRecordMergeMgr.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile index 6bfd6106ab52fb820c3759b74fa175639d948511..556cc82df90188cbaf17e2e325a178abef9918a0 100644 --- a/src/utils/FileRecordTools/Records/Makefile +++ b/src/utils/FileRecordTools/Records/Makefile @@ -21,9 +21,9 @@ SOURCES= RecordMgr.cpp RecordMgr.h Record.h Record.cpp Bed3Interval.h Bed3Interv Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \ Bed6Interval.h Bed6Interval.cpp \ BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \ - GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp + GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp OBJECTS= RecordMgr.o Record.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ - VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o + VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o StrandQueue.o _EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -40,6 +40,6 @@ clean: @echo "Cleaning up." @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ $(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \ - $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o + $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index d8071c1ed223d7172bd7faa33e6ff6f514975616..8e4376c1a9f6d2eb112c69d9c87a6b3b742d0a59 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -152,6 +152,9 @@ protected: bool _isMateUnmapped; }; - +class RecordPtrSortFunctor { +public: + bool operator()(const Record *rec1, const Record *rec2) const { return *rec1 > *rec2; } +}; #endif /* RECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/StrandQueue.cpp b/src/utils/FileRecordTools/Records/StrandQueue.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aa53313b19465c01dd207fdd083d65bc67378643 --- /dev/null +++ b/src/utils/FileRecordTools/Records/StrandQueue.cpp @@ -0,0 +1,131 @@ +/* + * StrandQueue.cpp + * + * Created on: Mar 31, 2014 + * Author: nek3d + */ + +#include "StrandQueue.h" + +StrandQueue::StrandQueue() { + + for (int i=0; i < NUM_QUEUES; i++) { + queueType *queue = new queueType(); + _queues.push_back(queue); + } + _strandIdxs.resize(3); + _strandIdxs[0] = Record::FORWARD; + _strandIdxs[1] = Record::REVERSE; + _strandIdxs[2] = Record::UNKNOWN; +} + +StrandQueue::~StrandQueue() { + for (int i=0; i < NUM_QUEUES; i++) { + delete _queues[i]; + } +} + +Record *StrandQueue::top() const +{ + int minIdx = getMinIdx(); + if (minIdx == -1) return NULL; + return const_cast<Record *>(_queues[minIdx]->top()); +} + +void StrandQueue::pop() { + int minIdx = getMinIdx(); + if (minIdx == -1) return; + _queues[minIdx]->pop(); +} + +Record * StrandQueue::top(Record::strandType strand) const { + const Record *record = NULL; + switch (strand) { + case Record::FORWARD: + if (_queues[0]->empty()) return NULL; + record = _queues[0]->top(); + break; + case Record::REVERSE: + if (_queues[1]->empty()) return NULL; + + record = _queues[1]->top(); + break; + case Record::UNKNOWN: + if (_queues[0]->empty()) return NULL; + record = _queues[2]->top(); + break; + default: + break; + } + return const_cast<Record *>(record); +} + +void StrandQueue::pop(Record::strandType strand) const { + switch (strand) { + case Record::FORWARD: + if (_queues[0]->empty()) return; + _queues[0]->pop(); + break; + case Record::REVERSE: + if (_queues[1]->empty()) return; + _queues[1]->pop(); + break; + case Record::UNKNOWN: + if (_queues[2]->empty()) return; + _queues[2]->pop(); + break; + default: + break; + } +} + +void StrandQueue::push(Record *record) { + switch (record->getStrandVal()) { + case Record::FORWARD: + _queues[0]->push(record); + break; + case Record::REVERSE: + _queues[1]->push(record); + break; + case Record::UNKNOWN: + _queues[2]->push(record); + break; + default: + break; + } +} + +size_t StrandQueue::size() const { + size_t sumSize = 0; + for (int i = 0; i < NUM_QUEUES; i++) { + sumSize += _queues[i]->size(); + } + return sumSize; +} + +bool StrandQueue::empty() const { + for (int i = 0; i < NUM_QUEUES; i++) { + if (!_queues[i]->empty()) { + return false; + } + } + return true; +} + + +int StrandQueue::getMinIdx() const { + if (empty()) return -1; + const Record *minRec = NULL; + int minIdx = -1; + for (int i = 0; i < NUM_QUEUES; i++) { + if (_queues[i]->empty()) continue; + const Record *currTop = _queues[i]->top(); + if (currTop == NULL) continue; + if (minRec == NULL || *currTop < *minRec) { + minRec = currTop; + minIdx = i; + } + } + return minIdx; +} + diff --git a/src/utils/FileRecordTools/Records/StrandQueue.h b/src/utils/FileRecordTools/Records/StrandQueue.h new file mode 100644 index 0000000000000000000000000000000000000000..e6f0bb49d7eab3160850ba03f13958b867054f37 --- /dev/null +++ b/src/utils/FileRecordTools/Records/StrandQueue.h @@ -0,0 +1,47 @@ +/* + * StrandQueue.h + * + * Created on: Jan 29, 2013 + * Author: nek3d + */ +#ifndef STRANDQUEUE_H_ +#define STRANDQUEUE_H_ + +using namespace std; + +#include <vector> +#include <queue> +#include <cstdio> +#include <cstdlib> +#include "Record.h" + +class StrandQueue { +public: + StrandQueue(); + ~StrandQueue(); + + Record * top() const; + void pop(); + Record * top(Record::strandType strand) const; + void pop(Record::strandType strand) const; + void push(Record *record); + size_t size() const; + bool empty() const; + +private: +// static RecordPtrSortFunctor _recSortFunctor; + typedef priority_queue<Record *, vector<const Record *>, RecordPtrSortFunctor > queueType; + vector<queueType *> _queues; + static const int NUM_QUEUES = 3; + + //we want to be able to iterate over the enumerated strand types in Record.h, + //which are FORWARD, REVERSE, and UNKNOWN. However, iterating over an enum is hard to + //do, so we'll use a suggestion found in a forum, and put the enum values into a vector. + vector<Record::strandType> _strandIdxs; + + int getMinIdx() const; //will return the idx of queue with the current min val. + +}; + + +#endif // STRANDQUEUE_H_ diff --git a/src/utils/FileRecordTools/Records/recordsTar.tar.gz b/src/utils/FileRecordTools/Records/recordsTar.tar.gz deleted file mode 100644 index 321a88e4935d73f7034cce0611f9191d27bfeaa2..0000000000000000000000000000000000000000 Binary files a/src/utils/FileRecordTools/Records/recordsTar.tar.gz and /dev/null differ diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h index 3c26d2c20b5ac49d4c608fc6ae7dd98653dfb932..5046ec1883ae92e00183dc232da8e67ca8cc164b 100644 --- a/src/utils/KeyListOps/KeyListOps.h +++ b/src/utils/KeyListOps/KeyListOps.h @@ -18,10 +18,24 @@ public: KeyListOps(); void setColumns(const QuickString &columns) { _columns = columns; } + void addColumns(const QuickString &newCols) { + if (!_columns.empty()) _columns += ","; + _columns += newCols; + } void setOperations(const QuickString & operation) { _operations = operation; } + void addOperations(const QuickString &newOps) { + if (!_operations.empty()) _operations += ","; + _operations += newOps; + } + void setNullValue(const QuickString & nullValue) { _methods.setNullValue(nullValue); } void setDelimStr(const QuickString & delimStr) { _methods.setDelimStr(delimStr); } + const QuickString &getColumns() { return _columns; } + const QuickString &getOperations() { return _operations; } + const QuickString &getNullValue() { return _methods.getNullValue(); } + const QuickString &getDelimStr() { return _methods.getDelimStr(); } + void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); } typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp index 5e6045be625799cca515ac056e9a17b33f1908a6..c04fefb19b1cdfcb99f3cf03486074d4dd21c1aa 100644 --- a/src/utils/NewChromsweep/NewChromsweep.cpp +++ b/src/utils/NewChromsweep/NewChromsweep.cpp @@ -185,7 +185,7 @@ bool NewChromSweep::next(RecordKeyList &next) { void NewChromSweep::nextRecord(bool query) { if (query) { // if (!_context->getUseMergedIntervals()) { - _currQueryRec = _queryFRM->allocateAndGetNextRecord(); + _currQueryRec = _queryFRM->getNextRecord(); // } else { // _currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); // } @@ -194,7 +194,7 @@ void NewChromSweep::nextRecord(bool query) { } } else { //database // if (!_context->getUseMergedIntervals()) { - _currDatabaseRec = _databaseFRM->allocateAndGetNextRecord(); + _currDatabaseRec = _databaseFRM->getNextRecord(); // } else { // _currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); // } diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp index 55cb70ec9aa74af58b33e5faf32d18b19c0b0bae..9ab8b52b060163e1cc47753a4e2f446f1c36f531 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp @@ -76,9 +76,15 @@ bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { if (bamCode == BAM_AS_BAM) { return true; } else if (bamCode == NOT_BAM) { - keyList.getKey()->print(_outBuf); + if (_context->getProgram() == ContextBase::MERGE) { + //when printing merged records, we want to force the printing into + //bed3 format, which is surprisingly difficult to do. Had to use the following: + const Bed3Interval *bed3 = static_cast<const Bed3Interval *>(keyList.getKey()); + bed3->Bed3Interval::print(_outBuf); + } else { + keyList.getKey()->print(_outBuf); + } return false; - } //otherwise, it was BAM_AS_BED, and the key was printed. return false; @@ -114,6 +120,7 @@ void RecordOutputMgr::printRecord(const Record *record) void RecordOutputMgr::printRecord(const Record *record, const QuickString & value) { + _afterVal = value; printRecord(record); _outBuf.append(value); newline(); @@ -206,6 +213,17 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } _currBamBlockList = NULL; return; + } else if (_context->getProgram() == ContextBase::MERGE) { + if (!printKeyAndTerminate(keyList)) { + if (_context->getDesiredStrand() != FileRecordMergeMgr::ANY_STRAND) { + //add the sign of the record + tab(); + _outBuf.append(keyList.getKey()->getStrand()); + } + if (!_afterVal.empty()) tab(); + } + _currBamBlockList = NULL; + return; } } diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.h b/src/utils/RecordOutputMgr/RecordOutputMgr.h index b6296fdadc2d166d4c4d00b0b1f12fe306d46e70..07891cc2cfdcc3bf3d5de19349939953ae31c523 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.h +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.h @@ -44,6 +44,7 @@ private: // BlockMgr *_bamBlockMgr; const BlockMgr *_splitInfo; + QuickString _afterVal; //to store values to be printed after record, such as column operations. //some helper functions to neaten the code. void tab() { _outBuf.append('\t'); } void newline() { _outBuf.append('\n'); } diff --git a/src/utils/general/DualQueue.h b/src/utils/general/DualQueue.h deleted file mode 100644 index 1cee41581709e742702aac75fb5c0d69430205c9..0000000000000000000000000000000000000000 --- a/src/utils/general/DualQueue.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * DualQueue.h - * - * Created on: Jan 29, 2013 - * Author: nek3d - */ -#ifdef false -#ifndef DUALQUEUE_H_ -#define DUALQUEUE_H_ - -using namespace std; - -#include <vector> -#include <queue> -#include <cstdio> -#include <cstdlib> - -template <class T> class DualQueueAscending { -public: - bool operator() ( const T &item1, const T &item2) const { - - printf("\n\nIn comparison method:\n item1=\n"); -// item1->print(); - printf("\nitem2=\n"); -// item2->print(); - printf("\n"); - - if( *(item1) < *(item2) ) { - printf("Item1 less than item2. Returning false.\n"); - return false; - } - printf("Item1 not less than item2. Returning true.\n"); - return true; - } -}; - -template <class T> class DualQueueDescending { -public: - bool operator() ( const T &item1, const T &item2) const { - if( *(item2) < *(item1) ) { - return false; - } - return true; - } -}; - - -template <class T, template<class T> class CompareFunc> class DualQueue { -public: - DualQueue() {} - ~DualQueue() {} - - const T & top() const { - if (empty()) { - fprintf(stderr, "ERROR. Tried to top from empty dualQueue.\n"); - exit(1); - } - if (emptyForward()) { - return topReverse(); - } - if (emptyReverse()) { - return topForward(); - } - - return (topFowardHigherPriorityThanTopReverse() ? topForward() : topReverse()); - } - void pop() { - if (empty()) { - fprintf(stderr, "ERROR. Tried to pop from empty dualQueue.\n"); - exit(1); - } - if (emptyForward()) { - popReverse(); - return; - } - if (emptyReverse()) { - popForward(); - return; - } - topFowardHigherPriorityThanTopReverse() ? popForward() : popReverse(); - } - void push(const T &item) { item->getStrand() ? pushForward(item) : pushReverse(item); } - size_t size() const { return sizeForward() + sizeReverse(); } - bool empty() const { return _forwardQueue.empty() && _reverseQueue.empty(); } - - const T & topForward() const { return _forwardQueue.top(); } - void popForward() { _forwardQueue.pop(); } - void pushForward(const T &item) { _forwardQueue.push(item); } - size_t sizeForward() const { return _forwardQueue.size(); } - bool emptyForward() const { return _forwardQueue.empty(); } - - - const T & topReverse() const { return _reverseQueue.top(); } - void popReverse() { _reverseQueue.pop(); } - void pushReverse(const T &item) { _reverseQueue.push(item); } - size_t sizeReverse() const { return _reverseQueue.size(); } - bool emptyReverse() const { return _reverseQueue.empty(); } - - -private: - typedef priority_queue<T, vector<T>, CompareFunc<T> > queueType; - queueType _forwardQueue; - queueType _reverseQueue; - - bool topFowardHigherPriorityThanTopReverse() const { - printf("\n\nIn priority method:\n TopForward=\n"); -// topForward()->print(); - printf("\nTopReverse=\n"); -// topReverse()->print(); - printf("\n"); - if (CompareFunc<T>()(topForward(), topReverse())) { - printf("Forward higher priority than reverse.\n"); - return true; - } else { - printf("Reverse higher priority than forward.\n"); - return false; - } - } - -}; - - -#endif /* DUALQUEUE_H_ */ -#endif diff --git a/src/utils/general/ParseTools.cpp b/src/utils/general/ParseTools.cpp index ad5764b416e8a76566c3b7210a0f97fb66b55325..bef426a4ed7d7b3625f75f62861e874747bc2672 100644 --- a/src/utils/general/ParseTools.cpp +++ b/src/utils/general/ParseTools.cpp @@ -19,6 +19,9 @@ int str2chrPos(const QuickString &str) { } int str2chrPos(const char *str, size_t ulen) { + if (ulen == 0) { + ulen = strlen(str); + } int len=(int)ulen; if (len < 1 || len > 10) { return INT_MIN; //can't do more than 9 digits and a minus sign diff --git a/src/utils/general/ParseTools.h b/src/utils/general/ParseTools.h index 2132d0e4c6be8994bbc2a579fbf951f3b4bc323d..405d631a88ae067dfd121ebfd0ad890c857c5690 100644 --- a/src/utils/general/ParseTools.h +++ b/src/utils/general/ParseTools.h @@ -22,7 +22,7 @@ bool isNumeric(const QuickString &str); //Empty strings, too long strings, or strings containing anything other than //digits (with the excpetion of a minus sign in the first position) //will result in error. Errors return INT_MIN. -int str2chrPos(const char *str, size_t len); +int str2chrPos(const char *str, size_t len = 0); int str2chrPos(const QuickString &str); diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh index ee7f629064989b67989943c7b31cf3c40435779a..165e1e863e485477eb65ddbceb965596777b0e33 100644 --- a/test/merge/test-merge.sh +++ b/test/merge/test-merge.sh @@ -30,18 +30,22 @@ $BT merge -i a.bed > obs check obs exp rm obs exp - +########################################################### +# +# NOTE: Testing for sorted input is now deprecated, as the +# FileRecordMgr is already testing for that. +# ########################################################### # Test #2 # Enforce coordinate sorted input. ########################################################### -echo " merge.t2...\c" -command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'" -tac a.bed | $BT merge -i - 2> obs -echo "ERROR: input file: (-) is not sorted by chrom then start. - The start coordinate at line 3 is less than the start at line 2" > exp -check obs exp -rm obs exp +#echo " merge.t2...\c" +#command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'" +#tac a.bed | $BT merge -i - 2> obs +#echo "ERROR: input file: (-) is not sorted by chrom then start. +# The start coordinate at line 3 is less than the start at line 2" > exp +#check obs exp +#rm obs exp ########################################################### @@ -64,11 +68,9 @@ rm obs exp ########################################################### echo " merge.t4...\c" echo \ -"chr1 10 20 -***** -*****ERROR: No names found to report for the -names option. Exiting. -*****" > exp -$BT merge -i a.bed -nms > obs 2>&1 +"***** +***** ERROR: Requested column 4, but database file a.bed only has fields 1 - 3." > exp +$BT merge -i a.bed -nms 2>&1 > /dev/null | head -3 | tail -2 > obs check obs exp rm obs exp @@ -130,7 +132,7 @@ chr1 30 100 a2,a3,a4 9 3 chr2 10 20 a1 5 1 chr2 30 40 a2 6 1 chr2 42 100 a3,a4 15 2" > exp -$BT merge -i a.full.bed -nms -n -scores sum> obs +$BT merge -i a.full.bed -nms -scores sum -n> obs check obs exp rm obs exp @@ -139,15 +141,15 @@ rm obs exp ########################################################### echo " merge.t9...\c" echo \ -"chr1 10 20 a1 1 + 1 -chr1 30 40 a2 2 + 1 -chr1 45 100 a4 4 + 1 -chr1 40 50 a3 3 - 1 -chr2 10 20 a1 5 + 1 -chr2 30 40 a2 6 + 1 -chr2 42 50 a3 7 + 1 -chr2 45 100 a4 8 - 1" > exp -$BT merge -i a.full.bed -s -nms -n -scores sum> obs +"chr1 10 20 + a1 1 1 +chr1 30 40 + a2 2 1 +chr1 40 50 - a3 3 1 +chr1 45 100 + a4 4 1 +chr2 10 20 + a1 5 1 +chr2 30 40 + a2 6 1 +chr2 42 50 + a3 7 1 +chr2 45 100 - a4 8 1" > exp +$BT merge -i a.full.bed -s -nms -scores sum -n> obs check obs exp rm obs exp