diff --git a/Makefile b/Makefile index 1890fa2653ecfe15b4f2b33c8b2b7b3a951d5ab1..eb2f97ae396678f8489b416ccc0327409939bec5 100644 --- a/Makefile +++ b/Makefile @@ -47,7 +47,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ $(SRC_DIR)/mapFile \ - $(SRC_DIR)/mergeBed \ + $(SRC_DIR)/mergeFile \ $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/multiIntersectBed \ $(SRC_DIR)/nekSandbox1 \ diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 928d1d9a4f7e8f985cc9edd2ca44b1dbdcf8a517..32643ac9fc851a20c5ef92b625ae4e31067976e4 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -81,7 +81,7 @@ bool FileIntersect::processUnsortedFiles() while (!queryFRM->eof()) { - Record *queryRecord = queryFRM->allocateAndGetNextRecord(); + Record *queryRecord = queryFRM->getNextRecord(); if (queryRecord == NULL) { continue; } @@ -97,7 +97,6 @@ bool FileIntersect::processUnsortedFiles() } queryFRM->deleteRecord(queryRecord); } - queryFRM->close(); //clean up. delete binTree; diff --git a/src/mapFile/mapMain.cpp b/src/mapFile/mapMain.cpp index f08e56b3905727b3333e4743e53b9bb9e9aaa239..49d1b270431d1be7800b669159b0a3d32dc2421f 100644 --- a/src/mapFile/mapMain.cpp +++ b/src/mapFile/mapMain.cpp @@ -48,18 +48,8 @@ void map_help(void) { cerr << "Options: " << endl; - cerr << "\t-c\t" << "Specify the column from the B file to map onto intervals in A." << endl; - cerr << "\t\t - Default = 5." << endl << endl; - - cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl; - cerr << "\t\t Valid operations:" << endl; - cerr << "\t\t sum, min, max, absmin, absmax," << endl; - cerr << "\t\t mean, median," << endl; - cerr << "\t\t collapse (i.e., print a comma separated list (duplicates allowed)), " << endl; - cerr << "\t\t distinct (i.e., print a comma separated list (NO duplicates allowed)), " << endl; - cerr << "\t\t count" << endl; - cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl; - cerr << "\t\t- Default: sum" << endl << endl; + KeyListOpsHelp(); + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; diff --git a/src/mergeBed/Makefile b/src/mergeBed/Makefile deleted file mode 100644 index 0eaaa645efeab15081c7b84cbb024a4a33826897..0000000000000000000000000000000000000000 --- a/src/mergeBed/Makefile +++ /dev/null @@ -1,35 +0,0 @@ -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ - -I$(UTILITIES_DIR)/lineFileUtilities/ \ - -I$(UTILITIES_DIR)/gzstream/ \ - -I$(UTILITIES_DIR)/fileType/ \ - -I$(UTILITIES_DIR)/VectorOps/ \ - -I$(UTILITIES_DIR)/version/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= mergeMain.cpp mergeBed.cpp mergeBed.h -OBJECTS= mergeMain.o mergeBed.o -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) - - -all: $(BUILT_OBJECTS) - -.PHONY: all - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -clean: - @echo "Cleaning up." 
- @rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeBed.o - -.PHONY: clean \ No newline at end of file diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp deleted file mode 100644 index dae9887ba3ee46e8930b995e9be6bf3d2f8e4004..0000000000000000000000000000000000000000 --- a/src/mergeBed/mergeBed.cpp +++ /dev/null @@ -1,359 +0,0 @@ -/***************************************************************************** - mergeBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "lineFileUtilities.h" -#include "mergeBed.h" - - - -void BedMerge::ReportMergedNames(const vector<string> &names) { - if (names.size() > 0) { - printf("\t"); - vector<string>::const_iterator nameItr = names.begin(); - vector<string>::const_iterator nameEnd = names.end(); - for (; nameItr != nameEnd; ++nameItr) { - if (nameItr < (nameEnd - 1)) - cout << *nameItr << _delimiter; - else - cout << *nameItr; - } - } - else { - cerr << endl - << "*****" << endl - << "*****ERROR: " - << "No names found to report for the -names option. Exiting." - << endl - << "*****" << endl; - exit(1); - } -} - - -void BedMerge::ReportMergedScores(const vector<string> &scores) { - - // setup a VectorOps instances for the list of scores. - // VectorOps methods used for each possible operation. - VectorOps vo(scores); - std::stringstream buffer; - if (scores.size() > 0) { - if (_scoreOp == "sum") - buffer << setprecision (PRECISION) << vo.GetSum(); - else if (_scoreOp == "min") - buffer << setprecision (PRECISION) << vo.GetMin(); - else if (_scoreOp == "max") - buffer << setprecision (PRECISION) << vo.GetMax(); - else if (_scoreOp == "mean") - buffer << setprecision (PRECISION) << vo.GetMean(); - else if (_scoreOp == "median") - buffer << setprecision (PRECISION) << vo.GetMedian(); - else if (_scoreOp == "mode") - buffer << setprecision (PRECISION) << vo.GetMode(); - else if (_scoreOp == "antimode") - buffer << setprecision (PRECISION) << vo.GetAntiMode(); - else if (_scoreOp == "collapse") - buffer << setprecision (PRECISION) << vo.GetCollapse(_delimiter); - cout << "\t" << buffer.str(); - } - else { - cerr << endl - << "*****" << endl - << "*****ERROR: No scores found to report for the -scores option. Exiting." 
<< endl - << "*****" << endl; - exit(1); - } -} - -// =============== -// = Constructor = -// =============== -BedMerge::BedMerge(string &bedFile, - bool numEntries, - int maxDistance, - bool forceStrand, - bool reportNames, - bool reportScores, - const string &scoreOp, - const string &delimiter) : - _bedFile(bedFile), - _numEntries(numEntries), - _forceStrand(forceStrand), - _reportNames(reportNames), - _reportScores(reportScores), - _scoreOp(scoreOp), - _maxDistance(maxDistance), - _delimiter(delimiter) -{ - _bed = new BedFile(bedFile); - - if (_forceStrand == false) - MergeBed(); - else - MergeBedStranded(); -} - - -// ================= -// = Destructor = -// ================= -BedMerge::~BedMerge(void) { -} - - -// =============================================== -// Convenience method for reporting merged blocks -// ================================================ -void BedMerge::Report(string chrom, int start, - int end, const vector<string> &names, - const vector<string> &scores, int mergeCount) -{ - // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type - //if (_bed->isZeroBased == false) {start++;} - - printf("%s\t%d\t%d", chrom.c_str(), start, end); - // just the merged intervals - if (_numEntries == false && _reportNames == false && - _reportScores == false) { - printf("\n"); - } - // merged intervals and counts - else if (_numEntries == true && _reportNames == false && - _reportScores == false) { - printf("\t%d\n", mergeCount); - } - // merged intervals, counts, and scores - else if (_numEntries == true && _reportNames == false && - _reportScores == true) { - printf("\t%d", mergeCount); - ReportMergedScores(scores); - printf("\n"); - } - // merged intervals, counts, and names - else if (_numEntries == true && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t%d\n", mergeCount); - } - // merged intervals, counts, names, and scores - else if (_numEntries == true && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%d\n", mergeCount); - } - // merged intervals and names - else if (_numEntries == false && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\n"); - } - // merged intervals and scores - else if (_numEntries == false && _reportNames == false && - _reportScores == true) { - ReportMergedScores(scores); - printf("\n"); - } - // merged intervals, names, and scores - else if (_numEntries == false && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\n"); - } -} - - -// ========================================================= -// Convenience method for reporting merged blocks by strand -// ========================================================= -void BedMerge::ReportStranded(string chrom, int start, - int end, const vector<string> &names, - const vector<string> &scores, int mergeCount, - string strand) -{ - // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type - //if (_bed->isZeroBased == false) {start++;} - - printf("%s\t%d\t%d", chrom.c_str(), start, end); - // just the merged intervals - if (_numEntries == false && _reportNames == false && - _reportScores == false) { - printf("\t\t\t%s\n", strand.c_str()); - } - // merged intervals and counts - else if (_numEntries == true && _reportNames == false && - _reportScores == false) { - printf("\t\t%d\t%s\n", mergeCount, strand.c_str()); - 
} - // merged intervals, counts, and scores - else if (_numEntries == true && _reportNames == false && - _reportScores == true) { - printf("\t%d", mergeCount); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } - // merged intervals, counts, and names - else if (_numEntries == true && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t%d\t%s", mergeCount, strand.c_str()); - printf("\n"); - } - // merged intervals, counts, names, and scores - else if (_numEntries == true && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%s\t%d", strand.c_str(), mergeCount); - printf("\n"); - } - // merged intervals and names - else if (_numEntries == false && _reportNames == true && - _reportScores == false) { - ReportMergedNames(names); - printf("\t.\t%s\n", strand.c_str()); - } - // merged intervals and scores - else if (_numEntries == false && _reportNames == false && - _reportScores == true) { - printf("\t"); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } - // merged intervals, names, and scores - else if (_numEntries == false && _reportNames == true && - _reportScores == true) { - ReportMergedNames(names); - ReportMergedScores(scores); - printf("\t%s\n", strand.c_str()); - } -} - - -// ===================================================== -// = Merge overlapping BED entries into a single entry = -// ===================================================== -void BedMerge::MergeBed() { - int mergeCount = 1; - vector<string> names; - vector<string> scores; - int start = -1; - int end = -1; - BED prev, curr; - - _bed->Open(); - while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals - if (_bed->_status != BED_VALID) - continue; - // new block, no overlap - if ( (((int) curr.start - end) > _maxDistance) || - (curr.chrom != prev.chrom)) - { - if (start >= 0) { - Report(prev.chrom, start, end, names, scores, mergeCount); - // reset - mergeCount = 1; - names.clear(); - scores.clear(); - } - start = curr.start; - end = curr.end; - if (!curr.name.empty()) - names.push_back(curr.name); - if (!curr.score.empty()) - scores.push_back(curr.score); - } - // same block, overlaps - else { - if ((int) curr.end > end) - end = curr.end; - if (!curr.name.empty()) - names.push_back(curr.name); - if (!curr.score.empty()) - scores.push_back(curr.score); - mergeCount++; - } - prev = curr; - } - if (start >= 0) { - Report(prev.chrom, start, end, names, scores, mergeCount); - } -} - - -// =============================================================================== -// = Merge overlapping BED entries into a single entry, accounting for strandedness = -// ================================================================================ -void BedMerge::MergeBedStranded() { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bed->loadBedFileIntoMapNoBin(); - - // loop through each chromosome and merge their BED entries - masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); - masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); - for (; m != mEnd; ++m) { - - // bedList is already sorted by start position. - string chrom = m->first; - - // make a list of the two strands to merge separately. - vector<string> strands(2); - strands[0] = "+"; - strands[1] = "-"; - // do two passes, one for each strand. 
- for (unsigned int s = 0; s < strands.size(); s++) { - int mergeCount = 1; - int numOnStrand = 0; - vector<string> names; - vector<string> scores; - - // merge overlapping features for this chromosome. - int start = -1; - int end = -1; - vector<BED>::const_iterator bedItr = m->second.begin(); - vector<BED>::const_iterator bedEnd = m->second.end(); - for (; bedItr != bedEnd; ++bedItr) { - // if forcing strandedness, move on if the hit - // is not on the current strand. - if (bedItr->strand != strands[s]) { continue; } - else { numOnStrand++; } - if ( (((int) bedItr->start - end) > _maxDistance) || - (end < 0)) - { - if (start >= 0) { - ReportStranded(chrom, start, end, names, - scores, mergeCount, strands[s]); - // reset - mergeCount = 1; - names.clear(); - scores.clear(); - } - start = bedItr->start; - end = bedItr->end; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); - } - else { - if ((int) bedItr-> end > end) end = bedItr->end; - mergeCount++; - if (!bedItr->name.empty()) names.push_back(bedItr->name); - if (!bedItr->score.empty()) scores.push_back(bedItr->score); - } - } - if (start >= 0) { - ReportStranded(chrom, start, end, names, - scores, mergeCount, strands[s]); - } - } - } -} diff --git a/src/mergeBed/mergeBed.h b/src/mergeBed/mergeBed.h deleted file mode 100644 index d9b0c1438cb33aae8db1a40ad0ca92635e0eee20..0000000000000000000000000000000000000000 --- a/src/mergeBed/mergeBed.h +++ /dev/null @@ -1,72 +0,0 @@ -/***************************************************************************** - mergeBed.h - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "bedFile.h" -#include <vector> -#include <algorithm> -#include <numeric> -#include <iostream> -#include <iomanip> -#include <fstream> -#include <limits.h> -#include <stdlib.h> -#include "VectorOps.h" - -using namespace std; - -const int PRECISION = 21; - -//************************************************ -// Class methods and elements -//************************************************ -class BedMerge { - -public: - - // constructor - BedMerge(string &bedFile, bool numEntries, - int maxDistance, bool forceStrand, - bool reportNames, bool reportScores, - const string &scoreOp, const string &delimiter); - - // destructor - ~BedMerge(void); - - void MergeBed(); - void MergeBedStranded(); - -private: - - string _bedFile; - bool _numEntries; - bool _forceStrand; - bool _reportNames; - bool _reportScores; - string _scoreOp; - int _maxDistance; - string _delimiter; - // instance of a bed file class. 
- BedFile *_bed; - - void Report(string chrom, int start, int end, - const vector<string> &names, - const vector<string> &scores, - int mergeCount); - - void ReportStranded(string chrom, int start, int end, - const vector<string> &names, - const vector<string> &scores, - int mergeCount, - string strand); - void ReportMergedNames(const vector<string> &names); - void ReportMergedScores(const vector<string> &scores); - -}; diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp deleted file mode 100644 index 28b869af4cb54bbe0ec9eeaf7251d23ab985564d..0000000000000000000000000000000000000000 --- a/src/mergeBed/mergeMain.cpp +++ /dev/null @@ -1,184 +0,0 @@ -/***************************************************************************** - mergeMain.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "mergeBed.h" -#include "version.h" - -using namespace std; - -// define our program name -#define PROGRAM_NAME "bedtools merge" - - -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) - -// function declarations -void merge_help(void); - -int merge_main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - - // input files - string bedFile = "stdin"; - int maxDistance = 0; - string scoreOp = ""; - - // input arguments - bool haveBed = true; - bool numEntries = false; - bool haveMaxDistance = false; - bool forceStrand = false; - bool reportNames = false; - bool reportScores = false; - string delimiter = ","; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) merge_help(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bedFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-n", 2, parameterLength)) { - numEntries = true; - } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - if ((i+1) < argc) { - haveMaxDistance = true; - maxDistance = atoi(argv[i + 1]); - i++; - } - } - else if (PARAMETER_CHECK("-s", 2, parameterLength)) { - forceStrand = true; - } - else if (PARAMETER_CHECK("-nms", 4, parameterLength)) { - reportNames = true; - } - else if (PARAMETER_CHECK("-scores", 7, parameterLength)) { - reportScores = true; - if ((i+1) < argc) { - scoreOp = argv[i + 1]; - i++; - } - } - else if (PARAMETER_CHECK("-delim", 6, parameterLength)) { - if ((i+1) < argc) { - delimiter = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have both input files - if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. 
" << endl << "*****" << endl; - showHelp = true; - } - if ((reportScores == true) && (scoreOp != "sum") - && (scoreOp != "max") && (scoreOp != "min") - && (scoreOp != "mean") && (scoreOp != "mode") - && (scoreOp != "median") && (scoreOp != "antimode") - && (scoreOp != "collapse")) - { - cerr << endl - << "*****" - << endl - << "*****ERROR: Invalid scoreOp selection \"" - << scoreOp - << endl - << "\" *****" - << endl; - showHelp = true; - } - - if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, - maxDistance, forceStrand, - reportNames, reportScores, - scoreOp, delimiter); - delete bm; - } - else { - merge_help(); - } - return 0; -} - -void merge_help(void) { - - cerr << "\nTool: bedtools merge (aka mergeBed)" << endl; - cerr << "Version: " << VERSION << "\n"; - cerr << "Summary: Merges overlapping BED/GFF/VCF entries into a single interval." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; - - cerr << "Options: " << endl; - cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; - cerr << "\t\tthat are the same strand." << endl; - cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; - - cerr << "\t-n\t" << "Report the number of BED entries that were merged." << endl; - cerr << "\t\t- Note: \"1\" is reported if no merging occurred." << endl << endl; - - - cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; - cerr << "\t\tto be merged." << endl; - cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-nms\t" << "Report the names of the merged features separated by commas." << endl; - cerr << "\t\tChange delim. with -delim." << endl << endl; - - cerr << "\t-scores\t" << "Report the scores of the merged features. Specify one of " << endl; - cerr << "\t\tthe following options for reporting scores:" << endl; - cerr << "\t\t sum, min, max," << endl; - cerr << "\t\t mean, median, mode, antimode," << endl; - cerr << "\t\t collapse (i.e., print a semicolon-separated list)," << endl; - cerr << "\t\t- (INTEGER)" << endl << endl; - - cerr << "\t-delim\t" << "Specify a custom delimiter for the -nms and -scores concat options" << endl; - cerr << "\t\t- Example: -delim \"|\"" << endl; - cerr << "\t\t- Default: \",\"." << endl << endl; - - - cerr << "Notes: " << endl; - cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; - cerr << "\t will in BED format with zero-based starts" << endl << endl; - - cerr << "\t(2) The input file (-i) file must be sorted by chrom, then start." 
<< endl << endl; - - // end the program here - exit(1); - -} diff --git a/src/mergeFile/Makefile b/src/mergeFile/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..50f4004250b34cb4c1ab59481edc907a2f1d27ed --- /dev/null +++ b/src/mergeFile/Makefile @@ -0,0 +1,46 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \ + -I$(UTILITIES_DIR)/general/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/GenomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools/src \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/FileRecordTools/ \ + -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ + -I$(UTILITIES_DIR)/FileRecordTools/Records/ \ + -I$(UTILITIES_DIR)/KeyListOps/ \ + -I$(UTILITIES_DIR)/RecordOutputMgr/ \ + -I$(UTILITIES_DIR)/NewChromsweep \ + -I$(UTILITIES_DIR)/BinTree \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= mergeMain.cpp mergeFile.cpp mergeFile.h +OBJECTS= mergeMain.o mergeFile.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeFile.o + +.PHONY: clean \ No newline at end of file diff --git a/src/mergeFile/mergeFile.cpp b/src/mergeFile/mergeFile.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f772629c440db0fae1ab9e7535491fc6d2edf4eb --- /dev/null +++ b/src/mergeFile/mergeFile.cpp @@ -0,0 +1,39 @@ +/***************************************************************************** + mergeFile.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "mergeFile.h" + + +MergeFile::MergeFile(ContextMerge *context) +: _context(context), + _recordOutputMgr(NULL) +{ + _recordOutputMgr = new RecordOutputMgr(); + _recordOutputMgr->init(_context); +} + +MergeFile::~MergeFile() +{ + delete _recordOutputMgr; + _recordOutputMgr = NULL; +} + +bool MergeFile::merge() +{ + RecordKeyList hitSet; + FileRecordMgr *frm = _context->getFile(0); + while (!frm->eof()) { + Record *key = frm->getNextRecord(&hitSet); + if (key == NULL) continue; + _recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet)); + } + return true; +} diff --git a/src/mergeFile/mergeFile.h b/src/mergeFile/mergeFile.h new file mode 100644 index 0000000000000000000000000000000000000000..0d876c67ef260a0935efc0bd00b4d194a13b1b22 --- /dev/null +++ b/src/mergeFile/mergeFile.h @@ -0,0 +1,37 @@ +/***************************************************************************** + mergeFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. 
+******************************************************************************/ + +#ifndef MERGE_FILE_H_ +#define MERGE_FILE_H_ + +//************************************************ +// Class methods and elements +//************************************************ + +#include "ContextMerge.h" +#include "RecordOutputMgr.h" + +class MergeFile { + +public: + MergeFile(ContextMerge *context); + ~MergeFile(); + + bool merge(); + +private: + ContextMerge *_context; + RecordOutputMgr *_recordOutputMgr; + +}; + +#endif diff --git a/src/mergeFile/mergeMain.cpp b/src/mergeFile/mergeMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0318725a09ec8af9c71446eb31f3e12a93f3c5db --- /dev/null +++ b/src/mergeFile/mergeMain.cpp @@ -0,0 +1,76 @@ +/***************************************************************************** + mergeMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "mergeFile.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools merge" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void merge_help(void); + +int merge_main(int argc, char* argv[]) { + + ContextMerge *context = new ContextMerge(); + if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) { + if (!context->getErrorMsg().empty()) { + cerr << context->getErrorMsg() << endl; + } + merge_help(); + delete context; + return 0; + } + MergeFile *mergeFile = new MergeFile(context); + + bool retVal = mergeFile->merge(); + delete mergeFile; + delete context; + return retVal ? 0 : 1; +} + +void merge_help(void) { + + cerr << "\nTool: bedtools merge (aka mergeBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Merges overlapping BED/GFF/VCF entries into a single interval." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-s\t" << "Force strandedness. That is, only merge features" << endl; + cerr << "\t\tthat are the same strand." << endl; + cerr << "\t\t- By default, merging is done without respect to strand." << endl << endl; + + + cerr << "\t-d\t" << "Maximum distance between features allowed for features" << endl; + cerr << "\t\tto be merged." << endl; + cerr << "\t\t- Def. 0. That is, overlapping & book-ended features are merged." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + KeyListOpsHelp(); + + cerr << "Notes: " << endl; + cerr << "\t(1) All output, regardless of input type (e.g., GFF or VCF)" << endl; + cerr << "\t will in BED format with zero-based starts" << endl << endl; + + cerr << "\t(2) The input file (-i) file must be sorted by chrom, then start." 
<< endl << endl; + + // end the program here + exit(1); + +} diff --git a/src/nekSandbox1/nekSandboxMain.cpp b/src/nekSandbox1/nekSandboxMain.cpp index ac3fb3196b49d602e98a2e79ae369ef907573603..6e895b2ec4aad16dd81efba9e7be970c0a636f45 100644 --- a/src/nekSandbox1/nekSandboxMain.cpp +++ b/src/nekSandbox1/nekSandboxMain.cpp @@ -6,7 +6,7 @@ using namespace std; #include <cstdio> #include "RecordKeyList.h" #include "NewChromsweep.h" -#include "DualQueue.h" +//#include "DualQueue.h" #include "ParseTools.h" #include <sstream> #include <iomanip> @@ -145,7 +145,7 @@ int nek_sandbox1_main(int argc,char** argv) // bool headerFound = false; // QuickString outbuf; // while (!frm.eof()) { -// Record *record = frm.allocateAndGetNextRecord(); +// Record *record = frm.getNextRecord(); // if (!headerFound && frm.hasHeader()) { // cout << frm.getHeader() << endl; // headerFound = true; @@ -287,7 +287,7 @@ void testDualQueue(Context *context) { printf("Original record order is:\n"); while (!frm.eof()) { - Record *record = frm.allocateAndGetNextRecord(); + Record *record = frm.getNextRecord(); if (record == NULL) { continue; } diff --git a/src/sampleFile/SampleFile.cpp b/src/sampleFile/SampleFile.cpp index 8291824cef1cb5a67d7827c9bf0e867b7df4f48f..096327e2012d111f2fcfab398a08e7b75855073b 100644 --- a/src/sampleFile/SampleFile.cpp +++ b/src/sampleFile/SampleFile.cpp @@ -51,7 +51,7 @@ bool SampleFile::takeSample() while (!_inputFile->eof()) { - Record *record = _inputFile->allocateAndGetNextRecord(); + Record *record = _inputFile->getNextRecord(); if (record == NULL) { continue; } diff --git a/src/utils/BinTree/BinTree.cpp b/src/utils/BinTree/BinTree.cpp index 54696f6928e7667683d993c658f7771d8e88ff2f..253dea2798320bfd7886931213612b2c21931baf 100644 --- a/src/utils/BinTree/BinTree.cpp +++ b/src/utils/BinTree/BinTree.cpp @@ -74,7 +74,7 @@ void BinTree::loadDB() Record *record = NULL; while (!_databaseFile->eof()) { - record = _databaseFile->allocateAndGetNextRecord(); + record = _databaseFile->getNextRecord(); //In addition to NULL records, we also don't want to add unmapped reads. if (record == NULL || record->isUnmapped()) { continue; @@ -86,13 +86,6 @@ void BinTree::loadDB() exit(1); } } - _databaseFile->close(); - - //TBD: give ERROR and return false if tree is empty. 
- if (_mainMap.empty()) { - fprintf(stderr, "ERROR: Tree is empty, no records added.\n"); - exit(1); - } } void BinTree::getHits(Record *record, RecordKeyList &hitSet) diff --git a/src/utils/Contexts/ContextBase.cpp b/src/utils/Contexts/ContextBase.cpp index 16d2402c080b16d498fe3f767e9423ac7e61beda..f05c5b186ab71b1b3c4821650da715c0bb518d15 100644 --- a/src/utils/Contexts/ContextBase.cpp +++ b/src/utils/Contexts/ContextBase.cpp @@ -13,7 +13,6 @@ ContextBase::ContextBase() : _program(UNSPECIFIED_PROGRAM), _allFilesOpened(false), - _useMergedIntervals(false), _genomeFile(NULL), _outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), _outputTypeDetermined(false), @@ -45,7 +44,6 @@ ContextBase::ContextBase() _maxNumDatabaseFields(0), _useFullBamTags(false), _reportCount(false), - _maxDistance(0), _reportNames(false), _reportScores(false), _numOutputRecords(0), @@ -53,11 +51,17 @@ ContextBase::ContextBase() _seed(0), _forwardOnly(false), _reverseOnly(false), - _hasColumnOpsMethods(false) + _hasColumnOpsMethods(false), + _keyListOps(NULL), + _desiredStrand(FileRecordMergeMgr::ANY_STRAND), + _maxDistance(0), + _useMergedIntervals(false) + { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; _programNames["map"] = MAP; + _programNames["merge"] = MERGE; if (hasColumnOpsMethods()) { _keyListOps = new KeyListOps(); @@ -230,14 +234,22 @@ bool ContextBase::openFiles() { if (_allFilesOpened) { return true; } + + if (_fileNames.size() == 0) { + //No input was specified. Error and exit. + _errorMsg += "\n***** ERROR: No input file given. Exiting. *****"; + return false; + } + _files.resize(_fileNames.size()); for (int i = 0; i < (int)_fileNames.size(); i++) { - FileRecordMgr *frm = new FileRecordMgr(_fileNames[i], _sortedInput); + FileRecordMgr *frm = getNewFRM(_fileNames[i]); if (hasGenomeFile()) { frm->setGenomeFile(_genomeFile); } frm->setFullBamFlags(_useFullBamTags); + frm->setIsSorted(_sortedInput); if (!frm->open()) { return false; } @@ -391,8 +403,9 @@ bool ContextBase::handle_c() markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); + return true; } - return true; + return false; } @@ -412,7 +425,7 @@ bool ContextBase::handle_o() } -// for col ops, -null is a NULL vakue assigned +// for col ops, -null is a NULL value assigned // when no overlaps are detected. bool ContextBase::handle_null() { @@ -424,8 +437,9 @@ bool ContextBase::handle_null() markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); + return true; } - return true; + return false; } //for col ops, delimStr will appear between each item in @@ -446,10 +460,11 @@ bool ContextBase::handle_delim() void ContextBase::setColumnOpsMethods(bool val) { - _hasColumnOpsMethods = val; - if (val) { + if (val && !_hasColumnOpsMethods) { + //was off, but we're turning it on. 
_keyListOps = new KeyListOps(); } + _hasColumnOpsMethods = val; } const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { @@ -459,3 +474,13 @@ const QuickString &ContextBase::getColumnOpsVal(RecordKeyList &keyList) const { return _keyListOps->getOpVals(keyList); } +FileRecordMgr *ContextBase::getNewFRM(const QuickString &filename) { + if (!_useMergedIntervals) { + return new FileRecordMgr(filename); + } else { + FileRecordMergeMgr *frm = new FileRecordMergeMgr(filename); + frm->setStrandType(_desiredStrand); + frm->setMaxDistance(_maxDistance); + return frm; + } +} diff --git a/src/utils/Contexts/ContextBase.h b/src/utils/Contexts/ContextBase.h index b4bf12279f71db6a934a4e42a8970771a4a528e8..2c194a68497571188dd5962fa179526f7fd44b22 100644 --- a/src/utils/Contexts/ContextBase.h +++ b/src/utils/Contexts/ContextBase.h @@ -20,7 +20,7 @@ #include "version.h" #include "BedtoolsTypes.h" #include "FileRecordTypeChecker.h" -#include "FileRecordMgr.h" +#include "FileRecordMergeMgr.h" #include "NewGenomeFile.h" #include "api/BamReader.h" #include "api/BamAux.h" @@ -59,6 +59,7 @@ public: bool getUseMergedIntervals() const { return _useMergedIntervals; } void setUseMergedIntervals(bool val) { _useMergedIntervals = val; } + FileRecordMergeMgr::WANTED_STRAND_TYPE getDesiredStrand() const { return _desiredStrand; } void openGenomeFile(const QuickString &genomeFilename); void openGenomeFile(const BamTools::RefVector &refVector); @@ -106,23 +107,23 @@ public: virtual bool getUseFullBamTags() const { return _useFullBamTags; } virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; } - // - // MERGE METHODS - // - virtual bool getReportCount() const { return _reportCount; } - virtual void setReportCount(bool val) { _reportCount = val; } - - virtual int getMaxDistance() const { return _maxDistance; } - virtual void setMaxDistance(int distance) { _maxDistance = distance; } - - virtual bool getReportNames() const { return _reportNames; } - virtual void setReportNames(bool val) { _reportNames = val; } - - virtual bool getReportScores() const { return _reportScores; } - virtual void setReportScores(bool val) { _reportScores = val; } - - virtual const QuickString &getScoreOp() const { return _scoreOp; } - virtual void setScoreOp(const QuickString &op) { _scoreOp = op; } +// // +// // MERGE METHODS +// // +// virtual bool getReportCount() const { return _reportCount; } +// virtual void setReportCount(bool val) { _reportCount = val; } +// +// virtual int getMaxDistance() const { return _maxDistance; } +// virtual void setMaxDistance(int distance) { _maxDistance = distance; } +// +// virtual bool getReportNames() const { return _reportNames; } +// virtual void setReportNames(bool val) { _reportNames = val; } +// +// virtual bool getReportScores() const { return _reportScores; } +// virtual void setReportScores(bool val) { _reportScores = val; } +// +// virtual const QuickString &getScoreOp() const { return _scoreOp; } +// virtual void setScoreOp(const QuickString &op) { _scoreOp = op; } // METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER @@ -160,7 +161,6 @@ protected: bool _allFilesOpened; map<QuickString, PROGRAM_TYPE> _programNames; - bool _useMergedIntervals; NewGenomeFile *_genomeFile; ContextFileType _outputFileType; @@ -200,7 +200,6 @@ protected: int _maxNumDatabaseFields; bool _useFullBamTags; bool _reportCount; - int _maxDistance; bool _reportNames; bool _reportScores; QuickString _scoreOp; @@ -212,14 +211,22 @@ protected: bool _forwardOnly; bool _reverseOnly; + //Members for 
column operations bool _hasColumnOpsMethods; KeyListOps *_keyListOps; QuickString _nullStr; //placeholder return value when col ops aren't valid. + //Members for merged records + FileRecordMergeMgr::WANTED_STRAND_TYPE _desiredStrand; + int _maxDistance; + bool _useMergedIntervals; + + void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } bool cmdArgsValid(); bool openFiles(); + virtual FileRecordMgr *getNewFRM(const QuickString &filename); //set cmd line params and counter, i, as members so code //is more readable (as opposed to passing all 3 everywhere). diff --git a/src/utils/Contexts/ContextMap.cpp b/src/utils/Contexts/ContextMap.cpp index e3f82417cda79b1ab81e89d72847613b6c151c53..e6abc519501a1e5320c5aad4103bd04fc083096a 100644 --- a/src/utils/Contexts/ContextMap.cpp +++ b/src/utils/Contexts/ContextMap.cpp @@ -47,13 +47,3 @@ bool ContextMap::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { } return ContextIntersect::parseCmdArgs(argc, argv, _skipFirstArgs); } -// -// -//bool ContextMap::isValidState() -//{ -// if (!ContextIntersect::isValidState()) { -// return false; -// } -//} -// -// diff --git a/src/utils/Contexts/ContextMap.h b/src/utils/Contexts/ContextMap.h index 9b7280e5425991a3f47588c7c3c78cf9b9fd8745..b5bf5959cbfea429ee30c4dcd41ed2f2e4e005e4 100644 --- a/src/utils/Contexts/ContextMap.h +++ b/src/utils/Contexts/ContextMap.h @@ -15,12 +15,8 @@ class ContextMap : public ContextIntersect { public: ContextMap(); virtual ~ContextMap(); -// virtual bool isValidState(); -// virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); -// virtual bool hasIntersectMethods() const { return true; } -// private: diff --git a/src/utils/Contexts/ContextMerge.cpp b/src/utils/Contexts/ContextMerge.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d30b09915c7aa7d9b3c684e5c5bdbc7ba2463362 --- /dev/null +++ b/src/utils/Contexts/ContextMerge.cpp @@ -0,0 +1,181 @@ +/* + * ContextMerge.cpp + * + * Created on: Mar 26, 2014 + * Author: nek3d + */ + + +#include "ContextMerge.h" + +ContextMerge::ContextMerge() +{ + setUseMergedIntervals(true); + setColumnOpsMethods(true); + + //merge has no default columnOps the way map does, so we'll need to clear those. 
+ _keyListOps->setColumns(""); + _keyListOps->setOperations(""); + +} + +ContextMerge::~ContextMerge() +{ + +} + + +bool ContextMerge::parseCmdArgs(int argc, char **argv, int skipFirstArgs) +{ + _argc = argc; + _argv = argv; + _skipFirstArgs = skipFirstArgs; + if (_argc < 2) { + setShowHelp(true); + return false; + } + + setProgram(_programNames[argv[0]]); + + _argsProcessed.resize(_argc - _skipFirstArgs, false); + + for (_i=_skipFirstArgs; _i < argc; _i++) { + if (isUsed(_i - _skipFirstArgs)) { + continue; + } + else if (strcmp(_argv[_i], "-n") == 0) { + if (!handle_n()) return false; + } + else if (strcmp(_argv[_i], "-nms") == 0) { + if (!handle_nms()) return false; + } + else if (strcmp(_argv[_i], "-scores") == 0) { + if (!handle_scores()) return false; + } + else if (strcmp(_argv[_i], "-delim") == 0) { + if (!handle_delim()) return false; + } + else if (strcmp(_argv[_i], "-d") == 0) { + if (!handle_d()) return false; + } + else if (strcmp(_argv[_i], "-s") == 0) { + if (!handle_s()) return false; + } + else if (strcmp(_argv[_i], "-S") == 0) { + if (!handle_S()) return false; + } + } + return ContextBase::parseCmdArgs(argc, argv, _skipFirstArgs); +} + +bool ContextMerge::isValidState() +{ + // Special: The merge program does not have default + //column operations, so if none were entered, disable column ops. + if (_keyListOps->getColumns().empty() && _keyListOps->getOperations().empty()) { + setColumnOpsMethods(false); + delete _keyListOps; + _keyListOps = NULL; + } + if (!ContextBase::isValidState()) { + return false; + } + if (_files.size() != 1) { + _errorMsg = "\n***** ERROR: input file not specified. *****"; + // Allow only one input file for now + return false; + } + + // + // Tests for stranded merge + // + if (_desiredStrand != FileRecordMergeMgr::ANY_STRAND) { // requested stranded merge + // make sure file has strand. + if (!getFile(0)->recordsHaveStrand()) { + _errorMsg = "\n***** ERROR: stranded merge requested, but input file records do not have strand. *****"; + return false; + } + //make sure file is not VCF. + if (getFile(0)->getFileType() == FileRecordTypeChecker::VCF_FILE_TYPE) { + _errorMsg = "\n***** ERROR: stranded merge not supported for VCF files. *****"; + return false; + } + } + + //column operations not allowed with BAM input + if (hasColumnOpsMethods() && + getFile(0)->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { + _errorMsg = "\n***** ERROR: column operations not supported for BAM input. *****"; + return false; + } + return true; +} + + +bool ContextMerge::handle_d() { + if ((_i+1) < _argc) { + if (isNumeric(_argv[_i+1])) { + int dist = str2chrPos(_argv[_i+1]); + if (dist >=0 ) { + _maxDistance = dist; + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + return true; + } + } + } + _errorMsg = "\n***** ERROR: -d option must be followed by an integer value *****"; + return false; +} + +bool ContextMerge::handle_n() +{ + markUsed(_i - _skipFirstArgs); + _errorMsg = "\n***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****"; + return false; +} + +bool ContextMerge::handle_nms() +{ + markUsed(_i - _skipFirstArgs); + _errorMsg = "\n***** ERROR: -nms option is deprecated. Please see the documentation for the -c and -o column operation options. *****"; + return false; + +} + + +bool ContextMerge::handle_scores() +{ + // No longer supporting this deprecated option. + markUsed(_i - _skipFirstArgs); + _errorMsg = "\n***** ERROR: -scores option is deprecated. 
Please see the documentation for the -c and -o column operation options. *****"; + return false; +} + +bool ContextMerge::handle_s() { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_EITHER; + markUsed(_i - _skipFirstArgs); + return true; +} + +bool ContextMerge::handle_S() { + if ((_i+1) < _argc) { + bool validChar = false; + if (_argv[_i+1][0] == '+') { + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_FORWARD; + validChar = true; + } else if (_argv[_i+1][0] == '-') { + validChar = true; + _desiredStrand = FileRecordMergeMgr::SAME_STRAND_REVERSE; + } + if (validChar) { + markUsed(_i - _skipFirstArgs); + _i++; + markUsed(_i - _skipFirstArgs); + return true; + } + } + _errorMsg = "\n***** ERROR: -S option must be followed by + or -. *****"; + return false; +} diff --git a/src/utils/Contexts/ContextMerge.h b/src/utils/Contexts/ContextMerge.h new file mode 100644 index 0000000000000000000000000000000000000000..f2690833b79156fe05dfeb0c95df05d60adb9cc1 --- /dev/null +++ b/src/utils/Contexts/ContextMerge.h @@ -0,0 +1,31 @@ +/* + * ContextMerge.h + * + * Created on: Mar 26, 2014 + * Author: nek3d + */ + +#ifndef CONTEXTMERGE_H_ +#define CONTEXTMERGE_H_ + +#include "ContextBase.h" +#include "FileRecordMergeMgr.h" + +class ContextMerge: public ContextBase { +public: + ContextMerge(); + ~ContextMerge(); + virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); + virtual bool isValidState(); + +protected: + bool handle_d(); + bool handle_n(); + bool handle_nms(); + bool handle_scores(); + bool handle_s(); + bool handle_S(); +}; + + +#endif /* CONTEXTMERGE_H_ */ diff --git a/src/utils/Contexts/Makefile b/src/utils/Contexts/Makefile index 4b2ed4291d7bec5341c11eb7e82602dd7882de9e..1856b82b9184201244d6df72e833a5173d31e3af 100644 --- a/src/utils/Contexts/Makefile +++ b/src/utils/Contexts/Makefile @@ -19,8 +19,9 @@ INCLUDES = -I$(UTILITIES_DIR)/general/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp ContextMap.h ContextSample.cpp ContextSample.h -OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o +SOURCES= ContextBase.cpp ContextBase.h ContextIntersect.cpp ContextIntersect.h ContextMap.cpp \ + ContextMap.h ContextSample.cpp ContextSample.h ContextMerge.h ContextMerge.cpp +OBJECTS= ContextBase.o ContextIntersect.o ContextMap.o ContextSample.o ContextMerge.o _EXT_OBJECTS=ParseTools.o QuickString.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -38,6 +39,7 @@ clean: @rm -f $(OBJ_DIR)/ContextBase.o \ $(OBJ_DIR)/ContextIntersect.o \ $(OBJ_DIR)/ContextMap.o \ - $(OBJ_DIR)/ContextSample.o + $(OBJ_DIR)/ContextSample.o \ + $(OBJ_DIR)/ContextMerge.o \ .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp index 27f07894c347c2b871201f933287ed410c555bd0..373b8d15a040471a40cb8f42b26ff527c296151f 100644 --- a/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/BufferedStreamMgr.cpp @@ -73,7 +73,18 @@ bool BufferedStreamMgr::getTypeData() if (!_typeChecker.scanBuffer(_currScanBuffer.c_str(), _currScanBuffer.size()) && !_typeChecker.needsMoreData()) { return false; } else if (_typeChecker.needsMoreData()) { - _inputStreamMgr->populateScanBuffer(); + if (!_inputStreamMgr->populateScanBuffer()) 
{ + // We have found a file with just a header. If the file and record type are known, + //break here. Otherwise, assign those types to empty file and record type, then break. + if ((_typeChecker.getFileType() != FileRecordTypeChecker::UNKNOWN_FILE_TYPE) && + (_typeChecker.getRecordType() != FileRecordTypeChecker::UNKNOWN_RECORD_TYPE)) { + break; + } else { + _typeChecker.setFileType(FileRecordTypeChecker::EMPTY_FILE_TYPE); + _typeChecker.setRecordType(FileRecordTypeChecker::EMPTY_RECORD_TYPE); + break; + } + } _currScanBuffer.append(_inputStreamMgr->getSavedData()); } } while (_typeChecker.needsMoreData()); diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp index 2035b9c6529e8ac97f53766b22c2fd98c8eba56b..5736a5189114df908fb944b9e7f3789c6e00c838 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.cpp @@ -125,7 +125,7 @@ int InputStreamMgr::read(char *data, size_t dataSize) return origRead + _finalInputStream->gcount(); } -void InputStreamMgr::populateScanBuffer() +bool InputStreamMgr::populateScanBuffer() { _scanBuffer.clear(); _saveDataStr.clear(); @@ -133,8 +133,7 @@ void InputStreamMgr::populateScanBuffer() int currChar = 0; while (1) { if (_isGzipped && _bamRuledOut) { - readZipChunk(); - return; + return readZipChunk(); } currChar = _pushBackStreamBuf->sbumpc(); //Stop when EOF hit. @@ -145,7 +144,8 @@ void InputStreamMgr::populateScanBuffer() _scanBuffer.push_back(currChar); if (_isGzipped) { if (!_bamRuledOut && detectBamOrBgzip(numChars, currChar)) { - return; + //we now know the file is in bgzipped format + return true; } if (numChars == 0) { continue; //this will only happen when we've just discovered that this @@ -163,6 +163,9 @@ void InputStreamMgr::populateScanBuffer() //append it to the savedDataStr. 
_scanBuffer.toStr(_saveDataStr, true); + if (_numBytesInBuffer == 0) return false; + return true; + } bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar) @@ -239,7 +242,7 @@ bool InputStreamMgr::detectBamOrBgzip(int &numChars, int currChar) return false; } -void InputStreamMgr::readZipChunk() +bool InputStreamMgr::readZipChunk() { if (_tmpZipBuf == NULL) { _tmpZipBuf = new char[SCAN_BUFFER_SIZE +1]; @@ -251,7 +254,8 @@ void InputStreamMgr::readZipChunk() if ((int)numCharsRead < SCAN_BUFFER_SIZE) { _streamFinished = true; } - return; + if (numCharsRead == 0) return false; + return true; } bool InputStreamMgr::resetStream() diff --git a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h index 1c463d37ebf5769c5b4ddf90c3181579fcee1fc3..058a247769c8e6934a19c94e4cf21ab40df5e949 100644 --- a/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h +++ b/src/utils/FileRecordTools/FileReaders/InputStreamMgr.h @@ -30,7 +30,7 @@ public: // istream *getFinalStream() { return _finalInputStream; } const BTlist<int> &getScanBuffer() const { return _scanBuffer; } int getBufferLength() const { return _numBytesInBuffer; } - void populateScanBuffer(); + bool populateScanBuffer(); const QuickString &getSavedData() const { return _saveDataStr; } bool isGzipped() const { return _isGzipped; } PushBackStreamBuf *getPushBackStreamBuf() const {return _pushBackStreamBuf; } @@ -65,7 +65,7 @@ private: BamTools::Internal::BgzfStream *_bgStream; static const char *FIFO_STRING_LITERAL; - void readZipChunk(); + bool readZipChunk(); bool detectBamOrBgzip(int &numChars, int currChar); // void decompressBuffer(); diff --git a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp index 38074edf3513817e7413cf4426269fe618d84b7d..4beaccf2ace7a6d16c8a476ba73c09ad9f0ee9ae 100644 --- a/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp +++ b/src/utils/FileRecordTools/FileReaders/SingleLineDelimTextFileReader.cpp @@ -104,7 +104,11 @@ void SingleLineDelimTextFileReader::appendField(int fieldNum, QuickString &str) bool SingleLineDelimTextFileReader::detectAndHandleHeader() { - if (!isHeaderLine(_sLine)) { + //not sure why the linker is giving me a hard time about + //passing a non-const QuickString to isHeaderLine, but + //this const ref is a workaround. 
+ const QuickString &sLine2 = _sLine; + if (!isHeaderLine(sLine2)) { return false; } if (!_fullHeaderFound) { diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.cpp b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f4c39f12169fea8e302ad73738794c22b77ef524 --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.cpp @@ -0,0 +1,210 @@ +/* + * FileRecordMergeMgr.cpp + * + * Created on: Mar 19, 2014 + * Author: nek3d + */ + + +#include "FileRecordMergeMgr.h" + +FileRecordMergeMgr::FileRecordMergeMgr(const QuickString & filename) +: FileRecordMgr(filename), + _desiredStrand(ANY_STRAND), + _maxDistance(0) +{ +} + +//Record *FileRecordMergeMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { +// RecordKeyList recList; +// if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { +// return NULL; +// } +// deleteAllMergedItemsButKey(recList); +// return const_cast<Record *>(recList.getKey()); //want key to be non-const +//} + +Record *FileRecordMergeMgr::getNextRecord(RecordKeyList *recList) +{ + if (!recList->allClear()) { + deleteMergedRecord(*recList); + } + + _mustBeForward = _desiredStrand == SAME_STRAND_FORWARD; + _mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE; + + Record *startRecord = tryToTakeFromStorage(); + + // if we couldn't use a previously stored record for starters, + //then begin with a new one that matches strand criteria. + while (startRecord == NULL) { + startRecord = FileRecordMgr::getNextRecord(); + if (startRecord == NULL) { //hit EOF!! + return NULL; + } + + if ((_mustBeForward && (startRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (startRecord->getStrandVal() != Record::REVERSE))) { + //record is reverse, only want forward, OR record is forward, wanted reverse + deleteRecord(startRecord); + startRecord = NULL; + continue; + } + if (startRecord->getStrandVal() == Record::UNKNOWN && _desiredStrand != ANY_STRAND) { + //there is an unknown strand, but the user specified strandedness. + deleteRecord(startRecord); + startRecord = NULL; + } + } + + // OK!! We have a start record! Re-evaluate strand requirements for next recored. + + _mustBeForward = _desiredStrand == SAME_STRAND_FORWARD || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::FORWARD)); + _mustBeReverse = _desiredStrand == SAME_STRAND_REVERSE || (_desiredStrand == SAME_STRAND_EITHER && (startRecord->getStrandVal() == Record::REVERSE)); + bool mustKeepOpposite = (_desiredStrand == SAME_STRAND_EITHER); + + const QuickString &currChrom = startRecord->getChrName(); + _foundChroms.insert(currChrom); + + bool madeComposite = false; + recList->push_back(startRecord); + recList->setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. + + Record::strandType currStrand = startRecord->getStrandVal(); + bool mustMatchStrand = _desiredStrand != ANY_STRAND; + + int currEnd = startRecord->getEndPos(); + //now look for more records to merge with this one. + //stop when they're out of range, not on the same chromosome, or we hit EOF. + //ignore if they don't comply with strand. + Record *nextRecord = NULL; + while (nextRecord == NULL) { + bool takenFromStorage = false; + nextRecord = mustMatchStrand ? 
tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); + if (nextRecord == NULL) { + nextRecord = FileRecordMgr::getNextRecord(); + } else { + takenFromStorage = true; + } + if (nextRecord == NULL) { // EOF hit + break; + } + //delete any record from file with an unknown strand if we are doing stranded merge, but first check + //that it's chrom was the same and it's not out of range. If either is true, stop scanning. + bool mustDelete = (mustMatchStrand && nextRecord->getStrandVal() == Record::UNKNOWN); + + //check that we are still on the same chromosome. + const QuickString &newChrom = nextRecord->getChrName(); + if (newChrom != currChrom) { //hit a different chromosome. + if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { + //haven't seen this chromosome before, sort order is already enforced in the base class method. + if (!mustDelete) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + break; + } + } + + //check whether it's in range + int nextStart = nextRecord->getStartPos(); + if (nextStart > currEnd + _maxDistance) { + //no, it's out of range. + if (!mustDelete) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + break; + } + + // NOW, going back, we can delete any unknown strand records. But don't stop scanning. + if (mustDelete) { + deleteRecord(nextRecord); + nextRecord = NULL; + continue; + } + //if taken from file, and wrong strand, store or delete. + if (!takenFromStorage && ((_mustBeForward && (nextRecord->getStrandVal() != Record::FORWARD)) || (_mustBeReverse && (nextRecord->getStrandVal() != Record::REVERSE)))) { + if (mustKeepOpposite) { + addToStorage(nextRecord); + } else { + deleteRecord(nextRecord); + } + nextRecord = NULL; + continue; //get the next record + } + //ok, they're on the same chrom and in range, and the strand is good. Do a merge. + recList->push_back(nextRecord); + madeComposite = true; + int nextEnd = nextRecord->getEndPos(); + if (nextEnd > currEnd) { + currEnd = nextEnd; + } + nextRecord = NULL; + } + if (madeComposite) { + Record *newKey = _recordMgr->allocateRecord(); + (*newKey) = (*startRecord); + newKey->setEndPos(currEnd); + recList->setKey(newKey); + } + _totalMergedRecordLength += (unsigned long)(recList->getKey()->getEndPos() - recList->getKey()->getStartPos()); + return const_cast<Record *>(recList->getKey()); +} + +void FileRecordMergeMgr::addToStorage(Record *record) { + //if the strand requirements are strict, and the record doesn't match, + //store in the "round file". 
+ + if ((_desiredStrand == SAME_STRAND_FORWARD && record->getStrandVal() != Record::FORWARD) || + (_desiredStrand == SAME_STRAND_REVERSE && record->getStrandVal() != Record::REVERSE) || + (_desiredStrand != ANY_STRAND && record->getStrandVal() == Record::UNKNOWN)) { + deleteRecord(record); + return; + } + _storedRecords.push(record); +} + +Record *FileRecordMergeMgr::tryToTakeFromStorage() { + Record *record = _storedRecords.top(); + if (record != NULL) { + _storedRecords.pop(); + } + return record; +} + +Record *FileRecordMergeMgr::tryToTakeFromStorage(Record::strandType strand) { + Record *record = _storedRecords.top(strand); + if (record != NULL) { + _storedRecords.pop(strand); + } + return record; +} + +void FileRecordMergeMgr::deleteMergedRecord(RecordKeyList &recList) +{ + deleteAllMergedItemsButKey(recList); + deleteRecord(recList.getKey()); + recList.setKey(NULL); +} + +bool FileRecordMergeMgr::eof(){ + return (_fileReader->eof() && _storedRecords.empty()); +} + + +void FileRecordMergeMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { + //if the key is also in the list, this method won't delete it. + for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { + if (iter->value() == recList.getKey()) { + continue; + } + deleteRecord(iter->value()); + } + recList.clearList(); +} + + diff --git a/src/utils/FileRecordTools/FileRecordMergeMgr.h b/src/utils/FileRecordTools/FileRecordMergeMgr.h new file mode 100644 index 0000000000000000000000000000000000000000..aecdeefd856debcbc29dbf352165b1a78c936f15 --- /dev/null +++ b/src/utils/FileRecordTools/FileRecordMergeMgr.h @@ -0,0 +1,58 @@ +/* + * FileRecordMergeMgr.h + * + * Created on: Mar 19, 2014 + * Author: nek3d + */ + +#ifndef FILERECORDMERGEMGR_H_ +#define FILERECORDMERGEMGR_H_ + +#include "FileRecordMgr.h" +#include "StrandQueue.h" + +class FileRecordMergeMgr : public FileRecordMgr { + +public: + FileRecordMergeMgr(const QuickString & filename); + + ////////////////////////////////////////////////////////////////////////////////// + // + // MERGED RECORDS + // + // This will give a single "meta" record containing "flattened" or merged records. + // + // Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, + // second is list of constituent Records merged. + // + /////////////////////////////////////////////////////////////////////////////////// + + Record *getNextRecord(RecordKeyList *keyList = NULL); + void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! 
+ + bool eof(); + + + typedef enum { SAME_STRAND_FORWARD, //must all be forward strand + SAME_STRAND_REVERSE, //must all be reverse strand + SAME_STRAND_EITHER, //must be same strand, but can be either forward or reverse + ANY_STRAND } //do no care about strand (Default value) + WANTED_STRAND_TYPE; + + void setStrandType(WANTED_STRAND_TYPE strand) { _desiredStrand = strand; } + void setMaxDistance(int maxDistance) { _maxDistance = maxDistance; } + +private: + + WANTED_STRAND_TYPE _desiredStrand; + int _maxDistance; + StrandQueue _storedRecords; + + void deleteAllMergedItemsButKey(RecordKeyList &recList); + void addToStorage(Record *record); + Record *tryToTakeFromStorage(); + Record *tryToTakeFromStorage(Record::strandType strand); +}; + + +#endif /* FILERECORDMERGEMGR_H_ */ diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 38fc056b483236611748943ebcce4a7228f0a95f..56417fbb4e016fa18cafb5e7c4a142731d86d4a9 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -4,7 +4,7 @@ #include "Record.h" #include "NewGenomeFile.h" -FileRecordMgr::FileRecordMgr(const QuickString &filename, bool isSorted) +FileRecordMgr::FileRecordMgr(const QuickString &filename) : _filename(filename), _bufStreamMgr(NULL), @@ -12,7 +12,7 @@ FileRecordMgr::FileRecordMgr(const QuickString &filename, bool isSorted) _fileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), _recordType(FileRecordTypeChecker::UNKNOWN_RECORD_TYPE), _recordMgr(NULL), - _isSortedInput(isSorted), + _isSortedInput(false), _freeListBlockSize(512), _useFullBamTags(false), _prevStart(INT_MAX), @@ -88,7 +88,7 @@ bool FileRecordMgr::eof(){ return _fileReader->eof(); } -Record *FileRecordMgr::allocateAndGetNextRecord() +Record *FileRecordMgr::getNextRecord(RecordKeyList *keyList) { if (!_fileReader->isOpen()) { return NULL; @@ -120,6 +120,9 @@ Record *FileRecordMgr::allocateAndGetNextRecord() } assignChromId(record); _totalRecordLength += (unsigned long)(record->getEndPos() - record->getStartPos()); + if (keyList != NULL) { + keyList->setKey(record); + } return record; } @@ -198,9 +201,14 @@ void FileRecordMgr::deleteRecord(const Record *record) { _recordMgr->deleteRecord(record); } +void FileRecordMgr::deleteRecord(RecordKeyList *keyList) { + _recordMgr->deleteRecord(keyList->getKey()); +} + void FileRecordMgr::allocateFileReader() { switch (_fileType) { + case FileRecordTypeChecker::EMPTY_FILE_TYPE: case FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE: case FileRecordTypeChecker::VCF_FILE_TYPE: _fileReader = new SingleLineDelimTextFileReader(_bufStreamMgr->getTypeChecker().getNumFields(), _bufStreamMgr->getTypeChecker().getDelimChar()); @@ -224,175 +232,3 @@ const BamTools::RefVector & FileRecordMgr::getBamReferences() { } return static_cast<BamFileReader *>(_fileReader)->getReferences(); } - -#ifdef false - -Record *FileRecordMgr::allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand, int maxDistance) { - RecordKeyList recList; - if (!allocateAndGetNextMergedRecord(recList, desiredStrand, maxDistance)) { - return NULL; - } - deleteAllMergedItemsButKey(recList); - return const_cast<Record *>(recList.getKey()); //want key to be non-const -} - -bool FileRecordMgr::allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand, int maxDistance) -{ - if (!recList.allClear()) { - deleteMergedRecord(recList); - } - - _mustBeForward = desiredStrand == SAME_STRAND_FORWARD; - _mustBeReverse = desiredStrand == 
SAME_STRAND_REVERSE; - - Record *startRecord = tryToTakeFromStorage(); - - // if we couldn't use a previously stored record for starters, - //then begin with a new one that matches strand criteria. - while (startRecord == NULL) { - startRecord = allocateAndGetNextRecord(); - if (startRecord == NULL) { //hit EOF!! - return false; - } - - if (_mustBeForward && !startRecord->getStrand()) { - //record is reverse, wanted forward. - addToStorage(startRecord); - startRecord = NULL; - } else if (_mustBeReverse && startRecord->getStrand()) { - //record is forward, wanted reverse - addToStorage(startRecord); - startRecord = NULL; - } - } - - // OK!! We have a start record! - - _mustBeForward = desiredStrand == SAME_STRAND_FORWARD || (desiredStrand == SAME_STRAND_EITHER && startRecord->getStrand()); - _mustBeReverse = desiredStrand == SAME_STRAND_REVERSE || (desiredStrand == SAME_STRAND_EITHER && !startRecord->getStrand()); - - const QuickString &currChrom = startRecord->getChrName(); - _foundChroms.insert(currChrom); - - bool madeComposite = false; - recList.push_back(startRecord); - recList.setKey(startRecord); //key of recList will just be the startRecord unless we're able to merge more. - - bool currStrand = startRecord->getStrand(); - bool mustMatchStrand = desiredStrand != ANY_STRAND; - - int currEnd = startRecord->getEndPos(); - //now look for more records to merge with this one. - //stop when they're out of range, not on the same chromosome, or we hit EOF. - //ignore if they don't comply with strand. - Record *nextRecord = NULL; - while (nextRecord == NULL) { - bool takenFromStorage = false; - nextRecord = mustMatchStrand ? tryToTakeFromStorage(currStrand) : tryToTakeFromStorage(); - if (nextRecord == NULL) { - nextRecord = allocateAndGetNextRecord(); - } else { - takenFromStorage = true; - } - if (nextRecord == NULL) { // EOF hit - break; - } - const QuickString &newChrom = nextRecord->getChrName(); - if (newChrom != currChrom) { //hit a different chromosome. - if (_foundChroms.find(newChrom) == _foundChroms.end() || takenFromStorage) { - //haven't seen this chromosome before. - addToStorage(nextRecord); - break; - } else { - //different strand, but we've already seen this chrom. File is not sorted. - fprintf(stderr, "ERROR: Input file %s is not sorted by chromosome, startPos.\n", _context->getInputFileName(_contextFileIdx).c_str()); - deleteRecord(nextRecord); - deleteMergedRecord(recList); - exit(1); - } - } - int nextStart = nextRecord->getStartPos(); - //is the record out of range? - if (nextStart > currEnd + maxDistance) { - //yes, it's out of range. - addToStorage(nextRecord); - break; - } - - //ok, they're on the same chrom and in range. Are we happy with the strand? - if (mustMatchStrand && nextRecord->getStrand() != currStrand) { - //no, we're not. - addToStorage(nextRecord); - nextRecord = NULL; - continue; - } - //everything's good! do a merge. 
- recList.push_back(nextRecord); - madeComposite = true; - int nextEnd = nextRecord->getEndPos(); - if (nextEnd > currEnd) { - currEnd = nextEnd; - } - nextRecord = NULL; - } - if (madeComposite) { - Record *newKey = _recordMgr->allocateRecord(); - (*newKey) = (*startRecord); - newKey->setEndPos(currEnd); - recList.setKey(newKey); - } - _totalMergedRecordLength += (unsigned long)(recList.getKey()->getEndPos() - recList.getKey()->getStartPos()); - return true; -} - -void FileRecordMgr::addToStorage(Record *record) { - _storedRecords.push(record); -} - -Record *FileRecordMgr::tryToTakeFromStorage() { - Record *record = _storedRecords.empty() ? NULL : const_cast<Record *>(_storedRecords.top()); - if (record != NULL) { - _storedRecords.pop(); - } - return record; -} - -Record *FileRecordMgr::tryToTakeFromStorage(bool strand) { - Record *record = NULL; - if(strand) { - if (_storedRecords.emptyForward()) { - return NULL; - } else { - record = const_cast<Record *>(_storedRecords.topForward()); - _storedRecords.popForward(); - return record; - } - } else { - if (_storedRecords.emptyReverse()) { - return NULL; - } else { - record = const_cast<Record *>(_storedRecords.topReverse()); - _storedRecords.popReverse(); - return record; - } - } -} - -void FileRecordMgr::deleteMergedRecord(RecordKeyList &recList) -{ - deleteAllMergedItemsButKey(recList); - deleteRecord(recList.getKey()); - recList.setKey(NULL); -} - -void FileRecordMgr::deleteAllMergedItemsButKey(RecordKeyList &recList) { - //if the key is also in the list, this method won't delete it. - for (RecordKeyList::const_iterator_type iter = recList.begin(); iter != recList.end(); iter = recList.next()) { - if (iter->value() == recList.getKey()) { - continue; - } - deleteRecord(iter->value()); - } - recList.clearList(); -} -#endif diff --git a/src/utils/FileRecordTools/FileRecordMgr.h b/src/utils/FileRecordTools/FileRecordMgr.h index b4e76e92f1e7e1618d67d28dcedba1581862024c..24aca9a04af524974037e5660c3dcc309a302045 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.h +++ b/src/utils/FileRecordTools/FileRecordMgr.h @@ -32,11 +32,19 @@ class NewGenomeFile; class FileRecordMgr { public: - FileRecordMgr(const QuickString & filename, bool isSorted = false); - ~FileRecordMgr(); + FileRecordMgr(const QuickString & filename); + virtual ~FileRecordMgr(); bool open(); void close(); - bool eof(); + virtual bool eof(); + + //This is an all-in-one method to give the user a new record that is initialized with + //the next entry in the data file. + //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! + //Also Note! User must check for NULL returned, meaning we failed to get the next record. + virtual Record *getNextRecord(RecordKeyList *keyList = NULL); + void deleteRecord(const Record *); + virtual void deleteRecord(RecordKeyList *keyList); const QuickString &getFileName() const { return _filename;} bool hasHeader() const { return _fileReader->hasHeader(); } @@ -69,55 +77,6 @@ public: const BamTools::RefVector &getBamReferences(); int getNumFields() const { return _fileReader->getNumFields(); } - //This is an all-in-one method to give the user a new record that is initialized with - //the next entry in the data file. - //NOTE!! User MUST pass back the returned pointer to deleteRecord method for cleanup! - //Also Note! User must check for NULL returned, meaning we failed to get the next record. 
- Record *allocateAndGetNextRecord(); - void deleteRecord(const Record *); - -#ifdef false - ////////////////////////////////////////////////////////////////////////////////// - // - // MERGED RECORDS - // - //this will give a single "meta" record containing "flattened" or merged records. - // - // 1st ARG: Pass an empty RecordKeyList. When done, will have a pair: 1st is the final merged record, - // second is list of constituent Records merged. - // ** NOTE ** If the RecordKeyList is not empty, this method will empty it for you and delete all contents! - // - // 2nd ARG: Choose from WANT_STRAND_TYPE, defined below below - // - // 3rd ARG: allows for nearby records, i.e. maxDistance 100 will merge records <= 100 bases apart. Default 0 means only - // merge records that actually intersect. - // - // Return value: true if any records found. False if eof hit before records matching requested parameters found. - - typedef enum { SAME_STRAND_FORWARD, //must all be forward strand - SAME_STRAND_REVERSE, //must all be reverse strand - SAME_STRAND_EITHER, //must be same strand, but can be either forward or reverse - ANY_STRAND } //do no care about strand (Default value) - WANT_STRAND_TYPE; - - // - // WARNING!! Specifying a strand will keep all records on the other strand in memory!! - // This is done so that requests for records on that other strand can still be met. - // For now, use this method at any time to purge the kept records from memory, such as - // when changing chromosomes, for example. - void purgeKeepList(); - bool allocateAndGetNextMergedRecord(RecordKeyList & recList, WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); - void deleteMergedRecord(RecordKeyList &recList); // MUST use this method for cleanup! - - //this method will allocate a new record of merged records, but the returned record should only be passed to the deleteRecord method - //for cleanup, not to the delete mmerged record. 
- Record *allocateAndGetNextMergedRecord(WANT_STRAND_TYPE desiredStrand = ANY_STRAND, int maxDistance = 0); - - // - // END MERGED RECORDS - // - ////////////////////////////////////////////////////////////////////////////////// -#endif //File statistics unsigned long getTotalRecordLength() const { return _totalRecordLength; } //sum of length of all returned records @@ -140,7 +99,9 @@ public: _hasGenomeFile = true; } -private: + void setIsSorted(bool val) { _isSortedInput = val; } + +protected: QuickString _filename; BufferedStreamMgr *_bufStreamMgr; @@ -158,8 +119,6 @@ private: int _prevStart; int _prevChromId; - //members for handling merged records -// DualQueue<Record *, DualQueueAscending > _storedRecords; bool _mustBeForward; bool _mustBeReverse; @@ -177,16 +136,6 @@ private: void testInputSortOrder(Record *record); void assignChromId(Record *); void sortError(const Record *record, bool genomeFileError); - - -#ifdef false - void deleteAllMergedItemsButKey(RecordKeyList &recList); - void addToStorage(Record *record); - Record *tryToTakeFromStorage(); - Record *tryToTakeFromStorage(bool strand); -#endif - - }; diff --git a/src/utils/FileRecordTools/Makefile b/src/utils/FileRecordTools/Makefile index 9d021179772c29991565b157bb9aba7d0063e40f..fce099b8dc482a1db78459adb2c51d946ad47b2a 100644 --- a/src/utils/FileRecordTools/Makefile +++ b/src/utils/FileRecordTools/Makefile @@ -21,8 +21,8 @@ SUBDIRS = ./FileReaders \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= FileRecordMgr.cpp FileRecordMgr.h -OBJECTS= FileRecordMgr.o RecordOutputMgr.o +SOURCES= FileRecordMgr.cpp FileRecordMgr.h FileRecordMergeMgr.cpp FileRecordMergeMgr.h +OBJECTS= FileRecordMgr.o FileRecordMergeMgr.o _EXT_OBJECTS=SingleLineDelimTextFileReader.o BamFileReader.o Bed3Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ SingleLineDelimTransferBuffer.o FileRecordTypeChecker.o QuickString.o ParseTools.o RecordKeyList.o BufferedStreamMgr.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) @@ -31,6 +31,8 @@ BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) $(BUILT_OBJECTS): $(SOURCES) $(SUBDIRS) @echo " * compiling FileRecordMgr.cpp" @$(CXX) -c -o $(OBJ_DIR)/FileRecordMgr.o FileRecordMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + @echo " * compiling FileRecordMergeMgr.cpp" + @$(CXX) -c -o $(OBJ_DIR)/FileRecordMergeMgr.o FileRecordMergeMgr.cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) @@ -42,10 +44,8 @@ $(SUBDIRS): $(OBJ_DIR) clean: @echo "Cleaning up." 
@rm -f $(OBJ_DIR)/FileRecordMgr.o - @rm -f $(OBJ_DIR)/RecordMgr.o @rm -f $(OBJ_DIR)/FileRecordTypeChecker.o - @rm -f $(OBJ_DIR)/SingleLineDelimTextFileReader.o - @rm -f $(OBJ_DIR)/SingleLineDelimTransferBuffer.o + @rm -f $(OBJ_DIR)/FileRecordMergeMgr.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp index 21b060df73b71af677bde159f1a980c73b48bbf8..d63687130be1ba5e2ea62fe5f0d9187be8571104 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.cpp +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -60,11 +60,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) return; } - vector<QuickString> sizes; - vector<QuickString> starts; - - int sizeCount = Tokenize(keyRecord->getBlockSizes(), sizes, ',', blockCount); - int startCount = Tokenize(keyRecord->getBlockStarts(), starts, ',', blockCount); + int sizeCount = _blockSizeTokens.tokenize(keyRecord->getBlockSizes(), ','); + int startCount = _blockStartTokens.tokenize(keyRecord->getBlockStarts(), ','); if (blockCount != sizeCount || sizeCount != startCount) { fprintf(stderr, "Error: found wrong block counts while splitting entry.\n"); @@ -72,8 +69,8 @@ void BlockMgr::getBlocksFromBed12(RecordKeyList &keyList, bool &mustDelete) } for (int i=0; i < blockCount; i++) { - int startPos = keyRecord->getStartPos() + str2chrPos(starts[i].c_str()); - int endPos = startPos + str2chrPos(sizes[i].c_str()); + int startPos = keyRecord->getStartPos() + str2chrPos(_blockStartTokens.getElem(i).c_str()); + int endPos = startPos + str2chrPos(_blockSizeTokens.getElem(i).c_str()); const Record *record = allocateAndAssignRecord(keyRecord, startPos, endPos); keyList.push_back(record); diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h index c83b1e0125284bb5743c95ff5d3cc91708a06d04..bf6f116f5a0f060a912aec16d4fb6910102f9ea8 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.h +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -16,6 +16,7 @@ using namespace std; #include "FileRecordTypeChecker.h" #include "RecordKeyList.h" + class RecordMgr; class BlockMgr { @@ -50,6 +51,8 @@ private: float _overlapFraction; bool _hasReciprocal; + Tokenizer _blockSizeTokens; + Tokenizer _blockStartTokens; // For now, all records will be split into Bed6 records. const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; diff --git a/src/utils/FileRecordTools/Records/EmptyRecord.cpp b/src/utils/FileRecordTools/Records/EmptyRecord.cpp new file mode 100644 index 0000000000000000000000000000000000000000..81d16864b3ac6ce57ac2a901c32a8c21a76d730c --- /dev/null +++ b/src/utils/FileRecordTools/Records/EmptyRecord.cpp @@ -0,0 +1,34 @@ +/* + * EmptyRecord.cpp + * + * Created on: Apr 14, 2014 + * Author: nek3d + */ + + +#include "EmptyRecord.h" +#include "SingleLineDelimTextFileReader.h" + +//Technically, there is nothing to do here. Just adding the file +//to keep make happy. 
+ +EmptyRecord::EmptyRecord() { + +} + +EmptyRecord::~EmptyRecord() { + +} + +bool EmptyRecord::initFromFile(FileReader *fileReader) +{ + SingleLineDelimTextFileReader *sldFileReader = static_cast<SingleLineDelimTextFileReader*>(fileReader); + return initFromFile(sldFileReader); +} + + +bool EmptyRecord::initFromFile(SingleLineDelimTextFileReader *fileReader) +{ + return true; +} + diff --git a/src/utils/FileRecordTools/Records/EmptyRecord.h b/src/utils/FileRecordTools/Records/EmptyRecord.h new file mode 100644 index 0000000000000000000000000000000000000000..29f6f4654a3fe5508d7605e5c92a7fc59c7baacc --- /dev/null +++ b/src/utils/FileRecordTools/Records/EmptyRecord.h @@ -0,0 +1,30 @@ +/* + * EmptyRecord.h + * + * Created on: Apr 14, 2014 + * Author: nek3d + */ + +#ifndef EMPTYRECORD_H_ +#define EMPTYRECORD_H_ + +#include "Record.h" + +class SingleLineDelimTextFileReader; + +class EmptyRecord : public Record { +public: + //This is really just a place holder that doesn't have to do much. + //But in order for the rest of the code to not have to contain special + //ugly if statements for empty records, it at least has to be instantiable, + //so any pure virtual functions from the base class will have to be overridden. + EmptyRecord(); + ~EmptyRecord(); + + bool initFromFile(FileReader *); + virtual bool initFromFile(SingleLineDelimTextFileReader *); + int getNumFields() const { return 0; } +}; + + +#endif /* EMPTYRECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/Makefile b/src/utils/FileRecordTools/Records/Makefile index 6bfd6106ab52fb820c3759b74fa175639d948511..f471e666f110fa39f3cc4dde99b879ec580e07f1 100644 --- a/src/utils/FileRecordTools/Records/Makefile +++ b/src/utils/FileRecordTools/Records/Makefile @@ -17,13 +17,13 @@ INCLUDES = -I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= RecordMgr.cpp RecordMgr.h Record.h Record.cpp Bed3Interval.h Bed3Interval.cpp \ +SOURCES= RecordMgr.cpp RecordMgr.h Record.h Record.cpp EmptyRecord.h EmptyRecord.cpp Bed3Interval.h Bed3Interval.cpp \ Bed4Interval.h Bed4Interval.cpp BedGraphInterval.h BedGraphInterval.cpp Bed5Interval.h Bed5Interval.cpp \ Bed6Interval.h Bed6Interval.cpp \ BedPlusInterval.h BedPlusInterval.cpp Bed12Interval.h Bed12Interval.cpp BamRecord.h BamRecord.cpp VcfRecord.h VcfRecord.cpp \ - GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp -OBJECTS= RecordMgr.o Record.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ - VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o + GffRecord.h GffRecord.cpp RecordKeyList.h RecordKeyList.cpp BlockMgr.h BlockMgr.cpp StrandQueue.h StrandQueue.cpp +OBJECTS= RecordMgr.o Record.o EmptyRecord.o Bed3Interval.o Bed4Interval.o BedGraphInterval.o Bed5Interval.o Bed6Interval.o BedPlusInterval.o Bed12Interval.o BamRecord.o \ + VcfRecord.o GffRecord.o RecordKeyList.o BlockMgr.o StrandQueue.o _EXT_OBJECTS=ParseTools.o QuickString.o ChromIdLookup.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) @@ -38,8 +38,8 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up."
- @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ + @rm -f $(OBJ_DIR)/RecordMgr.o $(OBJ_DIR)/Record.o $(OBJ_DIR)/EmptyRecord.o $(OBJ_DIR)/Bed3Interval.o $(OBJ_DIR)/Bed4Interval.o \ $(OBJ_DIR)/BedGraphInterval.o $(OBJ_DIR)/Bed5Interval.o $(OBJ_DIR)/Bed6Interval.o \ - $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o + $(OBJ_DIR)/BedPlusInterval.o $(OBJ_DIR)/Bed12Interval.o $(OBJ_DIR)/BamRecord.o $(OBJ_DIR)/VcfRecord.o $(OBJ_DIR)/GffRecord.o $(OBJ_DIR)/BlockMgr.o $(OBJ_DIR)/StrandQueue.o .PHONY: clean \ No newline at end of file diff --git a/src/utils/FileRecordTools/Records/Record.h b/src/utils/FileRecordTools/Records/Record.h index d8071c1ed223d7172bd7faa33e6ff6f514975616..8e4376c1a9f6d2eb112c69d9c87a6b3b742d0a59 100644 --- a/src/utils/FileRecordTools/Records/Record.h +++ b/src/utils/FileRecordTools/Records/Record.h @@ -152,6 +152,9 @@ protected: bool _isMateUnmapped; }; - +class RecordPtrSortFunctor { +public: + bool operator()(const Record *rec1, const Record *rec2) const { return *rec1 > *rec2; } +}; #endif /* RECORD_H_ */ diff --git a/src/utils/FileRecordTools/Records/RecordMgr.cpp b/src/utils/FileRecordTools/Records/RecordMgr.cpp index 9799fedc325d208fa502cee814a1f29f6ca471e6..92d9dc54ad714a47fd65aa4edf929dda78635962 100644 --- a/src/utils/FileRecordTools/Records/RecordMgr.cpp +++ b/src/utils/FileRecordTools/Records/RecordMgr.cpp @@ -8,6 +8,13 @@ RecordMgr::RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize) _freeListBlockSize(blockSize) { switch(_recordType) { + case FileRecordTypeChecker::EMPTY_RECORD_TYPE: + { + _freeList = new FreeList<EmptyRecord>(0); + break; + } + + case FileRecordTypeChecker::BED3_RECORD_TYPE: { _freeList = new FreeList<Bed3Interval>(_freeListBlockSize); @@ -72,6 +79,11 @@ RecordMgr::RecordMgr(FileRecordTypeChecker::RECORD_TYPE recType, int blockSize) RecordMgr::~RecordMgr() { switch(_recordType) { + case FileRecordTypeChecker::EMPTY_RECORD_TYPE: + { + delete (FreeList<EmptyRecord> *)_freeList; + break; + } case FileRecordTypeChecker::BED3_RECORD_TYPE: { delete (FreeList<Bed3Interval> *)_freeList; @@ -138,6 +150,12 @@ Record *RecordMgr::allocateRecord() { Record *record = NULL; switch(_recordType) { + case FileRecordTypeChecker::EMPTY_RECORD_TYPE: + { + EmptyRecord *er = ((FreeList<EmptyRecord> *)_freeList)->newObj(); + record = er; + break; + } case FileRecordTypeChecker::BED3_RECORD_TYPE: { Bed3Interval *b3i = ((FreeList<Bed3Interval> *)_freeList)->newObj(); @@ -215,6 +233,12 @@ Record *RecordMgr::allocateRecord() void RecordMgr::deleteRecord(const Record *record) { switch(_recordType) { + case FileRecordTypeChecker::EMPTY_RECORD_TYPE: + { + ((FreeList<EmptyRecord> *)_freeList)->deleteObj(static_cast<const EmptyRecord *>(record)); + break; + } + case FileRecordTypeChecker::BED3_RECORD_TYPE: { ((FreeList<Bed3Interval> *)_freeList)->deleteObj(static_cast<const Bed3Interval *>(record)); diff --git a/src/utils/FileRecordTools/Records/RecordMgr.h b/src/utils/FileRecordTools/Records/RecordMgr.h index 1d73f9055877fc42f76cc2cf0d10efd498101a51..37193b9c8137c72c64df2d48f3eac4d2e93d52ff 100644 --- a/src/utils/FileRecordTools/Records/RecordMgr.h +++ b/src/utils/FileRecordTools/Records/RecordMgr.h @@ -10,6 +10,7 @@ //include headers for all Records and derivative classes #include "Record.h" +#include "EmptyRecord.h" #include "Bed3Interval.h" #include "Bed4Interval.h" #include "Bed5Interval.h" diff --git 
a/src/utils/FileRecordTools/Records/StrandQueue.cpp b/src/utils/FileRecordTools/Records/StrandQueue.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aa53313b19465c01dd207fdd083d65bc67378643 --- /dev/null +++ b/src/utils/FileRecordTools/Records/StrandQueue.cpp @@ -0,0 +1,131 @@ +/* + * StrandQueue.cpp + * + * Created on: Mar 31, 2014 + * Author: nek3d + */ + +#include "StrandQueue.h" + +StrandQueue::StrandQueue() { + + for (int i=0; i < NUM_QUEUES; i++) { + queueType *queue = new queueType(); + _queues.push_back(queue); + } + _strandIdxs.resize(3); + _strandIdxs[0] = Record::FORWARD; + _strandIdxs[1] = Record::REVERSE; + _strandIdxs[2] = Record::UNKNOWN; +} + +StrandQueue::~StrandQueue() { + for (int i=0; i < NUM_QUEUES; i++) { + delete _queues[i]; + } +} + +Record *StrandQueue::top() const +{ + int minIdx = getMinIdx(); + if (minIdx == -1) return NULL; + return const_cast<Record *>(_queues[minIdx]->top()); +} + +void StrandQueue::pop() { + int minIdx = getMinIdx(); + if (minIdx == -1) return; + _queues[minIdx]->pop(); +} + +Record * StrandQueue::top(Record::strandType strand) const { + const Record *record = NULL; + switch (strand) { + case Record::FORWARD: + if (_queues[0]->empty()) return NULL; + record = _queues[0]->top(); + break; + case Record::REVERSE: + if (_queues[1]->empty()) return NULL; + + record = _queues[1]->top(); + break; + case Record::UNKNOWN: + if (_queues[2]->empty()) return NULL; + record = _queues[2]->top(); + break; + default: + break; + } + return const_cast<Record *>(record); +} + +void StrandQueue::pop(Record::strandType strand) const { + switch (strand) { + case Record::FORWARD: + if (_queues[0]->empty()) return; + _queues[0]->pop(); + break; + case Record::REVERSE: + if (_queues[1]->empty()) return; + _queues[1]->pop(); + break; + case Record::UNKNOWN: + if (_queues[2]->empty()) return; + _queues[2]->pop(); + break; + default: + break; + } +} + +void StrandQueue::push(Record *record) { + switch (record->getStrandVal()) { + case Record::FORWARD: + _queues[0]->push(record); + break; + case Record::REVERSE: + _queues[1]->push(record); + break; + case Record::UNKNOWN: + _queues[2]->push(record); + break; + default: + break; + } +} + +size_t StrandQueue::size() const { + size_t sumSize = 0; + for (int i = 0; i < NUM_QUEUES; i++) { + sumSize += _queues[i]->size(); + } + return sumSize; +} + +bool StrandQueue::empty() const { + for (int i = 0; i < NUM_QUEUES; i++) { + if (!_queues[i]->empty()) { + return false; + } + } + return true; +} + + +int StrandQueue::getMinIdx() const { + if (empty()) return -1; + const Record *minRec = NULL; + int minIdx = -1; + for (int i = 0; i < NUM_QUEUES; i++) { + if (_queues[i]->empty()) continue; + const Record *currTop = _queues[i]->top(); + if (currTop == NULL) continue; + if (minRec == NULL || *currTop < *minRec) { + minRec = currTop; + minIdx = i; + } + } + return minIdx; +} + diff --git a/src/utils/FileRecordTools/Records/StrandQueue.h b/src/utils/FileRecordTools/Records/StrandQueue.h new file mode 100644 index 0000000000000000000000000000000000000000..e6f0bb49d7eab3160850ba03f13958b867054f37 --- /dev/null +++ b/src/utils/FileRecordTools/Records/StrandQueue.h @@ -0,0 +1,47 @@ +/* + * StrandQueue.h + * + * Created on: Jan 29, 2013 + * Author: nek3d + */ +#ifndef STRANDQUEUE_H_ +#define STRANDQUEUE_H_ + +using namespace std; + +#include <vector> +#include <queue> +#include <cstdio> +#include <cstdlib> +#include "Record.h" + +class StrandQueue { +public: + StrandQueue(); + ~StrandQueue(); + + Record * top()
const; + void pop(); + Record * top(Record::strandType strand) const; + void pop(Record::strandType strand) const; + void push(Record *record); + size_t size() const; + bool empty() const; + +private: +// static RecordPtrSortFunctor _recSortFunctor; + typedef priority_queue<Record *, vector<const Record *>, RecordPtrSortFunctor > queueType; + vector<queueType *> _queues; + static const int NUM_QUEUES = 3; + + //we want to be able to iterate over the enumerated strand types in Record.h, + //which are FORWARD, REVERSE, and UNKNOWN. However, iterating over an enum is hard to + //do, so we'll use a suggestion found in a forum, and put the enum values into a vector. + vector<Record::strandType> _strandIdxs; + + int getMinIdx() const; //will return the idx of queue with the current min val. + +}; + + +#endif // STRANDQUEUE_H_ diff --git a/src/utils/FileRecordTools/Records/recordsTar.tar.gz b/src/utils/FileRecordTools/Records/recordsTar.tar.gz deleted file mode 100644 index 321a88e4935d73f7034cce0611f9191d27bfeaa2..0000000000000000000000000000000000000000 Binary files a/src/utils/FileRecordTools/Records/recordsTar.tar.gz and /dev/null differ diff --git a/src/utils/GenomeFile/NewGenomeFile.cpp b/src/utils/GenomeFile/NewGenomeFile.cpp index d100759707837bd0b1426437a42477b2778098ff..84e44cbed97dcdbd7c0fdb760dc02edca634bad8 100644 --- a/src/utils/GenomeFile/NewGenomeFile.cpp +++ b/src/utils/GenomeFile/NewGenomeFile.cpp @@ -11,6 +11,7 @@ ******************************************************************************/ #include "NewGenomeFile.h" #include "ParseTools.h" +#include "Tokenizer.h" NewGenomeFile::NewGenomeFile(const QuickString &genomeFilename) : _maxId(-1) @@ -44,21 +45,20 @@ void NewGenomeFile::loadGenomeFileIntoMap() { exit(1); } string sLine; - vector<QuickString> fields; + Tokenizer fieldTokens; CHRPOS chrSize = 0; QuickString chrName; while (!genFile.eof()) { sLine.clear(); - fields.clear(); chrSize = 0; chrName.clear(); getline(genFile, sLine); - Tokenize(sLine.c_str(), fields); - if (fields.size() != 2) { + int numFields = fieldTokens.tokenize(sLine.c_str()); + if (numFields != 2) { continue; } - chrName = fields[0]; - chrSize = str2chrPos(fields[1]); + chrName = fieldTokens.getElem(0); + chrSize = str2chrPos(fieldTokens.getElem(1)); _maxId++; _chromSizeIds[chrName] = pair<CHRPOS, int>(chrSize, _maxId); _startOffsets.push_back(_genomeLength); diff --git a/src/utils/KeyListOps/KeyListOps.cpp b/src/utils/KeyListOps/KeyListOps.cpp index 657635000e5313c200567d25ea36a29623bd4056..a3b25131aaff6845bcbda3496a359d232f572de1 100644 --- a/src/utils/KeyListOps/KeyListOps.cpp +++ b/src/utils/KeyListOps/KeyListOps.cpp @@ -98,25 +98,28 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { //member of each pair is a column number, and the second member is the code for the //operation to perform on that column. - vector<QuickString> columnsVec; - vector<QuickString> opsVec; - int numCols = Tokenize(_columns, columnsVec, ','); - int numOps = Tokenize(_operations, opsVec, ','); + Tokenizer colTokens; + Tokenizer opsTokens; + + int numCols = colTokens.tokenize(_columns, ','); + int numOps = opsTokens.tokenize(_operations, ','); if (numOps < 1 || numCols < 1) { cerr << endl << "*****" << endl << "***** ERROR: There must be at least one column and at least one operation named." 
<< endl; return false; } - if (numOps > 1 && numCols != numOps) { + if (numOps > 1 && numCols > 1 && numCols != numOps) { cerr << endl << "*****" << endl << "***** ERROR: There are " << numCols <<" columns given, but there are " << numOps << " operations." << endl; cerr << "\tPlease provide either a single operation that will be applied to all listed columns, " << endl; + cerr << "\ta single column to which all operations will be applied," << endl; cerr << "\tor an operation for each column." << endl; return false; } - for (int i=0; i < (int)columnsVec.size(); i++) { - int col = str2chrPos(columnsVec[i]); + int loop = max(numCols, numOps); + for (int i=0; i < loop; i++) { + int col = str2chrPos(colTokens.getElem(numCols > 1 ? i : 0)); //check that the column number is valid if (col < 1 || col > dbFile->getNumFields()) { @@ -124,7 +127,7 @@ bool KeyListOps::isValidColumnOps(FileRecordMgr *dbFile) { << dbFile->getFileName() << " only has fields 1 - " << dbFile->getNumFields() << "." << endl; return false; } - const QuickString &operation = opsVec.size() > 1 ? opsVec[i] : opsVec[0]; + const QuickString &operation = opsTokens.getElem(numOps > 1 ? i : 0); OP_TYPES opCode = getOpCode(operation); if (opCode == INVALID) { cerr << endl << "*****" << endl @@ -361,4 +364,33 @@ const QuickString & KeyListOps::getOpVals(RecordKeyList &hits) return _outVals; } +void KeyListOpsHelp() { + cerr << "\t-c\t" << "Specify columns from the B file to map onto intervals in A." << endl; + cerr << "\t\tDefault: 5." << endl; + cerr << "\t\tMultiple columns can be specified in a comma-delimited list." << endl << endl; + + cerr << "\t-o\t" << "Specify the operation that should be applied to -c." << endl; + cerr << "\t\tValid operations:" << endl; + cerr << "\t\t sum, min, max, absmin, absmax," << endl; + cerr << "\t\t mean, median," << endl; + cerr << "\t\t collapse (i.e., print a delimited list (duplicates allowed)), " << endl; + cerr << "\t\t distinct (i.e., print a delimited list (NO duplicates allowed)), " << endl; + cerr << "\t\t count" << endl; + cerr << "\t\t count_distinct (i.e., a count of the unique values in the column), " << endl; + cerr << "\t\tDefault: sum" << endl; + cerr << "\t\tMultiple operations can be specified in a comma-delimited list." << endl << endl; + + cerr << "\t\tIf there is only one column, but multiple operations, all operations will be" << endl; + cerr << "\t\tapplied on that column. Likewise, if there is only one operation, but" << endl; + cerr << "\t\tmultiple columns, that operation will be applied to all columns." << endl; + cerr << "\t\tOtherwise, the number of columns must match the number of operations," << endl; + cerr << "\t\tand will be applied in respective order." << endl; + cerr << "\t\tE.g., \"-c 5,4,6 -o sum,mean,count\" will give the sum of column 5," << endl; + cerr << "\t\tthe mean of column 4, and the count of column 6." << endl; + cerr << "\t\tThe order of output columns will match the ordering given in the command." << endl << endl<<endl; + + cerr << "\t-delim\t" << "Specify a custom delimiter for the collapse operations." << endl; + cerr << "\t\t- Example: -delim \"|\"" << endl; + cerr << "\t\t- Default: \",\"."
<< endl << endl; +} diff --git a/src/utils/KeyListOps/KeyListOps.h b/src/utils/KeyListOps/KeyListOps.h index 3c26d2c20b5ac49d4c608fc6ae7dd98653dfb932..5c5ea63c6780d0012b03bc510c7e11188898be55 100644 --- a/src/utils/KeyListOps/KeyListOps.h +++ b/src/utils/KeyListOps/KeyListOps.h @@ -12,16 +12,33 @@ class FileRecordMgr; +//print help message +void KeyListOpsHelp(); + class KeyListOps { public: KeyListOps(); void setColumns(const QuickString &columns) { _columns = columns; } + void addColumns(const QuickString &newCols) { + if (!_columns.empty()) _columns += ","; + _columns += newCols; + } void setOperations(const QuickString & operation) { _operations = operation; } + void addOperations(const QuickString &newOps) { + if (!_operations.empty()) _operations += ","; + _operations += newOps; + } + void setNullValue(const QuickString & nullValue) { _methods.setNullValue(nullValue); } void setDelimStr(const QuickString & delimStr) { _methods.setDelimStr(delimStr); } + const QuickString &getColumns() { return _columns; } + const QuickString &getOperations() { return _operations; } + const QuickString &getNullValue() { return _methods.getNullValue(); } + const QuickString &getDelimStr() { return _methods.getDelimStr(); } + void setKeyList(RecordKeyList *keyList) { _methods.setKeyList(keyList); } typedef enum { SUM, MEAN, STDDEV, SAMPLE_STDDEV, MEDIAN, MODE, ANTIMODE, MIN, MAX, ABSMIN, ABSMAX, COUNT, DISTINCT, COUNT_DISTINCT, diff --git a/src/utils/NewChromsweep/NewChromsweep.cpp b/src/utils/NewChromsweep/NewChromsweep.cpp index 5e6045be625799cca515ac056e9a17b33f1908a6..34647d678c448ccdfefbe32adcd856a81476b58b 100644 --- a/src/utils/NewChromsweep/NewChromsweep.cpp +++ b/src/utils/NewChromsweep/NewChromsweep.cpp @@ -41,9 +41,9 @@ bool NewChromSweep::init() { _databaseFRM = _context->getFile(_context->getDatabaseFileIdx()); nextRecord(false); - if (_currDatabaseRec == NULL) { - return false; - } +// if (_currDatabaseRec == NULL) { +// return false; +// } //determine whether to stop when the database end is hit, or keep going until the //end of the query file is hit as well. @@ -185,7 +185,7 @@ bool NewChromSweep::next(RecordKeyList &next) { void NewChromSweep::nextRecord(bool query) { if (query) { // if (!_context->getUseMergedIntervals()) { - _currQueryRec = _queryFRM->allocateAndGetNextRecord(); + _currQueryRec = _queryFRM->getNextRecord(); // } else { // _currQueryRec = _queryFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); // } @@ -194,7 +194,7 @@ void NewChromSweep::nextRecord(bool query) { } } else { //database // if (!_context->getUseMergedIntervals()) { - _currDatabaseRec = _databaseFRM->allocateAndGetNextRecord(); + _currDatabaseRec = _databaseFRM->getNextRecord(); // } else { // _currDatabaseRec = _databaseFRM->allocateAndGetNextMergedRecord(_context->getSameStrand() ? FileRecordMgr::SAME_STRAND_EITHER : FileRecordMgr::ANY_STRAND); // } diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp index 55cb70ec9aa74af58b33e5faf32d18b19c0b0bae..9aaa511bc3d2e10c73c15706bbd42e29808f435a 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.cpp +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.cpp @@ -37,6 +37,10 @@ RecordOutputMgr::RecordOutputMgr() RecordOutputMgr::~RecordOutputMgr() { + // In the rare case when a file had a header but was otherwise empty, + // we'll need to make a last check to see if the header still needs to be printed. 
+ checkForHeader(); + if (_outBuf.size() > 0) { flush(); } @@ -76,9 +80,15 @@ bool RecordOutputMgr::printKeyAndTerminate(RecordKeyList &keyList) { if (bamCode == BAM_AS_BAM) { return true; } else if (bamCode == NOT_BAM) { - keyList.getKey()->print(_outBuf); + if (_context->getProgram() == ContextBase::MERGE) { + //when printing merged records, we want to force the printing into + //bed3 format, which is surprisingly difficult to do. Had to use the following: + const Bed3Interval *bed3 = static_cast<const Bed3Interval *>(keyList.getKey()); + bed3->Bed3Interval::print(_outBuf); + } else { + keyList.getKey()->print(_outBuf); + } return false; - } //otherwise, it was BAM_AS_BED, and the key was printed. return false; @@ -114,8 +124,12 @@ void RecordOutputMgr::printRecord(const Record *record) void RecordOutputMgr::printRecord(const Record *record, const QuickString & value) { + _afterVal = value; printRecord(record); - _outBuf.append(value); + if (!value.empty()) { + tab(); + _outBuf.append(value); + } newline(); if (needsFlush()) { @@ -145,9 +159,8 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } //The first time we print a record is when we print any header, because the header //hasn't been read from the query file until after the first record has also been read. - if (_context->getPrintHeader()) { - checkForHeader(); - } + checkForHeader(); + const_cast<Record *>(keyList.getKey())->undoZeroLength(); _currBamBlockList = blockList; @@ -201,8 +214,21 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi _currBamBlockList = NULL; return; } else if (_context->getProgram() == ContextBase::MAP) { + printKeyAndTerminate(keyList); + _currBamBlockList = NULL; + return; + } else if (_context->getProgram() == ContextBase::MERGE) { + printKeyAndTerminate(keyList); + _currBamBlockList = NULL; + return; + } else if (_context->getProgram() == ContextBase::MERGE) { if (!printKeyAndTerminate(keyList)) { - tab(); + if (_context->getDesiredStrand() != FileRecordMergeMgr::ANY_STRAND) { + //add the sign of the record + tab(); + _outBuf.append(keyList.getKey()->getStrand()); + } + if (!_afterVal.empty()) tab(); } _currBamBlockList = NULL; return; @@ -210,6 +236,9 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } void RecordOutputMgr::checkForHeader() { + // Do we need to print a header? + if (!_context->getPrintHeader()) return; + //if the program is based on intersection, we want the header from the query file. if (_context->hasIntersectMethods()) { int queryIdx = (static_cast<ContextIntersect *>(_context))->getQueryFileIdx(); diff --git a/src/utils/RecordOutputMgr/RecordOutputMgr.h b/src/utils/RecordOutputMgr/RecordOutputMgr.h index b6296fdadc2d166d4c4d00b0b1f12fe306d46e70..07891cc2cfdcc3bf3d5de19349939953ae31c523 100644 --- a/src/utils/RecordOutputMgr/RecordOutputMgr.h +++ b/src/utils/RecordOutputMgr/RecordOutputMgr.h @@ -44,6 +44,7 @@ private: // BlockMgr *_bamBlockMgr; const BlockMgr *_splitInfo; + QuickString _afterVal; //to store values to be printed after record, such as column operations. //some helper functions to neaten the code. 
void tab() { _outBuf.append('\t'); } void newline() { _outBuf.append('\n'); } diff --git a/src/utils/fileType/FileRecordTypeChecker.cpp b/src/utils/fileType/FileRecordTypeChecker.cpp index 370cfffa7eb5d418618f29af1e928813e76a2049..5b3eb97e7b36cd73ff54315e49c66d2ea6d2f138 100644 --- a/src/utils/fileType/FileRecordTypeChecker.cpp +++ b/src/utils/fileType/FileRecordTypeChecker.cpp @@ -12,8 +12,7 @@ FileRecordTypeChecker::FileRecordTypeChecker() _isText = false; _isBed = false; _isDelimited = false; - _delimChar = '\0'; - _lines.clear(); + _delimChar = '\t'; //tab by default _firstValidDataLineIdx = -1; _isVCF = false; _isBAM = false; @@ -21,8 +20,10 @@ FileRecordTypeChecker::FileRecordTypeChecker() _isGzipped = false; _insufficientData = false; _fourthFieldNumeric = false; + _givenEmptyBuffer = false; _hasName[UNKNOWN_RECORD_TYPE] = false; + _hasName[EMPTY_RECORD_TYPE] = false; _hasName[BED3_RECORD_TYPE] = false; _hasName[BED6_RECORD_TYPE] = true; _hasName[BED12_RECORD_TYPE] = true; @@ -32,6 +33,7 @@ FileRecordTypeChecker::FileRecordTypeChecker() _hasName[GFF_RECORD_TYPE] = true; _hasScore[UNKNOWN_RECORD_TYPE] = false; + _hasScore[EMPTY_RECORD_TYPE] = false; _hasScore[BED3_RECORD_TYPE] = false; _hasScore[BED6_RECORD_TYPE] = true; _hasScore[BED12_RECORD_TYPE] = true; @@ -41,6 +43,7 @@ FileRecordTypeChecker::FileRecordTypeChecker() _hasScore[GFF_RECORD_TYPE] = true; _hasStrand[UNKNOWN_RECORD_TYPE] = false; + _hasStrand[EMPTY_RECORD_TYPE] = false; _hasStrand[BED3_RECORD_TYPE] = false; _hasStrand[BED6_RECORD_TYPE] = true; _hasStrand[BED12_RECORD_TYPE] = true; @@ -50,6 +53,7 @@ FileRecordTypeChecker::FileRecordTypeChecker() _hasStrand[GFF_RECORD_TYPE] = true; _recordTypeNames[UNKNOWN_RECORD_TYPE] = "Unknown record type"; + _recordTypeNames[EMPTY_RECORD_TYPE] = "Empty record type"; _recordTypeNames[BED3_RECORD_TYPE] = "Bed3 record type"; _recordTypeNames[BED6_RECORD_TYPE] = "Bed6 record type"; _recordTypeNames[BED12_RECORD_TYPE] = "Bed12 record type"; @@ -58,11 +62,8 @@ FileRecordTypeChecker::FileRecordTypeChecker() _recordTypeNames[VCF_RECORD_TYPE] = "VCF record type"; _recordTypeNames[GFF_RECORD_TYPE] = "GFF record type"; -// UNKNOWN_FILE_TYPE, SINGLE_LINE_DELIM_TEXT_FILE_TYPE, -// MULTI_LINE_ENTRY_TEXT_FILE_TYPE, -// GFF_FILE_TYPE, GZIP_FILE_TYPE, BAM_FILE_TYPE - _fileTypeNames[UNKNOWN_FILE_TYPE] = "Unknown file type"; + _fileTypeNames[EMPTY_FILE_TYPE] = "Empty file type"; _fileTypeNames[SINGLE_LINE_DELIM_TEXT_FILE_TYPE] = "Delimited text file type"; _fileTypeNames[GZIP_FILE_TYPE] = "Gzip file type"; _fileTypeNames[BAM_FILE_TYPE] = "BAM file type"; @@ -77,8 +78,9 @@ bool FileRecordTypeChecker::scanBuffer(const char *buffer, size_t len) } _numBytesInBuffer = len; if (_numBytesInBuffer == 0) { - cerr << "Error: " << _filename << " is an empty file."<< endl; - exit(1); + _fileType = EMPTY_FILE_TYPE; + _recordType = EMPTY_RECORD_TYPE; + return true; } //special: the first thing we do is look for a gzipped file. @@ -158,9 +160,14 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) _fileType = SINGLE_LINE_DELIM_TEXT_FILE_TYPE; //Tokenize the first line of valid data into fields. - const QuickString &line = _lines[_firstValidDataLineIdx]; - _currLineElems.clear(); - if (Tokenize(line, _currLineElems, _delimChar, _numFields) != _numFields) { + //Need to make a copy so next call to tokenizer doesn't overwrite the line. 
+ + QuickString line(_tokenizer.getElem(_firstValidDataLineIdx)); + + _tokenizer.setKeepFinalIncompleteElem(Tokenizer::USE_NOW); + _tokenizer.setNumExpectedItems(_numFields); + + if (_tokenizer.tokenize(line, _delimChar) != _numFields) { cerr << "Error: Type checker found wrong number of fields while tokenizing data line." << endl; exit(1); } @@ -170,7 +177,7 @@ bool FileRecordTypeChecker::handleTextFormat(const char *buffer, size_t len) if (_numFields == 3) { _recordType = BED3_RECORD_TYPE; } else if (_numFields == 4) { - if (isNumeric(_currLineElems[3])) { + if (isNumeric(_tokenizer.getElem(3))) { _recordType = BEDGRAPH_RECORD_TYPE; _fourthFieldNumeric = true; } else { @@ -220,12 +227,12 @@ bool FileRecordTypeChecker::isBedFormat() { return false; } //the 2nd and 3rd fields must be numeric. - if (!isNumeric(_currLineElems[1]) || !isNumeric(_currLineElems[2])) { + if (!isNumeric(_tokenizer.getElem(1)) || !isNumeric(_tokenizer.getElem(2))) { return false; } - int start = str2chrPos(_currLineElems[1]); - int end = str2chrPos(_currLineElems[2]); + int start = str2chrPos(_tokenizer.getElem(1)); + int end = str2chrPos(_tokenizer.getElem(2)); if (end < start) { return false; } @@ -239,11 +246,11 @@ bool FileRecordTypeChecker::isGFFformat() return false; } //the 4th and 5th fields must be numeric. - if (!isNumeric(_currLineElems[3]) || !isNumeric(_currLineElems[4])) { + if (!isNumeric(_tokenizer.getElem(3)) || !isNumeric(_tokenizer.getElem(4))) { return false; } - int start = str2chrPos(_currLineElems[3]); - int end = str2chrPos(_currLineElems[4]); + int start = str2chrPos(_tokenizer.getElem(3)); + int end = str2chrPos(_tokenizer.getElem(4)); if (end < start) { return false; } @@ -253,8 +260,8 @@ bool FileRecordTypeChecker::isGFFformat() bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) { //Break single string buffer into vector of QuickStrings. Delimiter is newline. - _lines.clear(); - int numLines = Tokenize(buffer, _lines, '\n', len); + _tokenizer.setKeepFinalIncompleteElem(Tokenizer::IGNORE); + int numLines = _tokenizer.tokenize(buffer, '\n'); //anticipated delimiter characters are tab, comma, and semi-colon. //If we need new ones, they must be added in this method. @@ -280,7 +287,7 @@ bool FileRecordTypeChecker::isTextDelimtedFormat(const char *buffer, size_t len) if (validLinesFound >=4) { break; //really only need to look at like 4 lines of data, max. 
} - QuickString &line = _lines[i]; + const QuickString &line = _tokenizer.getElem(i); int len =line.size(); //skip over any empty line if (len == 0) { diff --git a/src/utils/fileType/FileRecordTypeChecker.h b/src/utils/fileType/FileRecordTypeChecker.h index 6d598acea456c8a2e5852a97ca8c3dacd0d72b4e..15f31ca30c57ab11e83abbea02f05cd910de7bc1 100644 --- a/src/utils/fileType/FileRecordTypeChecker.h +++ b/src/utils/fileType/FileRecordTypeChecker.h @@ -18,6 +18,7 @@ using namespace std; #include <vector> #include <map> #include "PushBackStreamBuf.h" +#include "Tokenizer.h" class FileRecordTypeChecker { public: @@ -25,11 +26,11 @@ public: FileRecordTypeChecker(); ~FileRecordTypeChecker() {} - typedef enum { UNKNOWN_FILE_TYPE, SINGLE_LINE_DELIM_TEXT_FILE_TYPE, + typedef enum { UNKNOWN_FILE_TYPE, EMPTY_FILE_TYPE, SINGLE_LINE_DELIM_TEXT_FILE_TYPE, MULTI_LINE_ENTRY_TEXT_FILE_TYPE, GFF_FILE_TYPE, GZIP_FILE_TYPE, BAM_FILE_TYPE, VCF_FILE_TYPE} FILE_TYPE; - typedef enum { UNKNOWN_RECORD_TYPE, BED3_RECORD_TYPE, BED4_RECORD_TYPE, BEDGRAPH_RECORD_TYPE, BED5_RECORD_TYPE, + typedef enum { UNKNOWN_RECORD_TYPE, EMPTY_RECORD_TYPE, BED3_RECORD_TYPE, BED4_RECORD_TYPE, BEDGRAPH_RECORD_TYPE, BED5_RECORD_TYPE, BED6_RECORD_TYPE, BED12_RECORD_TYPE, BED_PLUS_RECORD_TYPE, BAM_RECORD_TYPE, VCF_RECORD_TYPE, GFF_RECORD_TYPE} RECORD_TYPE; void setFilename(const QuickString & filename) { _filename = filename; } @@ -41,7 +42,12 @@ public: bool recordTypeHasStrand(RECORD_TYPE type) const { return _hasStrand.find(type) != _hasStrand.end(); } FILE_TYPE getFileType() const { return _fileType; } + void setFileType(FILE_TYPE type) { _fileType = type; } + + RECORD_TYPE getRecordType() const { return _recordType; } + void setRecordType(RECORD_TYPE type) { _recordType = type; } + const string &getFileTypeName() const { return _fileTypeNames.find(_fileType)->second; } @@ -82,8 +88,8 @@ private: RECORD_TYPE _recordType; QuickString _filename; //useful for reporting errors with file. - vector<QuickString> _lines; - vector<QuickString> _currLineElems; + Tokenizer _tokenizer; + int _firstValidDataLineIdx; int _numBytesInBuffer; //this will hold the length of the buffer after the scan. @@ -99,6 +105,7 @@ private: bool _isGzipped; bool _insufficientData; //set to true if scan buffer had only header lines. bool _fourthFieldNumeric; //this is just to distinguish between Bed4 and BedGraph files. + bool _givenEmptyBuffer; map<RECORD_TYPE, string> _recordTypeNames; map<FILE_TYPE, string> _fileTypeNames; diff --git a/src/utils/general/DualQueue.h b/src/utils/general/DualQueue.h deleted file mode 100644 index 1cee41581709e742702aac75fb5c0d69430205c9..0000000000000000000000000000000000000000 --- a/src/utils/general/DualQueue.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * DualQueue.h - * - * Created on: Jan 29, 2013 - * Author: nek3d - */ -#ifdef false -#ifndef DUALQUEUE_H_ -#define DUALQUEUE_H_ - -using namespace std; - -#include <vector> -#include <queue> -#include <cstdio> -#include <cstdlib> - -template <class T> class DualQueueAscending { -public: - bool operator() ( const T &item1, const T &item2) const { - - printf("\n\nIn comparison method:\n item1=\n"); -// item1->print(); - printf("\nitem2=\n"); -// item2->print(); - printf("\n"); - - if( *(item1) < *(item2) ) { - printf("Item1 less than item2. Returning false.\n"); - return false; - } - printf("Item1 not less than item2. 
Returning true.\n"); - return true; - } -}; - -template <class T> class DualQueueDescending { -public: - bool operator() ( const T &item1, const T &item2) const { - if( *(item2) < *(item1) ) { - return false; - } - return true; - } -}; - - -template <class T, template<class T> class CompareFunc> class DualQueue { -public: - DualQueue() {} - ~DualQueue() {} - - const T & top() const { - if (empty()) { - fprintf(stderr, "ERROR. Tried to top from empty dualQueue.\n"); - exit(1); - } - if (emptyForward()) { - return topReverse(); - } - if (emptyReverse()) { - return topForward(); - } - - return (topFowardHigherPriorityThanTopReverse() ? topForward() : topReverse()); - } - void pop() { - if (empty()) { - fprintf(stderr, "ERROR. Tried to pop from empty dualQueue.\n"); - exit(1); - } - if (emptyForward()) { - popReverse(); - return; - } - if (emptyReverse()) { - popForward(); - return; - } - topFowardHigherPriorityThanTopReverse() ? popForward() : popReverse(); - } - void push(const T &item) { item->getStrand() ? pushForward(item) : pushReverse(item); } - size_t size() const { return sizeForward() + sizeReverse(); } - bool empty() const { return _forwardQueue.empty() && _reverseQueue.empty(); } - - const T & topForward() const { return _forwardQueue.top(); } - void popForward() { _forwardQueue.pop(); } - void pushForward(const T &item) { _forwardQueue.push(item); } - size_t sizeForward() const { return _forwardQueue.size(); } - bool emptyForward() const { return _forwardQueue.empty(); } - - - const T & topReverse() const { return _reverseQueue.top(); } - void popReverse() { _reverseQueue.pop(); } - void pushReverse(const T &item) { _reverseQueue.push(item); } - size_t sizeReverse() const { return _reverseQueue.size(); } - bool emptyReverse() const { return _reverseQueue.empty(); } - - -private: - typedef priority_queue<T, vector<T>, CompareFunc<T> > queueType; - queueType _forwardQueue; - queueType _reverseQueue; - - bool topFowardHigherPriorityThanTopReverse() const { - printf("\n\nIn priority method:\n TopForward=\n"); -// topForward()->print(); - printf("\nTopReverse=\n"); -// topReverse()->print(); - printf("\n"); - if (CompareFunc<T>()(topForward(), topReverse())) { - printf("Forward higher priority than reverse.\n"); - return true; - } else { - printf("Reverse higher priority than forward.\n"); - return false; - } - } - -}; - - -#endif /* DUALQUEUE_H_ */ -#endif diff --git a/src/utils/general/Makefile b/src/utils/general/Makefile index 0361fab41acde1e99e8268075d35b533dc3bfc8a..20d7aeead018248fc8d954d4eb982c66c4009701 100644 --- a/src/utils/general/Makefile +++ b/src/utils/general/Makefile @@ -9,8 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/lineFileUtilities/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp -OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o +SOURCES= QuickString.h QuickString.cpp ParseTools.h ParseTools.cpp PushBackStreamBuf.cpp PushBackStreamBuf.h CompressionTools.h CompressionTools.cpp Tokenizer.h Tokenizer.h +OBJECTS= QuickString.o ParseTools.o PushBackStreamBuf.o CompressionTools.o Tokenizer.o BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) all: $(BUILT_OBJECTS) @@ -23,6 +23,6 @@ $(BUILT_OBJECTS): $(SOURCES) clean: @echo "Cleaning up." 
- @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o + @rm -f $(OBJ_DIR)/QuickString.o $(OBJ_DIR)/ParseTools.o $(OBJ_DIR)/PushBackStreamBuf.o $(OBJ_DIR)/Tokenizer.o .PHONY: clean diff --git a/src/utils/general/ParseTools.cpp b/src/utils/general/ParseTools.cpp index ad5764b416e8a76566c3b7210a0f97fb66b55325..04782a717630a0c0457df8cfa3b80427b0170002 100644 --- a/src/utils/general/ParseTools.cpp +++ b/src/utils/general/ParseTools.cpp @@ -19,6 +19,9 @@ int str2chrPos(const QuickString &str) { } int str2chrPos(const char *str, size_t ulen) { + if (ulen == 0) { + ulen = strlen(str); + } int len=(int)ulen; if (len < 1 || len > 10) { return INT_MIN; //can't do more than 9 digits and a minus sign @@ -90,35 +93,7 @@ string vectorIntToStr(const vector<int> &vec) { return str; } -// TBD: Could be better optimized. I'm allocating 8KB for every call, then destroying it. -// That memory needs to stay in scope. -// Also, is this handling subsequent delimiters? -int Tokenize(const QuickString &str, vector<QuickString> &elems, char delimiter, int numExpectedItems) { - - elems.reserve(numExpectedItems); - int strLen = (int)str.size(); - - int startPos = 0; - int currPos = 0; - - char elemBuf[8192]; - - while (startPos < strLen) { - memset(elemBuf, 0, 8192); - while (str[currPos] != delimiter && currPos < strLen) { - currPos++; - } - if (currPos > startPos) { - memcpy(elemBuf, str.c_str() + startPos, min(currPos, strLen) - startPos); - elems.push_back(elemBuf); - } - startPos = currPos +1; - currPos = startPos; - } - return (int)elems.size(); -} - -bool isHeaderLine(QuickString &line) { +bool isHeaderLine(const QuickString &line) { if (line[0] == '>') { return true; } @@ -140,4 +115,3 @@ bool isHeaderLine(QuickString &line) { } return false; } - diff --git a/src/utils/general/ParseTools.h b/src/utils/general/ParseTools.h index 2132d0e4c6be8994bbc2a579fbf951f3b4bc323d..871f53bfea150ad615c2416c53701d4e94600fdc 100644 --- a/src/utils/general/ParseTools.h +++ b/src/utils/general/ParseTools.h @@ -22,7 +22,7 @@ bool isNumeric(const QuickString &str); //Empty strings, too long strings, or strings containing anything other than //digits (with the excpetion of a minus sign in the first position) //will result in error. Errors return INT_MIN. -int str2chrPos(const char *str, size_t len); +int str2chrPos(const char *str, size_t len = 0); int str2chrPos(const QuickString &str); @@ -86,7 +86,7 @@ void int2str(int number, T& buffer, bool appendToBuf = false) } -bool isHeaderLine(QuickString &line); +bool isHeaderLine(const QuickString &line); string vectorIntToStr(const vector<int> &vec); diff --git a/src/utils/general/Tokenizer.cpp b/src/utils/general/Tokenizer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b88e013409421074790395e2fc0dbf88b92d9c10 --- /dev/null +++ b/src/utils/general/Tokenizer.cpp @@ -0,0 +1,93 @@ +/* + * Tokenizer.cpp + * + * Created on: Apr 15, 2014 + * Author: nek3d + */ +#include "Tokenizer.h" +#include <cstring> + +Tokenizer::Tokenizer() +: _numExpectedElems(0), + _keepFinalIncElem(USE_NOW), + _numValidElems(0) { + + _elems.resize(INITIAL_NUM_ELEMS, NULL); + for (int i = 0; i < INITIAL_NUM_ELEMS; i++) { + _elems[i] = new QuickString(); + } +} + +Tokenizer::~Tokenizer() { + resize(0); //easy way to delete elems without repeating code. 
+} + +void Tokenizer::setNumExpectedItems(int newSize) { + _numExpectedElems = newSize; + resize(newSize); +} + +int Tokenizer::tokenize(const QuickString &str, char delimiter) { + + int strLen = (int)str.size(); + + int startPos = 0; + int currPos = 0; + + int currIdx = 0; + + while (startPos < strLen) { + while (str[currPos] != delimiter && currPos < strLen) { + currPos++; + } + if (currPos > startPos) { + if (currPos == strLen && _keepFinalIncElem != USE_NOW) { + //we found an incomplete final element. + // if we're ignoring incomplete elems, do nothing with it. + currIdx--; //make sure it's not included in the final count of valid elems. + + } else { + QuickString *newStr = fetchElem(currIdx); + newStr->assign(str.c_str() + startPos, min(currPos, strLen) - startPos); + } + } + startPos = currPos +1; + currPos = startPos; + currIdx++; + } + _numValidElems = currIdx; + return currIdx; +} + +void Tokenizer::setKeepFinalIncompleteElem(lastElemCode code) { + _keepFinalIncElem = code; +} + +QuickString *Tokenizer::fetchElem(int idx) +{ + if (idx >= (int)_elems.size()) { + resize(idx +1); + } + return _elems[idx]; +} + + +void Tokenizer::resize(int newSize) { + int oldSize = (int)_elems.size(); + + if (newSize > oldSize) { //need to add items. + _elems.resize(newSize); + for (int i=oldSize; i < newSize; i++) { + _elems[i] = new QuickString(); + } + } else if (oldSize > newSize) { + //need to remove items. + for (int i = oldSize - 1; i >= newSize; i--) { + delete _elems[i]; + _elems[i] = NULL; + } + _elems.resize(newSize); + } + //if oldSize is the same as newSize, do nothing. +} + diff --git a/src/utils/general/Tokenizer.h b/src/utils/general/Tokenizer.h new file mode 100644 index 0000000000000000000000000000000000000000..df49f11408b5d7d14d08cdaff7a6b9c2237f2d05 --- /dev/null +++ b/src/utils/general/Tokenizer.h @@ -0,0 +1,57 @@ +/* + * Tokenizer.h + * + * Created on: Apr 15, 2014 + * Author: nek3d + */ + +#ifndef TOKENIZER_H_ +#define TOKENIZER_H_ + +using namespace std; + +#include "QuickString.h" +#include <vector> + +class Tokenizer { +public: + Tokenizer(); + ~Tokenizer(); + + // If you know the expected number of items, set this. + // If not, don't worry about it. + void setNumExpectedItems(int val); + + int tokenize(const QuickString &str, char delimiter = '\t'); + + // If the final element ends before a delim char, that means + // the buffer passed in ends mid-element. The last, incomplete + // element found can either be: + // 1) Used now. We want it whether it's complete or not. + // 3) Ignored altogether. + typedef enum { USE_NOW, IGNORE } lastElemCode; + void setKeepFinalIncompleteElem(lastElemCode code); + + //final number of valid elems may be less than total number of elems, + //because elems are not necessarily deleted between subsequent calls + //to tokenizer. 
+ int getNumValidElems() const { return _numValidElems; } + int getNumTotalElems() const { return (int)_elems.size(); } + const QuickString &getElem(int i) const { return (*(_elems[i])); } + + + +private: + static const int DEFAULT_PARSE_BUFFER_SIZE = 4096; // 8Kb + static const int INITIAL_NUM_ELEMS = 10; + vector<QuickString *> _elems; + int _numExpectedElems; + lastElemCode _keepFinalIncElem; + int _numValidElems; + + QuickString *fetchElem(int idx); + void resize(int newSize); +}; + + +#endif /* TOKENIZER_H_ */ diff --git a/test/intersect/headerOnly.vcf b/test/intersect/headerOnly.vcf new file mode 100644 index 0000000000000000000000000000000000000000..9863ae581046b88e4a9f1b8d957480b03dafe165 --- /dev/null +++ b/test/intersect/headerOnly.vcf @@ -0,0 +1,121 @@ +##fileformat=VCFv4.1 +##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> +##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> +##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> +##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?"> +##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions"> +##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> +##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction"> +##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> +##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> +##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> +##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads"> +##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> +##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> +##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias"> +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[bam/all.conc.on.pos.dedup.realigned.bam] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=10 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false genotype_likelihoods_model=BOTH p_nonref_model=EXACT heterozygosity=0.0010 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.05 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false" +##contig=<ID=chr1,length=249250621,assembly=hg19> +##contig=<ID=chr10,length=135534747,assembly=hg19> +##contig=<ID=chr11,length=135006516,assembly=hg19> +##contig=<ID=chr11_gl000202_random,length=40103,assembly=hg19> +##contig=<ID=chr12,length=133851895,assembly=hg19> +##contig=<ID=chr13,length=115169878,assembly=hg19> +##contig=<ID=chr14,length=107349540,assembly=hg19> +##contig=<ID=chr15,length=102531392,assembly=hg19> +##contig=<ID=chr16,length=90354753,assembly=hg19> +##contig=<ID=chr17,length=81195210,assembly=hg19> +##contig=<ID=chr17_ctg5_hap1,length=1680828,assembly=hg19> +##contig=<ID=chr17_gl000203_random,length=37498,assembly=hg19> +##contig=<ID=chr17_gl000204_random,length=81310,assembly=hg19> +##contig=<ID=chr17_gl000205_random,length=174588,assembly=hg19> +##contig=<ID=chr17_gl000206_random,length=41001,assembly=hg19> +##contig=<ID=chr18,length=78077248,assembly=hg19> +##contig=<ID=chr18_gl000207_random,length=4262,assembly=hg19> +##contig=<ID=chr19,length=59128983,assembly=hg19> +##contig=<ID=chr19_gl000208_random,length=92689,assembly=hg19> +##contig=<ID=chr19_gl000209_random,length=159169,assembly=hg19> +##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19> +##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19> +##contig=<ID=chr2,length=243199373,assembly=hg19> +##contig=<ID=chr20,length=63025520,assembly=hg19> +##contig=<ID=chr21,length=48129895,assembly=hg19> +##contig=<ID=chr21_gl000210_random,length=27682,assembly=hg19> +##contig=<ID=chr22,length=51304566,assembly=hg19> +##contig=<ID=chr3,length=198022430,assembly=hg19> +##contig=<ID=chr4,length=191154276,assembly=hg19> +##contig=<ID=chr4_ctg9_hap1,length=590426,assembly=hg19> 
+##contig=<ID=chr4_gl000193_random,length=189789,assembly=hg19> +##contig=<ID=chr4_gl000194_random,length=191469,assembly=hg19> +##contig=<ID=chr5,length=180915260,assembly=hg19> +##contig=<ID=chr6,length=171115067,assembly=hg19> +##contig=<ID=chr6_apd_hap1,length=4622290,assembly=hg19> +##contig=<ID=chr6_cox_hap2,length=4795371,assembly=hg19> +##contig=<ID=chr6_dbb_hap3,length=4610396,assembly=hg19> +##contig=<ID=chr6_mann_hap4,length=4683263,assembly=hg19> +##contig=<ID=chr6_mcf_hap5,length=4833398,assembly=hg19> +##contig=<ID=chr6_qbl_hap6,length=4611984,assembly=hg19> +##contig=<ID=chr6_ssto_hap7,length=4928567,assembly=hg19> +##contig=<ID=chr7,length=159138663,assembly=hg19> +##contig=<ID=chr7_gl000195_random,length=182896,assembly=hg19> +##contig=<ID=chr8,length=146364022,assembly=hg19> +##contig=<ID=chr8_gl000196_random,length=38914,assembly=hg19> +##contig=<ID=chr8_gl000197_random,length=37175,assembly=hg19> +##contig=<ID=chr9,length=141213431,assembly=hg19> +##contig=<ID=chr9_gl000198_random,length=90085,assembly=hg19> +##contig=<ID=chr9_gl000199_random,length=169874,assembly=hg19> +##contig=<ID=chr9_gl000200_random,length=187035,assembly=hg19> +##contig=<ID=chr9_gl000201_random,length=36148,assembly=hg19> +##contig=<ID=chrM,length=16571,assembly=hg19> +##contig=<ID=chrUn_gl000211,length=166566,assembly=hg19> +##contig=<ID=chrUn_gl000212,length=186858,assembly=hg19> +##contig=<ID=chrUn_gl000213,length=164239,assembly=hg19> +##contig=<ID=chrUn_gl000214,length=137718,assembly=hg19> +##contig=<ID=chrUn_gl000215,length=172545,assembly=hg19> +##contig=<ID=chrUn_gl000216,length=172294,assembly=hg19> +##contig=<ID=chrUn_gl000217,length=172149,assembly=hg19> +##contig=<ID=chrUn_gl000218,length=161147,assembly=hg19> +##contig=<ID=chrUn_gl000219,length=179198,assembly=hg19> +##contig=<ID=chrUn_gl000220,length=161802,assembly=hg19> +##contig=<ID=chrUn_gl000221,length=155397,assembly=hg19> +##contig=<ID=chrUn_gl000222,length=186861,assembly=hg19> +##contig=<ID=chrUn_gl000223,length=180455,assembly=hg19> +##contig=<ID=chrUn_gl000224,length=179693,assembly=hg19> +##contig=<ID=chrUn_gl000225,length=211173,assembly=hg19> +##contig=<ID=chrUn_gl000226,length=15008,assembly=hg19> +##contig=<ID=chrUn_gl000227,length=128374,assembly=hg19> +##contig=<ID=chrUn_gl000228,length=129120,assembly=hg19> +##contig=<ID=chrUn_gl000229,length=19913,assembly=hg19> +##contig=<ID=chrUn_gl000230,length=43691,assembly=hg19> +##contig=<ID=chrUn_gl000231,length=27386,assembly=hg19> +##contig=<ID=chrUn_gl000232,length=40652,assembly=hg19> +##contig=<ID=chrUn_gl000233,length=45941,assembly=hg19> +##contig=<ID=chrUn_gl000234,length=40531,assembly=hg19> +##contig=<ID=chrUn_gl000235,length=34474,assembly=hg19> +##contig=<ID=chrUn_gl000236,length=41934,assembly=hg19> +##contig=<ID=chrUn_gl000237,length=45867,assembly=hg19> +##contig=<ID=chrUn_gl000238,length=39939,assembly=hg19> +##contig=<ID=chrUn_gl000239,length=33824,assembly=hg19> +##contig=<ID=chrUn_gl000240,length=41933,assembly=hg19> +##contig=<ID=chrUn_gl000241,length=42152,assembly=hg19> +##contig=<ID=chrUn_gl000242,length=43523,assembly=hg19> +##contig=<ID=chrUn_gl000243,length=43341,assembly=hg19> +##contig=<ID=chrUn_gl000244,length=39929,assembly=hg19> +##contig=<ID=chrUn_gl000245,length=36651,assembly=hg19> +##contig=<ID=chrUn_gl000246,length=38154,assembly=hg19> +##contig=<ID=chrUn_gl000247,length=36422,assembly=hg19> +##contig=<ID=chrUn_gl000248,length=39786,assembly=hg19> +##contig=<ID=chrUn_gl000249,length=38502,assembly=hg19> 
+##contig=<ID=chrX,length=155270560,assembly=hg19> +##contig=<ID=chrY,length=59373566,assembly=hg19> +##reference=file:///home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa +##SnpEffVersion="SnpEff 3.0g (build 2012-08-31), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf GRCh37.66 /home/udp3f/cphg-home/projects/rs-exome/varCalling/2012-Feb-01/all.raw.nobaq.vcf " +##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1094PC0005 1094PC0009 1094PC0012 1094PC0013 diff --git a/test/intersect/new_test-intersect.sh b/test/intersect/new_test-intersect.sh index 4bdcdf05a1b208d1339dd21c416ef924d59d089e..3b7722239ba0217ad98e55f35466845b25430d01 100755 --- a/test/intersect/new_test-intersect.sh +++ b/test/intersect/new_test-intersect.sh @@ -609,12 +609,12 @@ check obs exp rm obs exp ########################################################### -# Test that we throw an error for empty files +# Test that we allow empty query files ############################################################ echo " intersect.new.t51...\c" -echo "is an empty file." > exp +touch exp touch dummy.txt -$BT intersect -a dummy.txt -b b.bed 2>&1 > /dev/null | cut -f3-6 -d ' ' > obs +$BT intersect -a dummy.txt -b b.bed 2>&1 > /dev/null | cat - > obs check obs exp rm obs exp dummy.txt @@ -664,5 +664,86 @@ $BT intersect -a gdc.gff -b gdc_one.bed > obs check obs exp rm obs exp +########################################################### +# Test that we allow empty database files, unsorted +############################################################ +echo " intersect.new.t56...\c" +touch exp +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt 2>&1 > /dev/null | cat - > obs +check obs exp +rm obs exp dummy.txt + +########################################################### +# Test that we allow empty database files, sorted +############################################################ +echo " intersect.new.t57...\c" +touch exp +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt -sorted 2>&1 > /dev/null | cat - > obs +check obs exp +rm obs exp dummy.txt + + +########################################################### +# Test that we allow empty database files, unsorted, with +# -v (noHit) option +############################################################ +echo " intersect.new.t58...\c" +touch dummy.txt +$BT intersect -a a.bed -b dummy.txt -v > obs +check obs a.bed +rm obs dummy.txt + + +########################################################### +# Test that an empty query with header run with -header +# option will print header +############################################################ +echo " intersect.new.t59...\c" +echo "#Random Header" >dummy.txt +$BT intersect -a dummy.txt -b a.bed -header > obs +check obs dummy.txt +rm obs dummy.txt + + +########################################################### +# Test that an empty query with header, gzipped, that +# runs with -header option will print header +############################################################ +echo " intersect.new.t60...\c" +echo "#Random Header" >dummy.txt +gzip dummy.txt +echo "#Random Header" >exp +$BT intersect -a dummy.txt.gz -b a.bed -header > obs +check obs exp +rm obs dummy.txt.gz exp + +########################################################### +# Test that an empty 
query with header, bgzipped, that +# runs with -header option will print header +############################################################ +echo " intersect.new.t61...\c" +echo "#Random Header" >dummy.txt +bgzip dummy.txt +echo "#Random Header" >exp +$BT intersect -a dummy.txt.gz -b a.bed -header > obs +check obs exp +rm obs dummy.txt.gz exp + + +########################################################### +# Test that an empty VCF query with header that +# runs with -header option will print header +############################################################ +echo " intersect.new.t62...\c" +$BT intersect -a headerOnly.vcf -b a.bed -header > obs +check obs headerOnly.vcf +rm obs + + + + + diff --git a/test/map/test-map.sh b/test/map/test-map.sh index 70d550a92fcc929fccc04e09fbeea7fe86ce93cb..0a188bec9d5acf6aa6b36e7b6d7fc648d6f24088 100644 --- a/test/map/test-map.sh +++ b/test/map/test-map.sh @@ -691,10 +691,13 @@ rm obs exp ############################################################ echo " map.t46...\c" echo \ -" -***** -***** ERROR: There are 1 columns given, but there are 2 operations." > exp -$BT map -a ivls.bed -b values.bed -o count,sum 2>&1 > /dev/null | head -3 > obs +"chr1 0 100 3 30 +chr1 100 200 1 1 +chr2 0 100 0 . +chr2 100 200 0 . +chr3 0 100 3 6 +chr3 100 200 1 4" > exp +$BT map -a ivls.bed -b values.bed -o count,sum > obs check obs exp rm obs exp diff --git a/test/merge/a.full.bam b/test/merge/a.full.bam new file mode 100644 index 0000000000000000000000000000000000000000..dbfceed08d147960ab319cbb42bf5b9847049bee Binary files /dev/null and b/test/merge/a.full.bam differ diff --git a/test/merge/a.gff b/test/merge/a.gff new file mode 100644 index 0000000000000000000000000000000000000000..4360a3c0e82941181f18865fd21c31ffd6605c46 --- /dev/null +++ b/test/merge/a.gff @@ -0,0 +1,6 @@ +browser position chr22:10000000-10025000 +browser hide all +track name=regulatory description="TeleGene(tm) Regulatory Regions" +chr22 TeleGene enhancer 10000000 10001000 500 + . touch1 +chr22 TeleGene enhancer 10010000 10010100 500 + . touch1 +chr22 TeleGene enhancer 10020000 10025000 500 + . touch1 diff --git a/test/merge/mixedStrands.bed b/test/merge/mixedStrands.bed new file mode 100644 index 0000000000000000000000000000000000000000..b636fcfd87cb667e3affb42f87d932f432e69e30 --- /dev/null +++ b/test/merge/mixedStrands.bed @@ -0,0 +1,12 @@ +chr1 10 50 a1f 2 + +chr1 20 60 b1r 4 - +chr1 25 70 c1q 8 . +chr1 30 75 d1q 16 . +chr1 40 80 e1f 32 + +chr1 45 90 f1r 64 - +chr2 10 50 a2q 2 . +chr2 20 40 b2f 4 + +chr2 25 50 c2r 8 - +chr2 30 60 d2f 16 + +chr2 35 65 e2q 32 . +chr2 39 80 f2r 64 - \ No newline at end of file diff --git a/test/merge/test-merge.sh b/test/merge/test-merge.sh index ee7f629064989b67989943c7b31cf3c40435779a..b7f5e12f1ae83da37e96ec92b7c36c0017e2269b 100644 --- a/test/merge/test-merge.sh +++ b/test/merge/test-merge.sh @@ -18,7 +18,6 @@ check() # chr1 45 100 ########################################################### -# Test #1 # Test a basic merge; one interval should be un-merged, # the other two should be merged. ########################################################### @@ -30,70 +29,50 @@ $BT merge -i a.bed > obs check obs exp rm obs exp - ########################################################### -# Test #2 -# Enforce coordinate sorted input. 
+# Test that -n option is shown as deprecated ########################################################### echo " merge.t2...\c" -command -v tac 2>/dev/null || alias tac="sed '1!G;h;\$!d'" -tac a.bed | $BT merge -i - 2> obs -echo "ERROR: input file: (-) is not sorted by chrom then start. - The start coordinate at line 3 is less than the start at line 2" > exp +echo "***** ERROR: -n option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp +$BT merge -i a.bed -n 2>&1 > /dev/null | head -2 | tail -1 > obs check obs exp rm obs exp ########################################################### -# Test #3 # Test the counting of merged intervals. (-n) ########################################################### echo " merge.t3...\c" echo \ "chr1 10 20 1 chr1 30 100 3" > exp -$BT merge -i a.bed -n > obs +$BT merge -i a.bed -c 1 -o count > obs check obs exp rm obs exp ########################################################### -# Test #4 -# Test the listing of names from merged intervals. (-nms) -# a.bed should fail, as there is no name field +# Test that -nms option is deprecated ########################################################### echo " merge.t4...\c" -echo \ -"chr1 10 20 -***** -*****ERROR: No names found to report for the -names option. Exiting. -*****" > exp -$BT merge -i a.bed -nms > obs 2>&1 +echo "***** ERROR: -nms option is deprecated. Please see the documentation for the -c and -o column operation options. *****" > exp +$BT merge -i a.bed -nms 2>&1 > /dev/null | head -2 | tail -1 > obs check obs exp rm obs exp - ########################################################### -# Test #5 -# Test the listing of names from merged intervals. (-nms) -# a.named.bed should work, as there are name fields -# -# cat a.names.bed -# chr1 10 20 a1 -# chr1 30 40 a2 -# chr1 40 50 a3 -# chr1 45 100 a4 +# Test the listing of names from merged intervals.
########################################################### echo " merge.t5...\c" echo \ "chr1 10 20 a1 chr1 30 100 a2,a3,a4" > exp -$BT merge -i a.names.bed -nms > obs +$BT merge -i a.names.bed -c 4 -o collapse > obs check obs exp rm obs exp ########################################################### -# -nms and -scores sum +# collapsed list of the names, and sum of the scores ########################################################### echo " merge.t6...\c" echo \ @@ -102,12 +81,12 @@ chr1 30 100 a2,a3,a4 9 chr2 10 20 a1 5 chr2 30 40 a2 6 chr2 42 100 a3,a4 15" > exp -$BT merge -i a.full.bed -nms -scores sum> obs +$BT merge -i a.full.bed -c 4,5 -o collapse,sum > obs check obs exp rm obs exp ########################################################### -# -n and -scores sum +# count intervals and sum of scores ########################################################### echo " merge.t7...\c" echo \ @@ -116,12 +95,12 @@ chr1 30 100 3 9 chr2 10 20 1 5 chr2 30 40 1 6 chr2 42 100 2 15" > exp -$BT merge -i a.full.bed -n -scores sum> obs +$BT merge -i a.full.bed -c 5 -o count,sum> obs check obs exp rm obs exp ########################################################### -# -n, -nms, and -scores sum +# count, collapsed names, and sum of scores ########################################################### echo " merge.t8...\c" echo \ @@ -130,41 +109,134 @@ chr1 30 100 a2,a3,a4 9 3 chr2 10 20 a1 5 1 chr2 30 40 a2 6 1 chr2 42 100 a3,a4 15 2" > exp -$BT merge -i a.full.bed -nms -n -scores sum> obs +$BT merge -i a.full.bed -c 4,5,4 -o collapse,sum,count > obs check obs exp rm obs exp ########################################################### -# -s, -n, -nms, and -scores sum +# stranded merge, show sign, collapsed names, sum of +# scores, and count ########################################################### echo " merge.t9...\c" echo \ -"chr1 10 20 a1 1 + 1 -chr1 30 40 a2 2 + 1 -chr1 45 100 a4 4 + 1 -chr1 40 50 a3 3 - 1 -chr2 10 20 a1 5 + 1 -chr2 30 40 a2 6 + 1 -chr2 42 50 a3 7 + 1 -chr2 45 100 a4 8 - 1" > exp -$BT merge -i a.full.bed -s -nms -n -scores sum> obs +"chr1 10 20 + a1 1 1 +chr1 30 40 + a2 2 1 +chr1 40 50 - a3 3 1 +chr1 45 100 + a4 4 1 +chr2 10 20 + a1 5 1 +chr2 30 40 + a2 6 1 +chr2 42 50 + a3 7 1 +chr2 45 100 - a4 8 1" > exp +$BT merge -i a.full.bed -s -c 6,4,5,6 -o distinct,collapse,sum,count > obs check obs exp rm obs exp ########################################################### -# Test #10 # Test the use of a custom delimiter for -nms -# -# cat a.names.bed -# chr1 10 20 a1 -# chr1 30 40 a2 -# chr1 40 50 a3 -# chr1 45 100 a4 ########################################################### echo " merge.t10...\c" echo \ "chr1 10 20 a1 chr1 30 100 a2|a3|a4" > exp -$BT merge -i a.names.bed -nms -delim "|" > obs +$BT merge -i a.names.bed -delim "|" -c 4 -o collapse > obs check obs exp rm obs exp + +########################################################### +# Test that stranded merge not allowed with VCF +########################################################### +echo " merge.t11...\c" +echo "***** ERROR: stranded merge not supported for VCF files. *****" >exp +$BT merge -i testA.vcf -s 2>&1 > /dev/null | head -2 | tail -1 > obs +check exp obs +rm obs exp + +########################################################### +# Test that column ops not allowed with BAM +########################################################### +echo " merge.t12...\c" +echo "***** ERROR: BAM database file not currently supported for column operations." 
> exp +$BT merge -i a.full.bam -c 1 -o count 2>&1 > /dev/null | head -3 | tail -1 > obs +check exp obs +rm obs exp + + +########################################################### +# Test that VCF input gives BED3 output +########################################################### +echo " merge.t13...\c" +echo \ +"chr1 30859 30860 +chr1 69269 69270 +chr1 69510 69511 +chr1 874815 874816 +chr1 879675 879676 +chr1 935491 935492 +chr1 1334051 1334057 +chr1 31896607 31896608" > exp +$BT merge -i testA.vcf > obs +check exp obs +rm obs exp + +########################################################### +# Test that GFF input gives BED3 output +########################################################### +echo " merge.t14...\c" +echo \ +"chr22 9999999 10001000 +chr22 10009999 10010100 +chr22 10019999 10025000" > exp +$BT merge -i a.gff > obs +check exp obs +rm obs exp + +########################################################### +# Test that stranded merge with unknown records works +# correctly +########################################################### +echo " merge.t15...\c" +echo \ +"chr1 10 80 + +chr1 20 90 - +chr2 20 60 + +chr2 25 80 -" > exp +$BT merge -i mixedStrands.bed -s -c 6 -o distinct > obs +check exp obs +rm obs exp + +########################################################### +# Test that stranded merge with unknown records works +# correctly, forward strand only +########################################################### +echo " merge.t16...\c" +echo \ +"chr1 10 80 + +chr2 20 60 +" > exp +$BT merge -i mixedStrands.bed -S + -c 6 -o distinct > obs +check exp obs +rm obs exp + +########################################################### +# Test that stranded merge with unknown records works +# correctly, reverse strand only +########################################################### +echo " merge.t17...\c" +echo \ +"chr1 20 90 - +chr2 25 80 -" > exp +$BT merge -i mixedStrands.bed -S - -c 6 -o distinct > obs +check exp obs +rm obs exp + +########################################################### +# Test that merge with specified strand does not allow +# other characters besides + or -. +########################################################### +echo " merge.t18...\c" +echo "***** ERROR: -S option must be followed by + or -. *****" > exp +$BT merge -i mixedStrands.bed -S .
-c 6 -o distinct 2>&1 > /dev/null | head -2 | tail -1 >obs +check exp obs +rm obs exp + + + diff --git a/test/merge/testA.vcf b/test/merge/testA.vcf new file mode 100644 index 0000000000000000000000000000000000000000..ea53a203d4e137c281ede01a2ba75602b94be733 --- /dev/null +++ b/test/merge/testA.vcf @@ -0,0 +1,129 @@ +##fileformat=VCFv4.1 +##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed"> +##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)"> +##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Genotype Quality"> +##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> +##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification"> +##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed"> +##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed"> +##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes"> +##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities"> +##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered"> +##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?"> +##INFO=<ID=Dels,Number=1,Type=Float,Description="Fraction of Reads Containing Spanning Deletions"> +##INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> +##INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction"> +##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes"> +##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation"> +##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality"> +##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads"> +##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities"> +##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> +##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias"> +##UnifiedGenotyper="analysis_type=UnifiedGenotyper input_file=[bam/all.conc.on.pos.dedup.realigned.bam] read_buffer_size=null phone_home=STANDARD read_filter=[] intervals=null excludeIntervals=null interval_set_rule=UNION interval_merging=ALL reference_sequence=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa rodBind=[] nonDeterministicRandomSeed=false downsampling_type=BY_SAMPLE downsample_to_fraction=null downsample_to_coverage=250 baq=OFF baqGapOpenPenalty=40.0 performanceLog=null useOriginalQualities=false defaultBaseQualities=-1 validation_strictness=SILENT unsafe=null num_threads=10 num_cpu_threads=null num_io_threads=null num_bam_file_handles=null read_group_black_list=null pedigree=[] pedigreeString=[] pedigreeValidationType=STRICT allow_intervals_with_unindexed_bam=false logging_level=INFO log_to_file=null help=false genotype_likelihoods_model=BOTH p_nonref_model=EXACT heterozygosity=0.0010 pcr_error_rate=1.0E-4 genotyping_mode=DISCOVERY output_mode=EMIT_VARIANTS_ONLY standard_min_confidence_threshold_for_calling=30.0 standard_min_confidence_threshold_for_emitting=30.0 computeSLOD=false alleles=(RodBinding name= source=UNBOUND) min_base_quality_score=17 max_deletion_fraction=0.05 multiallelic=false max_alternate_alleles=5 min_indel_count_for_genotyping=5 indel_heterozygosity=1.25E-4 indelGapContinuationPenalty=10.0 indelGapOpenPenalty=45.0 indelHaplotypeSize=80 bandedIndel=false indelDebug=false ignoreSNPAlleles=false dbsnp=(RodBinding name= source=UNBOUND) out=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub NO_HEADER=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub sites_only=org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub debug_file=null metrics_file=null annotation=[] excludeAnnotation=[] filter_mismatching_base_and_quals=false" +##contig=<ID=chr1,length=249250621,assembly=hg19> +##contig=<ID=chr10,length=135534747,assembly=hg19> +##contig=<ID=chr11,length=135006516,assembly=hg19> +##contig=<ID=chr11_gl000202_random,length=40103,assembly=hg19> +##contig=<ID=chr12,length=133851895,assembly=hg19> +##contig=<ID=chr13,length=115169878,assembly=hg19> +##contig=<ID=chr14,length=107349540,assembly=hg19> +##contig=<ID=chr15,length=102531392,assembly=hg19> +##contig=<ID=chr16,length=90354753,assembly=hg19> +##contig=<ID=chr17,length=81195210,assembly=hg19> +##contig=<ID=chr17_ctg5_hap1,length=1680828,assembly=hg19> +##contig=<ID=chr17_gl000203_random,length=37498,assembly=hg19> +##contig=<ID=chr17_gl000204_random,length=81310,assembly=hg19> +##contig=<ID=chr17_gl000205_random,length=174588,assembly=hg19> +##contig=<ID=chr17_gl000206_random,length=41001,assembly=hg19> +##contig=<ID=chr18,length=78077248,assembly=hg19> +##contig=<ID=chr18_gl000207_random,length=4262,assembly=hg19> +##contig=<ID=chr19,length=59128983,assembly=hg19> +##contig=<ID=chr19_gl000208_random,length=92689,assembly=hg19> +##contig=<ID=chr19_gl000209_random,length=159169,assembly=hg19> +##contig=<ID=chr1_gl000191_random,length=106433,assembly=hg19> +##contig=<ID=chr1_gl000192_random,length=547496,assembly=hg19> +##contig=<ID=chr2,length=243199373,assembly=hg19> +##contig=<ID=chr20,length=63025520,assembly=hg19> +##contig=<ID=chr21,length=48129895,assembly=hg19> +##contig=<ID=chr21_gl000210_random,length=27682,assembly=hg19> +##contig=<ID=chr22,length=51304566,assembly=hg19> +##contig=<ID=chr3,length=198022430,assembly=hg19> +##contig=<ID=chr4,length=191154276,assembly=hg19> +##contig=<ID=chr4_ctg9_hap1,length=590426,assembly=hg19> 
+##contig=<ID=chr4_gl000193_random,length=189789,assembly=hg19> +##contig=<ID=chr4_gl000194_random,length=191469,assembly=hg19> +##contig=<ID=chr5,length=180915260,assembly=hg19> +##contig=<ID=chr6,length=171115067,assembly=hg19> +##contig=<ID=chr6_apd_hap1,length=4622290,assembly=hg19> +##contig=<ID=chr6_cox_hap2,length=4795371,assembly=hg19> +##contig=<ID=chr6_dbb_hap3,length=4610396,assembly=hg19> +##contig=<ID=chr6_mann_hap4,length=4683263,assembly=hg19> +##contig=<ID=chr6_mcf_hap5,length=4833398,assembly=hg19> +##contig=<ID=chr6_qbl_hap6,length=4611984,assembly=hg19> +##contig=<ID=chr6_ssto_hap7,length=4928567,assembly=hg19> +##contig=<ID=chr7,length=159138663,assembly=hg19> +##contig=<ID=chr7_gl000195_random,length=182896,assembly=hg19> +##contig=<ID=chr8,length=146364022,assembly=hg19> +##contig=<ID=chr8_gl000196_random,length=38914,assembly=hg19> +##contig=<ID=chr8_gl000197_random,length=37175,assembly=hg19> +##contig=<ID=chr9,length=141213431,assembly=hg19> +##contig=<ID=chr9_gl000198_random,length=90085,assembly=hg19> +##contig=<ID=chr9_gl000199_random,length=169874,assembly=hg19> +##contig=<ID=chr9_gl000200_random,length=187035,assembly=hg19> +##contig=<ID=chr9_gl000201_random,length=36148,assembly=hg19> +##contig=<ID=chrM,length=16571,assembly=hg19> +##contig=<ID=chrUn_gl000211,length=166566,assembly=hg19> +##contig=<ID=chrUn_gl000212,length=186858,assembly=hg19> +##contig=<ID=chrUn_gl000213,length=164239,assembly=hg19> +##contig=<ID=chrUn_gl000214,length=137718,assembly=hg19> +##contig=<ID=chrUn_gl000215,length=172545,assembly=hg19> +##contig=<ID=chrUn_gl000216,length=172294,assembly=hg19> +##contig=<ID=chrUn_gl000217,length=172149,assembly=hg19> +##contig=<ID=chrUn_gl000218,length=161147,assembly=hg19> +##contig=<ID=chrUn_gl000219,length=179198,assembly=hg19> +##contig=<ID=chrUn_gl000220,length=161802,assembly=hg19> +##contig=<ID=chrUn_gl000221,length=155397,assembly=hg19> +##contig=<ID=chrUn_gl000222,length=186861,assembly=hg19> +##contig=<ID=chrUn_gl000223,length=180455,assembly=hg19> +##contig=<ID=chrUn_gl000224,length=179693,assembly=hg19> +##contig=<ID=chrUn_gl000225,length=211173,assembly=hg19> +##contig=<ID=chrUn_gl000226,length=15008,assembly=hg19> +##contig=<ID=chrUn_gl000227,length=128374,assembly=hg19> +##contig=<ID=chrUn_gl000228,length=129120,assembly=hg19> +##contig=<ID=chrUn_gl000229,length=19913,assembly=hg19> +##contig=<ID=chrUn_gl000230,length=43691,assembly=hg19> +##contig=<ID=chrUn_gl000231,length=27386,assembly=hg19> +##contig=<ID=chrUn_gl000232,length=40652,assembly=hg19> +##contig=<ID=chrUn_gl000233,length=45941,assembly=hg19> +##contig=<ID=chrUn_gl000234,length=40531,assembly=hg19> +##contig=<ID=chrUn_gl000235,length=34474,assembly=hg19> +##contig=<ID=chrUn_gl000236,length=41934,assembly=hg19> +##contig=<ID=chrUn_gl000237,length=45867,assembly=hg19> +##contig=<ID=chrUn_gl000238,length=39939,assembly=hg19> +##contig=<ID=chrUn_gl000239,length=33824,assembly=hg19> +##contig=<ID=chrUn_gl000240,length=41933,assembly=hg19> +##contig=<ID=chrUn_gl000241,length=42152,assembly=hg19> +##contig=<ID=chrUn_gl000242,length=43523,assembly=hg19> +##contig=<ID=chrUn_gl000243,length=43341,assembly=hg19> +##contig=<ID=chrUn_gl000244,length=39929,assembly=hg19> +##contig=<ID=chrUn_gl000245,length=36651,assembly=hg19> +##contig=<ID=chrUn_gl000246,length=38154,assembly=hg19> +##contig=<ID=chrUn_gl000247,length=36422,assembly=hg19> +##contig=<ID=chrUn_gl000248,length=39786,assembly=hg19> +##contig=<ID=chrUn_gl000249,length=38502,assembly=hg19> 
+##contig=<ID=chrX,length=155270560,assembly=hg19> +##contig=<ID=chrY,length=59373566,assembly=hg19> +##reference=file:///home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa +##SnpEffVersion="SnpEff 3.0g (build 2012-08-31), by Pablo Cingolani" +##SnpEffCmd="SnpEff -i vcf -o vcf GRCh37.66 /home/udp3f/cphg-home/projects/rs-exome/varCalling/2012-Feb-01/all.raw.nobaq.vcf " +##INFO=<ID=EFF,Number=.,Type=String,Description="Predicted effects for this variant.Format: 'Effect ( Effect_Impact | Functional_Class | Codon_Change | Amino_Acid_change| Amino_Acid_length | Gene_Name | Gene_BioType | Coding | Transcript | Exon [ | ERRORS | WARNINGS ] )' "> +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 1094PC0005 1094PC0009 1094PC0012 1094PC0013 +chr1 30860 . G C 33.46 . AC=2;AF=0.053;AN=38;BaseQRankSum=2.327;DP=49;Dels=0.00;FS=3.128;HRun=0;HaplotypeScore=0.6718;InbreedingCoeff=0.1005;MQ=36.55;MQ0=0;MQRankSum=0.217;QD=16.73;ReadPosRankSum=2.017;EFF=DOWNSTREAM(MODIFIER||||85|FAM138A|protein_coding|CODING|ENST00000417324|),DOWNSTREAM(MODIFIER|||||FAM138A|processed_transcript|CODING|ENST00000461467|),DOWNSTREAM(MODIFIER|||||MIR1302-10|miRNA|NON_CODING|ENST00000408384|),INTRON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000469289|),INTRON(MODIFIER|||||MIR1302-10|antisense|NON_CODING|ENST00000473358|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000423562|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000430492|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000438504|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000488147|),UPSTREAM(MODIFIER|||||WASH7P|unprocessed_pseudogene|NON_CODING|ENST00000538476|) GT:AD:DP:GQ:PL 0/0:7,0:7:15.04:0,15,177 0/0:2,0:2:3.01:0,3,39 0/0:6,0:6:12.02:0,12,143 0/0:4,0:4:9.03:0,9,119 +chr1 69270 . A G 2694.18 . AC=40;AF=1.000;AN=40;DP=83;Dels=0.00;FS=0.000;HRun=0;HaplotypeScore=0.0000;InbreedingCoeff=-0.0598;MQ=31.06;MQ0=0;QD=32.86;EFF=SYNONYMOUS_CODING(LOW|SILENT|tcA/tcG|S60|305|OR4F5|protein_coding|CODING|ENST00000335137|exon_1_69091_70008) GT:AD:DP:GQ:PL ./. ./. 1/1:0,3:3:9.03:106,9,0 1/1:0,6:6:18.05:203,18,0 +chr1 69511 . A G 77777.27 . AC=49;AF=0.875;AN=56;BaseQRankSum=0.150;DP=2816;DS;Dels=0.00;FS=21.286;HRun=0;HaplotypeScore=3.8956;InbreedingCoeff=0.0604;MQ=32.32;MQ0=0;MQRankSum=1.653;QD=27.68;ReadPosRankSum=2.261;EFF=NON_SYNONYMOUS_CODING(MODERATE|MISSENSE|Aca/Gca|T141A|305|OR4F5|protein_coding|CODING|ENST00000335137|exon_1_69091_70008) GT:AD:DP:GQ:PL ./. ./. 0/1:2,4:6:15.70:16,0,40 0/1:2,2:4:21.59:22,0,40 +chr1 874816 . C CT 1208.91 . 
AC=8;AF=0.105;AN=76;BaseQRankSum=-1.391;DP=785;FS=0.000;HRun=1;HaplotypeScore=68.5485;InbreedingCoeff=0.1619;MQ=57.70;MQ0=0;MQRankSum=-0.585;QD=13.14;ReadPosRankSum=-2.428;EFF=DOWNSTREAM(MODIFIER||||108|SAMD11|protein_coding|CODING|ENST00000437963|),DOWNSTREAM(MODIFIER||||178|SAMD11|protein_coding|CODING|ENST00000420190|),DOWNSTREAM(MODIFIER||||749|NOC2L|protein_coding|CODING|ENST00000327044|),DOWNSTREAM(MODIFIER|||||NOC2L|processed_transcript|CODING|ENST00000477976|),DOWNSTREAM(MODIFIER|||||NOC2L|processed_transcript|CODING|ENST00000483767|),FRAME_SHIFT(HIGH||-/T|-228?|681|SAMD11|protein_coding|CODING|ENST00000342066|exon_1_874655_874840),UPSTREAM(MODIFIER|||||SAMD11|processed_transcript|CODING|ENST00000474461|),UPSTREAM(MODIFIER|||||SAMD11|processed_transcript|CODING|ENST00000478729|),UPSTREAM(MODIFIER|||||SAMD11|retained_intron|CODING|ENST00000464948|),UPSTREAM(MODIFIER|||||SAMD11|retained_intron|CODING|ENST00000466827|) GT:AD:DP:GQ:PL 0/0:7,0:7:18.06:0,18,252 0/0:16,0:16:45.15:0,45,630 0/0:15,0:15:39.10:0,39,503 0/0:13,0:13:33.11:0,33,462 +chr1 879676 . G A 6715.87 . AC=63;AF=0.955;AN=66;BaseQRankSum=-0.665;DP=201;Dels=0.00;FS=0.000;HRun=3;HaplotypeScore=0.1046;InbreedingCoeff=-0.1218;MQ=56.71;MQ0=0;MQRankSum=-2.066;QD=33.41;ReadPosRankSum=-1.877;EFF=DOWNSTREAM(MODIFIER|||||NOC2L|processed_transcript|CODING|ENST00000496938|),DOWNSTREAM(MODIFIER|||||SAMD11|processed_transcript|CODING|ENST00000474461|),DOWNSTREAM(MODIFIER|||||SAMD11|processed_transcript|CODING|ENST00000478729|),DOWNSTREAM(MODIFIER|||||SAMD11|retained_intron|CODING|ENST00000464948|),DOWNSTREAM(MODIFIER|||||SAMD11|retained_intron|CODING|ENST00000466827|),EXON(MODIFIER|||||NOC2L|processed_transcript|CODING|ENST00000477976|),EXON(MODIFIER|||||NOC2L|processed_transcript|CODING|ENST00000483767|),UTR_3_PRIME(MODIFIER||||681|SAMD11|protein_coding|CODING|ENST00000342066|),UTR_3_PRIME(MODIFIER||||749|NOC2L|protein_coding|CODING|ENST00000327044|) GT:AD:DP:GQ:PL 1/1:0,6:7:18.05:218,18,0 1/1:0,7:7:21.05:262,21,0 1/1:0,8:8:24.07:308,24,0 1/1:0,5:5:15.05:187,15,0 +chr1 935492 . G T 41.57 . AC=4;AF=0.67;AN=6;BaseQRankSum=0.736;DP=3;Dels=0.00;FS=0.000;HRun=3;HaplotypeScore=0.0000;MQ=60.00;MQ0=0;MQRankSum=-0.736;QD=20.79;ReadPosRankSum=-0.736;EFF=START_GAINED(LOW||||247|HES4|protein_coding|CODING|ENST00000428771|exon_1_935072_935552),UPSTREAM(MODIFIER||||189|HES4|protein_coding|CODING|ENST00000484667|),UPSTREAM(MODIFIER||||221|HES4|protein_coding|CODING|ENST00000304952|),UPSTREAM(MODIFIER|||||HES4|processed_transcript|CODING|ENST00000481869|) GT:AD:DP:GQ:PL ./. ./. 1/1:0,1:1:3.01:39,3,0 ./. +chr1 1334052 . CTAGAG C 5078.01 . 
AC=3;AF=0.039;AN=76;BaseQRankSum=12.748;DP=4561;DS;FS=8.748;HRun=0;HaplotypeScore=609.0108;InbreedingCoeff=-0.0411;MQ=59.18;MQ0=0;MQRankSum=-15.062;QD=20.31;ReadPosRankSum=-1.131;EFF=DOWNSTREAM(MODIFIER||||149|MRPL20|protein_coding|CODING|ENST00000344843|),DOWNSTREAM(MODIFIER|||||MRPL20|processed_transcript|CODING|ENST00000487659|),DOWNSTREAM(MODIFIER|||||MRPL20|processed_transcript|CODING|ENST00000492508|),DOWNSTREAM(MODIFIER|||||MRPL20|processed_transcript|CODING|ENST00000493287|),DOWNSTREAM(MODIFIER|||||RP4-758J18.5.1|processed_transcript|NON_CODING|ENST00000514958|),EXON(MODIFIER|||||CCNL2|processed_transcript|CODING|ENST00000497013|),INTRON(MODIFIER||||226|CCNL2|protein_coding|CODING|ENST00000408918|),INTRON(MODIFIER||||520|CCNL2|protein_coding|CODING|ENST00000400809|),INTRON(MODIFIER|||||CCNL2|nonsense_mediated_decay|CODING|ENST00000425598|),INTRON(MODIFIER|||||CCNL2|nonsense_mediated_decay|CODING|ENST00000481223|),INTRON(MODIFIER|||||CCNL2|nonsense_mediated_decay|CODING|ENST00000488340|),INTRON(MODIFIER|||||CCNL2|nonsense_mediated_decay|CODING|ENST00000496007|),SPLICE_SITE_ACCEPTOR(HIGH|||||CCNL2|nonsense_mediated_decay|CODING|ENST00000488340|),UPSTREAM(MODIFIER|||||CCNL2|processed_transcript|CODING|ENST00000463895|),UPSTREAM(MODIFIER|||||CCNL2|processed_transcript|CODING|ENST00000471930|),UPSTREAM(MODIFIER|||||CCNL2|processed_transcript|CODING|ENST00000482621|),UPSTREAM(MODIFIER|||||CCNL2|retained_intron|CODING|ENST00000473872|),UPSTREAM(MODIFIER|||||RP4-758J18.2.1|processed_transcript|NON_CODING|ENST00000418833|),UPSTREAM(MODIFIER|||||RP4-758J18.2.1|processed_transcript|NON_CODING|ENST00000444362|),UPSTREAM(MODIFIER|||||RP4-758J18.2.1|processed_transcript|NON_CODING|ENST00000447725|),UPSTREAM(MODIFIER|||||RP4-758J18.2.1|processed_transcript|NON_CODING|ENST00000448629|),UPSTREAM(MODIFIER|||||RP4-758J18.3.1|processed_transcript|NON_CODING|ENST00000453521|) GT:AD:DP:GQ:PL 0/0:116,0:116:99:0,310,8807 0/0:106,0:106:99:0,275,7825 0/1:59,38:97:99:2157,0,3681 0/1:61,38:99:99:2174,0,3121 +chr1 31896608 . C T 1690.17 . 
AC=3;AF=0.039;AN=76;BaseQRankSum=-1.970;DP=1774;Dels=0.00;FS=1.532;HRun=0;HaplotypeScore=2.0440;InbreedingCoeff=-0.0492;MQ=58.72;MQ0=0;MQRankSum=0.940;QD=12.25;ReadPosRankSum=0.534;EFF=EXON(MODIFIER|||||SERINC2|processed_transcript|CODING|ENST00000487207|),EXON(MODIFIER|||||SERINC2|processed_transcript|CODING|ENST00000491976|),SYNONYMOUS_CODING(LOW|SILENT|acC/acT|T36|455|SERINC2|protein_coding|CODING|ENST00000373709|exon_1_31896540_31896701),SYNONYMOUS_CODING(LOW|SILENT|acC/acT|T40|459|SERINC2|protein_coding|CODING|ENST00000536384|exon_1_31896540_31896701),SYNONYMOUS_CODING(LOW|SILENT|acC/acT|T40|459|SERINC2|protein_coding|CODING|ENST00000536859|exon_1_31896540_31896701),SYNONYMOUS_CODING(LOW|SILENT|acC/acT|T45|464|SERINC2|protein_coding|CODING|ENST00000373710|exon_1_31896540_31896701) GT:AD:DP:GQ:PL 0/0:3,0:3:3.01:0,3,36 0/0:3,0:3:6.02:0,6,75 0/0:10,0:10:24.06:0,24,282 0/0:5,0:5:15.04:0,15,181 diff --git a/test/sample/test-sample.sh b/test/sample/test-sample.sh index 5a65085cb5ea6a7ebf4f3581fa422069272336ed..0994994627de09915e29463f4c7c059173b18e86 100644 --- a/test/sample/test-sample.sh +++ b/test/sample/test-sample.sh @@ -40,21 +40,19 @@ rm obs ############################################################ echo " sample.new.t02...\c" echo "***** ERROR: Unrecognized parameter: -wrongArg *****" > exp -$BT sample -wrongArg 2>&1 > /dev/null | head -2 | tail -1 > obs +$BT sample -i mainFile.bed -wrongArg 2>&1 > /dev/null | head -2 | tail -1 > obs check obs exp rm obs exp - ########################################################### -# Test that we throw an error when no input file is given +# Test that we throw an error when no input file was given. ############################################################ echo " sample.new.t03...\c" -echo "***** ERROR: input file not specified. *****" > exp +echo "***** ERROR: No input file given. Exiting. *****" > exp; $BT sample -n 10 2>&1 > /dev/null | head -2 | tail -1 > obs check obs exp rm obs exp - ########################################################### # Test that we throw an error for -i without input file ############################################################ @@ -116,6 +114,9 @@ $BT sample -i mainFile.bed -n 10 -header | head -1 > obs check obs exp rm obs exp + + + rm mainFile.bed
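Reviewer note: below is a minimal usage sketch of the new Tokenizer class added in src/utils/general/Tokenizer.h by this patch. It is not part of the diff; the tab-delimited input line, the field count of 6, and construction of a QuickString from a C string are illustrative assumptions (the removed ParseTools::Tokenize pushed char buffers into a vector<QuickString>, which suggests such a constructor exists).

#include "QuickString.h"
#include "Tokenizer.h"
#include <cstdio>

int main() {
    Tokenizer tok;
    // Optional hint: pre-allocates element slots when the column count is known.
    tok.setNumExpectedItems(6);
    // Keep a trailing field even if the buffer ends without a closing delimiter.
    tok.setKeepFinalIncompleteElem(Tokenizer::USE_NOW);

    // Hypothetical BED6-style record; any delimited QuickString works.
    QuickString line("chr1\t10\t50\ta1f\t2\t+");
    int numFields = tok.tokenize(line, '\t');

    // Elements are reused across calls, so only the first getNumValidElems() are current.
    for (int i = 0; i < tok.getNumValidElems(); i++) {
        printf("field %d: %s\n", i, tok.getElem(i).c_str());
    }
    return numFields == 6 ? 0 : 1;
}

This replaces the ad hoc Tokenize() helper removed from ParseTools.cpp: rather than copying into an 8KB stack buffer on every call, the class reuses heap-allocated QuickString elements between calls, which is why getNumValidElems() rather than getNumTotalElems() bounds the loop above.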