From 79c6a2ff1ce584aecdfd2b7e943a29f826701b7b Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Tue, 8 Nov 2011 21:42:58 -0500 Subject: [PATCH] Added methods to store and report file headers upon request. --- src/intersectBed/intersectBed.cpp | 12 ++- src/intersectBed/intersectBed.h | 3 +- src/intersectBed/intersectMain.cpp | 10 +- src/utils/bedFile/bedFile.cpp | 57 ++++++++--- src/utils/bedFile/bedFile.h | 156 +++++++++++++++-------------- 5 files changed, 139 insertions(+), 99 deletions(-) diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index de7bd74b..6a48b880 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -68,7 +68,7 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, - bool sortedInput) { + bool sortedInput, bool printHeader) { _bedAFile = bedAFile; _bedBFile = bedBFile; @@ -88,6 +88,7 @@ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, _bamOutput = bamOutput; _isUncompressedBam = isUncompressedBam; _sortedInput = sortedInput; + _printHeader = printHeader; // should we print each overlap, or does the user want summary information? _printable = true; @@ -195,7 +196,6 @@ void BedIntersect::IntersectBed() { // compare each entry in A to it in search of overlaps. _bedB->loadBedFileIntoMap(); - int lineNum = 0; vector<BED> hits; hits.reserve(100); BED a, nullBed; @@ -203,7 +203,11 @@ void BedIntersect::IntersectBed() { // open the "A" file, process each BED entry and searh for overlaps. _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + // report A's header first if asked. + if (_printHeader == true) { + _bedA->PrintHeader(); + } + while ((bedStatus = _bedA->GetNextBed(a)) != BED_INVALID) { if (bedStatus == BED_VALID) { // treat the BED as a single "block" if (_obeySplits == false) { @@ -213,7 +217,7 @@ void BedIntersect::IntersectBed() { // split the BED12 into blocks and look for overlaps in each discrete block else { bedVector bedBlocks; // vec to store the discrete BED "blocks" - splitBedIntoBlocks(a, lineNum, bedBlocks); + splitBedIntoBlocks(a,bedBlocks); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index a996b155..b849446c 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -38,7 +38,7 @@ public: bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, - bool sortedInput); + bool sortedInput, bool printHeader); // destructor ~BedIntersect(void); @@ -70,6 +70,7 @@ private: bool _isUncompressedBam; bool _sortedInput; bool _printable; + bool _printHeader; // instance of a bed file class. BedFile *_bedA, *_bedB; diff --git a/src/intersectBed/intersectMain.cpp b/src/intersectBed/intersectMain.cpp index 300154d1..02a2ace7 100644 --- a/src/intersectBed/intersectMain.cpp +++ b/src/intersectBed/intersectMain.cpp @@ -54,6 +54,8 @@ int main(int argc, char* argv[]) { bool outputIsBam = true; bool uncompressedBam = false; bool sortedInput = false; + bool printHeader = false; + // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -145,7 +147,10 @@ int main(int argc, char* argv[]) { } else if(PARAMETER_CHECK("-sorted", 7, parameterLength)) { sortedInput = true; - } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -212,7 +217,8 @@ int main(int argc, char* argv[]) { BedIntersect *bi = new BedIntersect(bedAFile, bedBFile, anyHit, writeA, writeB, writeOverlap, writeAllOverlap, overlapFraction, noHit, writeCount, sameStrand, diffStrand, - reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam, sortedInput); + reciprocalFraction, obeySplits, inputIsBam, outputIsBam, uncompressedBam, + sortedInput, printHeader); delete bi; return 0; } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index ec303579..167ea3c3 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -15,16 +15,16 @@ /************************************************ Helper functions *************************************************/ -void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { +void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks) { if (bed.otherFields.size() < 6) { - cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl; + cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns." << endl; exit(1); } int blockCount = atoi(bed.otherFields[3].c_str()); if ( blockCount <= 0 ) { - cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl; + cerr << "Input error: found interval having <= 0 blocks." << endl; exit(1); } else if ( blockCount == 1 ) { @@ -42,7 +42,7 @@ void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) { Tokenize(blockStarts, starts, ","); if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { - cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl; + cerr << "Input error: found interval with block-counts not matching starts/sizes on line." << endl; exit(1); } @@ -151,6 +151,8 @@ void BedFile::Open(void) { exit (1); } } + // save the file's header (if there is one) + GetHeader(); } // Rewind the pointer back to the beginning of the file @@ -164,7 +166,7 @@ void BedFile::Seek(unsigned long offset) { } // Jump to a specific byte in the file -bool BedFile::Empty() { +bool BedFile::Empty(void) { return _bedStream->eof(); } @@ -173,23 +175,46 @@ void BedFile::Close(void) { if (bedFile != "stdin" && bedFile != "-") delete _bedStream; } +void BedFile::GetLine(void) { + // parse the bedStream pointer + getline(*_bedStream, _bedLine); + // increment the line number + _lineNum++; + // split into a string vector. + Tokenize(_bedLine, _bedFields); +} -BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum, bool forceSorted) { +// Extract and store the header for the file. +void BedFile::GetHeader(void) { + GetLine(); + while ((_bedLine.find("#") != string::npos) || + (_bedLine.find("browser") != string::npos) || + (_bedLine.find("track") != string::npos) ) + { + _header.push_back(_bedLine); + GetLine(); + } +} + +// Dump the header +void BedFile::PrintHeader(void) { + for (size_t i = 0; i < _header.size(); ++i) { + cout << _header[i] << endl; + } +} + + +BedLineStatus BedFile::GetNextBed(BED &bed, bool forceSorted) { // make sure there are still lines to process. // if so, tokenize, validate and return the BED entry. _bedFields.clear(); // clear out the previous bed's data if (_bedStream->good()) { - // parse the bedStream pointer - getline(*_bedStream, _bedLine); - lineNum++; - - // split into a string vector. - Tokenize(_bedLine, _bedFields); - + // read the next line in the file and parse into discrete fields + GetLine(); // load the BED struct as long as it's a valid BED entry. - BedLineStatus status = parseLine(bed, _bedFields, lineNum); + BedLineStatus status = parseLine(bed, _bedFields); if (!forceSorted) { return status; } @@ -220,13 +245,13 @@ BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum, bool forceSorted) { } -bool BedFile::GetNextMergedBed(BED &merged_bed, int &lineNum) { +bool BedFile::GetNextMergedBed(BED &merged_bed) { if (_bedStream->good()) { BED bed; BedLineStatus bedStatus; // force sorting; hence third param = true - while ((bedStatus = GetNextBed(bed, lineNum, true)) != BED_INVALID) { + while ((bedStatus = GetNextBed(bed, true)) != BED_INVALID) { if (bedStatus == BED_VALID) { if (((int) bed.start - _merged_end > 0) || (_merged_end < 0) || diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index e8c56b4e..0d964444 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -377,7 +377,7 @@ bool after(const BED &a, const BED &b) { // Ancillary functions -void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); +void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks); // BED Sorting Methods @@ -404,27 +404,34 @@ public: // Destructor ~BedFile(void); + /********* File management ********/ // Open a BED file for reading (creates an istream pointer) void Open(void); + // Close an opened BED file. + void Close(void); + + // are the any intervals left in the file? + bool Empty(void); + // Rewind the pointer back to the beginning of the file void Rewind(void); // Jump to a specific byte in the file void Seek(unsigned long offset); - - bool Empty(); + // dump the header, which is collected as part of Open() + void PrintHeader(void); - // Close an opened BED file. - void Close(void); + // get the next line in the file. splits a line in _bedFields + void GetLine(void); // Get the next BED entry in an opened BED file. - BedLineStatus GetNextBed (BED &bed, int &lineNum, bool forceSorted = false); + BedLineStatus GetNextBed (BED &bed, bool forceSorted = false); // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file // NOTE: assumes input file is sorted by chrom then start - bool GetNextMergedBed(BED &merged_bed, int &lineNum); + bool GetNextMergedBed(BED &merged_bed); // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs void loadBedFileIntoMap(); @@ -490,6 +497,8 @@ private: FileType _fileType; // what is the file type? (BED? GFF? VCF?) istream *_bedStream; string _bedLine; + int _lineNum; + vector<string> _header; vector<string> _bedFields; int _merged_start; int _merged_end; @@ -503,6 +512,9 @@ private: void setFileType (FileType type); void setBedType (int colNums); + /************ Private utilities ***********************/ + void GetHeader(void); + /****************************************************** Private definitions to circumvent linker issues with templated member functions. @@ -512,10 +524,8 @@ private: parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { + inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector) { - //char *p2End, *p3End, *p4End, *p5End; - //long l2, l3, l4, l5; unsigned int numFields = lineVector.size(); // bail out if we have a blank line @@ -523,65 +533,59 @@ private: return BED_BLANK; } - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { - - if (numFields >= 3) { - // line parsing for all lines after the first non-header line - if (_typeIsKnown == true) { - switch(_fileType) { - case BED_FILETYPE: - if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - case VCF_FILETYPE: - if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - case GFF_FILETYPE: - if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - default: - printf("ERROR: file type encountered. Exiting\n"); - exit(1); - } - } - // line parsing for first non-header line: figure out file contents - else { - // it's BED format if columns 2 and 3 are integers - if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { - setGff(false); - setZeroBased(true); - setFileType(BED_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's VCF, assuming the second column is numeric and there are at least 8 fields. - else if (isInteger(lineVector[1]) && numFields >= 8) { - setGff(false); - setVcf(true); - setZeroBased(false); - setFileType(VCF_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. - else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { - setGff(true); - setZeroBased(false); - setFileType(GFF_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - else { - cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << - "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; + + if (numFields >= 3) { + // line parsing for all lines after the first non-header line + if (_typeIsKnown == true) { + switch(_fileType) { + case BED_FILETYPE: + if (parseBedLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + case VCF_FILETYPE: + if (parseVcfLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + case GFF_FILETYPE: + if (parseGffLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + default: + printf("ERROR: file type encountered. Exiting\n"); exit(1); - } } } + // line parsing for first non-header line: figure out file contents else { - cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; - exit(1); + // it's BED format if columns 2 and 3 are integers + if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { + setGff(false); + setZeroBased(true); + setFileType(BED_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseBedLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + } + // it's VCF, assuming the second column is numeric and there are at least 8 fields. + else if (isInteger(lineVector[1]) && numFields >= 8) { + setGff(false); + setVcf(true); + setZeroBased(false); + setFileType(VCF_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseVcfLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + } + // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. + else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { + setGff(true); + setZeroBased(false); + setFileType(GFF_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseGffLine(bed, lineVector, _lineNum, numFields) == true) return BED_VALID; + } + else { + cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << + "Perhaps you have non-integer starts or ends at line " << _lineNum << "?" << endl; + exit(1); + } } } else { - lineNum--; - return BED_HEADER; + cerr << "It looks as though you have less than 3 columns at line: " << _lineNum << ". Are you sure your files are tab-delimited?" << endl; + exit(1); } // default return BED_INVALID; @@ -592,7 +596,7 @@ private: parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { + inline bool parseBedLine (T &bed, const vector<string> &lineVector, int _lineNum, unsigned int numFields) { // process as long as the number of fields in this // line matches what we expect for this file. @@ -601,13 +605,13 @@ private: int i; i = atoi(lineVector[1].c_str()); if (i<0) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl; + cerr << "Error: malformed BED entry at line " << _lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl; exit(1); } bed.start = (CHRPOS)i; i = atoi(lineVector[2].c_str()); if (i<0) { - cerr << "Error: malformed BED entry at line " << lineNum << ". End Coordinate detected that is < 0. Exiting." << endl; + cerr << "Error: malformed BED entry at line " << _lineNum << ". End Coordinate detected that is < 0. Exiting." << endl; exit(1); } bed.end = (CHRPOS)i; @@ -640,7 +644,7 @@ private: } } else if (this->bedType != 3) { - cerr << "Error: unexpected number of fields at line: " << lineNum + cerr << "Error: unexpected number of fields at line: " << _lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } @@ -650,20 +654,20 @@ private: return true; } else { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + cerr << "Error: malformed BED entry at line " << _lineNum << ". Start was greater than end. Exiting." << endl; exit(1); } } else if (numFields == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + cerr << "Only one BED field detected: " << _lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; + cerr << "Differing number of BED fields encountered at line: " << _lineNum << ". Exiting..." << endl; exit(1); } else if ((numFields < 3) && (numFields != 0)) { - cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; + cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< _lineNum << ". Exiting..." << endl; exit(1); } return false; @@ -674,7 +678,7 @@ private: parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ template <typename T> - inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { + inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int _lineNum, unsigned int numFields) { if (numFields == this->bedType) { bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based @@ -696,24 +700,24 @@ private: return true; } else if (bed.start > bed.end) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + cerr << "Error: malformed VCF entry at line " << _lineNum << ". Start was greater than end. Exiting." << endl; exit(1); } else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + cerr << "Error: malformed VCF entry at line " << _lineNum << ". Coordinate detected that is < 0. Exiting." << endl; exit(1); } } else if (numFields == 1) { - cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + cerr << "Only one VCF field detected: " << _lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; exit(1); } else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; + cerr << "Differing number of VCF fields encountered at line: " << _lineNum << ". Exiting..." << endl; exit(1); } else if ((numFields < 2) && (numFields != 0)) { - cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< _lineNum << ". Exiting..." << endl; exit(1); } return false; -- GitLab