diff --git a/Makefile b/Makefile index bb5c26a58508c5849a3c793cdbb32e8218b11a49..4f3ec3496e52474890704e555b796660254b466b 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/intersectBed \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ + $(SRC_DIR)/mapBed \ $(SRC_DIR)/mergeBed \ $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/multiIntersectBed \ diff --git a/src/bedToBam/bedToBam.cpp b/src/bedToBam/bedToBam.cpp index a58389f334d0b9a925b26973041d78a04c2b3bb0..c9f2ab734bfb0ea09c63cc740437ebea0faaa665 100644 --- a/src/bedToBam/bedToBam.cpp +++ b/src/bedToBam/bedToBam.cpp @@ -247,15 +247,15 @@ void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::le else{ // does it smell like BED12? if so, process it. - if (bed.otherFields.size() == 6) { + if (bed.fields.size() == 6) { // extract the relevant BED fields to convert BED12 to BAM // namely: blockCount, blockStarts, blockEnds - unsigned int blockCount = atoi(bed.otherFields[3].c_str()); + unsigned int blockCount = atoi(bed.fields[9].c_str()); vector<int> blockSizes, blockStarts; - Tokenize(bed.otherFields[4], blockSizes, ","); - Tokenize(bed.otherFields[5], blockStarts, ","); + Tokenize(bed.fields[10], blockSizes, ","); + Tokenize(bed.fields[11], blockStarts, ","); // make sure this is a well-formed BED12 entry. if (blockSizes.size() != blockCount) { diff --git a/src/mapBed/Makefile b/src/mapBed/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0e2436727c8714c3d273682c1b1464d79088cb0a --- /dev/null +++ b/src/mapBed/Makefile @@ -0,0 +1,49 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/chromsweep \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= mapMain.cpp mapBed.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o chromsweep.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= mapBed + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/mapBed/mapBed.cpp b/src/mapBed/mapBed.cpp new file mode 100644 index 0000000000000000000000000000000000000000..382cb1ba9afc3397e20eb7aea6964c9b88316590 --- /dev/null +++ b/src/mapBed/mapBed.cpp @@ -0,0 +1,72 @@ +/***************************************************************************** + mapBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "mapBed.h" + + +// Constructor +BedMap::BedMap(string bedAFile, string bedBFile, int column, string operation, + float overlapFraction, bool sameStrand, + bool diffStrand, bool reciprocal, + bool printHeader) { + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _column = column - 1; + _operation = operation; + _overlapFraction = overlapFraction; + _sameStrand = sameStrand; + _diffStrand = diffStrand; + _reciprocal = reciprocal; + _printHeader = printHeader; + + Map(); +} + +// Destructor +BedMap::~BedMap(void) { +} + +void BedMap::Map() { + + // create new BED file objects for A and B + _bedA = new BedFile(_bedAFile); + _bedB = new BedFile(_bedBFile); + + // use the chromsweep algorithm to detect overlaps on the fly. + ChromSweep sweep = ChromSweep(_bedB, _bedA, _sameStrand, _diffStrand, _printHeader); + + pair<BED, vector<BED> > hit_set; + hit_set.second.reserve(100000); + while (sweep.Next(hit_set)) { + ApplyHits(hit_set.first, hit_set.second); + } +} + + +// *************************************** +// ************** TO DO ************** +// move all of this logic into bedFile.cpp +// so that hits aren't looped through twice. +// *************************************** + +void BedMap::ApplyHits(const BED &a, const vector<BED> &hits) { + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + cout << h->fields[_column] << " " << h->fields.size() << endl; + } +} + + + diff --git a/src/mapBed/mapBed.h b/src/mapBed/mapBed.h new file mode 100644 index 0000000000000000000000000000000000000000..1f037293536863841765782adb8e5df2db5ab655 --- /dev/null +++ b/src/mapBed/mapBed.h @@ -0,0 +1,73 @@ +/***************************************************************************** + mapBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef MAPBED_H +#define MAPBED_H + +#include "bedFile.h" +#include "chromsweep.h" +#include "api/BamReader.h" +#include "api/BamWriter.h" +#include "api/BamAux.h" +#include "BamAncillary.h" +using namespace BamTools; + + +#include <vector> +#include <iostream> +#include <fstream> +#include <stdlib.h> +using namespace std; + + + +class BedMap { + +public: + + // constructor + BedMap(string bedAFile, string bedBFile, int column, string operation, + float overlapFraction, bool sameStrand, + bool diffStrand, bool reciprocal, + bool printHeader); + + // destructor + ~BedMap(void); + +private: + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + int _column; + string _operation; + bool _sameStrand; + bool _diffStrand; + bool _reciprocal; + float _overlapFraction; + + bool _printHeader; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; + + //------------------------------------------------ + // private methods + //------------------------------------------------ + void Map(); + + void ApplyHits(const BED &a, const vector<BED> &hits); + +}; + +#endif /* MAPBED_H */ diff --git a/src/mapBed/mapMain.cpp b/src/mapBed/mapMain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d40d4be8a20e83db80c4aa91ebe5ab9906aa2332 --- /dev/null +++ b/src/mapBed/mapMain.cpp @@ -0,0 +1,189 @@ +/***************************************************************************** + mapMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "mapBed.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools map" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void map_help(void); + +int map_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + int column = 5; + string operation = "sum"; + + // input arguments + float overlapFraction = 1E-9; + + bool haveBedA = false; + bool haveBedB = false; + bool haveColumn = false; + bool haveOperation = false; + bool haveFraction = false; + bool reciprocalFraction = false; + bool sameStrand = false; + bool diffStrand = false; + bool printHeader = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) map_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-c", 2, parameterLength)) { + if ((i+1) < argc) { + haveColumn = true; + column = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-o", 2, parameterLength)) { + if ((i+1) < argc) { + haveOperation = true; + operation = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-f", 2, parameterLength)) { + if ((i+1) < argc) { + haveFraction = true; + overlapFraction = atof(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-r", 2, parameterLength)) { + reciprocalFraction = true; + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + sameStrand = true; + } + else if (PARAMETER_CHECK("-S", 2, parameterLength)) { + diffStrand = true; + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl; + showHelp = true; + } + + if (reciprocalFraction && !haveFraction) { + cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl; + showHelp = true; + } + + if (sameStrand && diffStrand) { + cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + + BedMap *bm = new BedMap(bedAFile, bedBFile, column, operation, + overlapFraction, sameStrand, + diffStrand, reciprocalFraction, + printHeader); + delete bm; + return 0; + } + else { + map_help(); + return 0; + } +} + +void map_help(void) { + + cerr << "\nTool: bedtools map (aka mapBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Summarize interval overlaps between two feature files." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-c\t" << "Which column?." << endl << endl; + + cerr << "\t-o\t" << "Which operation?." << endl << endl; + + cerr << "\t-f\t" << "Minimum overlap required as a fraction of A." << endl; + cerr << "\t\t- Default is 1E-9 (i.e., 1bp)." << endl; + cerr << "\t\t- FLOAT (e.g. 0.50)" << endl << endl; + + cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl; + cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl; + cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl; + + cerr << "\t-s\t" << "Require same strandedness. That is, only report hits in B" << endl; + cerr << "\t\tthat overlap A on the _same_ strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-S\t" << "Require different strandedness. That is, only report hits in B" << endl; + cerr << "\t\tthat overlap A on the _opposite_ strand." << endl; + cerr << "\t\t- By default, overlaps are reported without respect to strand." << endl << endl; + + cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl; + + // end the program here + exit(1); + +} diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 58d52140638459ffe98cc40cb1bc7431b43b8c9a..ddece960a039b539838fda22b8b411528820eedf 100644 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -161,8 +161,8 @@ bool PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityH b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, b1.bed.name.c_str(), b1.bed.score.c_str(), b1.bed.strand.c_str(), b2.bed.strand.c_str()); - for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) - printf("\t%s", b1.bed.otherFields[i].c_str()); + for (size_t i = 0; i < b1.bed.other_idxs.size(); ++i) + printf("\t%s", b1.bed.fields[b1.bed.other_idxs[i]].c_str()); printf("\n"); } } @@ -195,8 +195,8 @@ void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &quality b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end, b1.bed.name.c_str(), b1.bed.score.c_str(), b1.bed.strand.c_str(), b2.bed.strand.c_str()); - for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) - printf("\t%s", b1.bed.otherFields[i].c_str()); + for (size_t i = 0; i < b1.bed.other_idxs.size(); ++i) + printf("\t%s", b1.bed.fields[b1.bed.other_idxs[i]].c_str()); printf("\n"); } else { @@ -207,8 +207,8 @@ void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &quality b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end, b1.bed.name.c_str(), b1.bed.score.c_str(), b1.bed.strand.c_str(), b1.mate->bed.strand.c_str()); - for (size_t i = 0; i < b1.bed.otherFields.size(); ++i) - printf("\t%s", b1.bed.otherFields[i].c_str()); + for (size_t i = 0; i < b1.bed.other_idxs.size(); ++i) + printf("\t%s", b1.bed.fields[b1.bed.other_idxs[i]].c_str()); printf("\n"); } } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 0c5df4d2a4d500d9b2f0352ba79a83be2285b5b0..1e152a3955fa2ca9aa449db6db4b14bd10e1c85f 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -17,12 +17,12 @@ Helper functions *************************************************/ void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks) { - if (bed.otherFields.size() < 6) { + if (bed.fields.size() < 6) { cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns." << endl; exit(1); } - int blockCount = atoi(bed.otherFields[3].c_str()); + int blockCount = atoi(bed.fields[9].c_str()); if ( blockCount <= 0 ) { cerr << "Input error: found interval having <= 0 blocks." << endl; exit(1); @@ -33,8 +33,8 @@ void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks) { } else { // get the comma-delimited strings for the BED12 block starts and block ends. - string blockSizes(bed.otherFields[4]); - string blockStarts(bed.otherFields[5]); + string blockSizes(bed.fields[10]); + string blockStarts(bed.fields[11]); vector<int> sizes; vector<int> starts; @@ -50,7 +50,8 @@ void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks) { for (UINT i = 0; i < (UINT) blockCount; ++i) { CHRPOS blockStart = bed.start + starts[i]; CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; - BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields); + BED currBedBlock(bed.chrom, blockStart, blockEnd, + bed.name, bed.score, bed.strand, bed.fields, bed.other_idxs); bedBlocks.push_back(currBedBlock); } } @@ -651,7 +652,8 @@ void BedFile::loadBedCovFileIntoMap() { bedCov.name = bedEntry.name; bedCov.score = bedEntry.score; bedCov.strand = bedEntry.strand; - bedCov.otherFields = bedEntry.otherFields; + bedCov.fields = bedEntry.fields; + bedCov.other_idxs = bedEntry.other_idxs; bedCov.zeroLength = bedEntry.zeroLength; bedCov.count = 0; bedCov.minOverlapStart = INT_MAX; @@ -677,7 +679,8 @@ void BedFile::loadBedCovListFileIntoMap() { bedCovList.name = bedEntry.name; bedCovList.score = bedEntry.score; bedCovList.strand = bedEntry.strand; - bedCovList.otherFields = bedEntry.otherFields; + bedCovList.fields = bedEntry.fields; + bedCovList.other_idxs = bedEntry.other_idxs; bedCovList.zeroLength = bedEntry.zeroLength; bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList); diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 9777e619eeee3a051e798d22ea5e32590bfb8df5..e860f870239a7208b7e8c39d113521d405be526e 100644 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -85,11 +85,11 @@ struct BED { string name; string score; string strand; - - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // experimental fields for the FJOIN approach. + // all of the original fields in the record + vector<string> fields; + // indices of the "other" fields + vector<uint16_t> other_idxs; + // is this a zero length feature: i.e., start == end bool zeroLength; public: @@ -103,7 +103,8 @@ public: name(""), score(""), strand(""), - otherFields(), + fields(), + other_idxs(), zeroLength(false) {} @@ -115,7 +116,8 @@ public: name(""), score(""), strand(""), - otherFields(), + fields(), + other_idxs(), zeroLength(false) {} @@ -127,7 +129,8 @@ public: name(""), score(""), strand(strand), - otherFields(), + fields(), + other_idxs(), zeroLength(false) {} @@ -140,20 +143,22 @@ public: name(name), score(score), strand(strand), - otherFields(), + fields(), + other_idxs(), zeroLength(false) {} // BEDALL BED(string chrom, CHRPOS start, CHRPOS end, string name, - string score, string strand, vector<string> otherFields) + string score, string strand, vector<string> fields, vector<uint16_t> other_idxs) : chrom(chrom), start(start), end(end), name(name), score(score), strand(strand), - otherFields(otherFields), + fields(fields), + other_idxs(other_idxs), zeroLength(false) {} @@ -189,10 +194,11 @@ struct BEDCOV { string score; string strand; - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // flag a zero-length feature + // all of the original fields in the record + vector<string> fields; + // indices of the "other" fields + vector<uint16_t> other_idxs; + // is this a zero length feature: i.e., start == end bool zeroLength; // Additional fields specific to computing coverage @@ -211,7 +217,8 @@ struct BEDCOV { name(""), score(""), strand(""), - otherFields(), + fields(), + other_idxs(), zeroLength(false), depthMap(), count(0), @@ -234,10 +241,11 @@ struct BEDCOVLIST { string score; string strand; - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // flag a zero-length feature + // all of the original fields in the record + vector<string> fields; + // indices of the "other" fields + vector<uint16_t> other_idxs; + // is this a zero length feature: i.e., start == end bool zeroLength; // Additional fields specific to computing coverage @@ -256,7 +264,8 @@ struct BEDCOVLIST { name(""), score(""), strand(""), - otherFields(), + fields(), + other_idxs(), zeroLength(false), depthMapList(), counts(0), @@ -594,6 +603,7 @@ private: // process as long as the number of fields in this // line matches what we expect for this file. if (numFields == this->bedType) { + bed.fields = lineVector; bed.chrom = lineVector[0]; int i; i = atoi(lineVector[1].c_str()); @@ -633,7 +643,7 @@ private: bed.score = lineVector[4]; bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); + bed.other_idxs.push_back(i); } } else if (this->bedType != 3) { @@ -673,6 +683,7 @@ private: template <typename T> inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int _lineNum, unsigned int numFields) { if (numFields == this->bedType) { + bed.fields = lineVector; bed.chrom = lineVector[0]; bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. @@ -686,7 +697,7 @@ private: if (this->bedType > 2) { for (unsigned int i = 2; i < numFields; ++i) - bed.otherFields.push_back(lineVector[i]); + bed.other_idxs.push_back(i); } if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { @@ -724,6 +735,7 @@ private: template <typename T> inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { if (numFields == this->bedType) { + bed.fields = lineVector; if (this->bedType >= 8 && _isGff) { bed.chrom = lineVector[0]; if (isInteger(lineVector[3])) @@ -733,11 +745,11 @@ private: bed.name = lineVector[2]; bed.score = lineVector[5]; bed.strand = lineVector[6].c_str(); - bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED - bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED + bed.other_idxs.push_back(1); // add GFF "source". unused in BED + bed.other_idxs.push_back(7); // add GFF "fname". unused in BED // handle the optional 9th field. if (this->bedType == 9) - bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED + bed.other_idxs.push_back(8); // add GFF "group". unused in BED bed.start--; } else { @@ -813,11 +825,10 @@ public: else if (this->bedType > 6) { printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("%s\t", bed.fields[*othIt].c_str()); } } } @@ -825,27 +836,28 @@ public: else if (_isGff == false && _isVcf == true) { printf ("%s\t%d\t", bed.chrom.c_str(), start+1); - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("%s\t", bed.fields[*othIt].c_str()); } } // GFF else if (_isGff == true) { // "GFF-8" if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), bed.name.c_str(), start+1, end, bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); + bed.fields[bed.other_idxs[1]].c_str()); } // "GFF-9" else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), bed.name.c_str(), start+1, end, bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + bed.fields[bed.other_idxs[1]].c_str(), + bed.fields[bed.other_idxs[2]].c_str()); } } } @@ -893,22 +905,22 @@ public: printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), bed.score.c_str(), bed.strand.c_str()); - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); + printf("\t%s", bed.fields[*othIt].c_str()); } printf("\n"); } } // VCF else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), start+1); + printf ("%s\t%d", bed.chrom.c_str(), start+1); - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("\t%s", bed.fields[*othIt].c_str()); } printf("\n"); } @@ -916,17 +928,18 @@ public: else if (_isGff == true) { // "GFF-8" if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), bed.name.c_str(), start+1, end, bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); + bed.fields[bed.other_idxs[1]].c_str()); } // "GFF-9" else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), bed.name.c_str(), start+1, end, bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + bed.fields[bed.other_idxs[1]].c_str(), + bed.fields[bed.other_idxs[2]].c_str()); } } } @@ -970,39 +983,38 @@ public: else if (this->bedType > 6) { printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("%s\t", bed.fields[*othIt].c_str()); } } } // VCF else if (_isGff == false && _isVcf == true) { printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("%s\t", bed.fields[*othIt].c_str()); } } // GFF else if (_isGff == true) { // "GFF-8" if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.fields[bed.other_idxs[1]].c_str()); } // "GFF-9" else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.fields[bed.other_idxs[1]].c_str(), + bed.fields[bed.other_idxs[2]].c_str()); } } } @@ -1047,40 +1059,40 @@ public: printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), bed.score.c_str(), bed.strand.c_str()); - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); + printf("\t%s", bed.fields[*othIt].c_str()); } printf("\n"); } } // VCF else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); + printf ("%s\t%d", bed.chrom.c_str(), bed.start+1); + vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); + printf("\t%s", bed.fields[*othIt].c_str()); } printf("\n"); } // GFF else if (_isGff == true) { - // "GFF-9" + // "GFF-8" if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.fields[bed.other_idxs[1]].c_str()); } - // "GFF-8" + // "GFF-9" else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.fields[bed.other_idxs[1]].c_str(), + bed.fields[bed.other_idxs[2]].c_str()); } } } diff --git a/src/utils/bedFile/bedFile.h.orig b/src/utils/bedFile/bedFile.h.orig deleted file mode 100644 index 08a8faca482f63632add4c9bb45bdab5da6b1465..0000000000000000000000000000000000000000 --- a/src/utils/bedFile/bedFile.h.orig +++ /dev/null @@ -1,1144 +0,0 @@ -/***************************************************************************** - bedFile.h - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licensed under the GNU General Public License 2.0 license. -******************************************************************************/ -#ifndef BEDFILE_H -#define BEDFILE_H - -// "local" includes -#include "gzstream.h" -#include "lineFileUtilities.h" -#include "fileType.h" - -// standard includes -#include <vector> -#include <map> -#include <set> -#include <string> -#include <iostream> -#include <fstream> -#include <sstream> -#include <cstring> -#include <algorithm> -#include <limits.h> -#include <stdint.h> -#include <cstdio> -//#include <tr1/unordered_map> // Experimental. -using namespace std; - - -//************************************************* -// Data type tydedef -//************************************************* -typedef uint32_t CHRPOS; -typedef uint16_t BINLEVEL; -typedef uint32_t BIN; -typedef uint16_t USHORT; -typedef uint32_t UINT; - -//************************************************* -// Genome binning constants -//************************************************* - -const BIN _numBins = 37450; -const BINLEVEL _binLevels = 7; - -// bins range in size from 16kb to 512Mb -// Bin 0 spans 512Mbp, # Level 1 -// Bins 1-8 span 64Mbp, # Level 2 -// Bins 9-72 span 8Mbp, # Level 3 -// Bins 73-584 span 1Mbp # Level 4 -// Bins 585-4680 span 128Kbp # Level 5 -// Bins 4681-37449 span 16Kbp # Level 6 -const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; - -const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ -const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ - - -//************************************************* -// Common data structures -//************************************************* - -struct DEPTH { - UINT starts; - UINT ends; -}; - - -/* - Structure for regular BED records -*/ -struct BED { - - // Regular BED fields - string chrom; - CHRPOS start; - CHRPOS end; - string name; - string score; - string strand; - - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // experimental fields for the FJOIN approach. - bool zeroLength; - bool added; - bool finished; - // list of hits from another file. - vector<BED> overlaps; - -public: - // constructors - - // Null - BED() - : chrom(""), - start(0), - end(0), - name(""), - score(""), - strand(""), - otherFields(), - zeroLength(false), - added(false), - finished(false), - overlaps() - {} - - // BED3 - BED(string chrom, CHRPOS start, CHRPOS end) - : chrom(chrom), - start(start), - end(end), - name(""), - score(""), - strand(""), - otherFields(), - zeroLength(false), - added(false), - finished(false), - overlaps() - {} - - // BED4 - BED(string chrom, CHRPOS start, CHRPOS end, string strand) - : chrom(chrom), - start(start), - end(end), - name(""), - score(""), - strand(strand), - otherFields(), - zeroLength(false), - added(false), - finished(false), - overlaps() - {} - - // BED6 - BED(string chrom, CHRPOS start, CHRPOS end, string name, - string score, string strand) - : chrom(chrom), - start(start), - end(end), - name(name), - score(score), - strand(strand), - otherFields(), - zeroLength(false), - added(false), - finished(false), - overlaps() - {} - - // BEDALL - BED(string chrom, CHRPOS start, CHRPOS end, string name, - string score, string strand, vector<string> otherFields) - : chrom(chrom), - start(start), - end(end), - name(name), - score(score), - strand(strand), - otherFields(otherFields), - zeroLength(false), - added(false), - finished(false), - overlaps() - {} - - int size() { - return end-start; - } - -}; // BED - - -/* - Structure for each end of a paired BED record - mate points to the other end. -*/ -struct MATE { - BED bed; - int lineNum; - MATE *mate; -}; - - -/* - Structure for regular BED COVERAGE records -*/ -struct BEDCOV { - - string chrom; - - // Regular BED fields - CHRPOS start; - CHRPOS end; - string name; - string score; - string strand; - - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // flag a zero-length feature - bool zeroLength; - - // Additional fields specific to computing coverage - map<unsigned int, DEPTH> depthMap; - unsigned int count; - CHRPOS minOverlapStart; - - - public: - // constructors - // Null - BEDCOV() - : chrom(""), - start(0), - end(0), - name(""), - score(""), - strand(""), - otherFields(), - zeroLength(false), - depthMap(), - count(0), - minOverlapStart(0) - {} -}; - - -/* - Structure for BED COVERAGE records having lists of - multiple coverages -*/ -struct BEDCOVLIST { - - // Regular BED fields - string chrom; - CHRPOS start; - CHRPOS end; - string name; - string score; - string strand; - - // Add'l fields for BED12 and/or custom BED annotations - vector<string> otherFields; - - // flag a zero-length feature - bool zeroLength; - - // Additional fields specific to computing coverage - vector< map<unsigned int, DEPTH> > depthMapList; - vector<unsigned int> counts; - vector<CHRPOS> minOverlapStarts; - - - public: - // constructors - // Null - BEDCOVLIST() - : chrom(""), - start(0), - end(0), - name(""), - score(""), - strand(""), - otherFields(), - zeroLength(false), - depthMapList(), - counts(0), - minOverlapStarts(0) - {} -}; - - -// enum to flag the state of a given line in a BED file. -enum BedLineStatus -{ - BED_INVALID = -1, - BED_HEADER = 0, - BED_BLANK = 1, - BED_VALID = 2 -}; - -// enum to indicate the type of file we are dealing with -enum FileType -{ - BED_FILETYPE, - GFF_FILETYPE, - VCF_FILETYPE -}; - -//************************************************* -// Data structure typedefs -//************************************************* -typedef vector<BED> bedVector; -typedef vector<BEDCOV> bedCovVector; -typedef vector<MATE> mateVector; -typedef vector<BEDCOVLIST> bedCovListVector; - -typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; -typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; -typedef map<BIN, mateVector, std::less<BIN> > binsToMates; -typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; - -typedef map<string, binsToBeds, std::less<string> > masterBedMap; -typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; -typedef map<string, binsToMates, std::less<string> > masterMateMap; -typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; -typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; - - -// EXPERIMENTAL - wait for TR1 -// typedef vector<BED> bedVector; -// typedef vector<BEDCOV> bedCovVector; -// -// typedef tr1::unordered_map<BIN, bedVector> binsToBeds; -// typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs; -// -// typedef tr1::unordered_map<string, binsToBeds> masterBedMap; -// typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap; -// typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin; - - - -// return the genome "bin" for a feature with this start and end -inline -BIN getBin(CHRPOS start, CHRPOS end) { - --end; - start >>= _binFirstShift; - end >>= _binFirstShift; - - for (register short i = 0; i < _binLevels; ++i) { - if (start == end) return _binOffsetsExtended[i] + start; - start >>= _binNextShift; - end >>= _binNextShift; - } - cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; - return 0; -} - -/**************************************************** -// isInteger(s): Tests if string s is a valid integer -*****************************************************/ -inline bool isInteger(const std::string& s) { - int len = s.length(); - for (int i = 0; i < len; i++) { - if (!std::isdigit(s[i])) return false; - } - return true; -} - - -// return the amount of overlap between two features. Negative if none and the the -// number of negative bases is the distance between the two. -inline -int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { - return min(aE, bE) - max(aS, bS); -} - - -// Ancillary functions -void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); - - -// BED Sorting Methods -bool sortByChrom(const BED &a, const BED &b); -bool sortByStart(const BED &a, const BED &b); -bool sortBySizeAsc(const BED &a, const BED &b); -bool sortBySizeDesc(const BED &a, const BED &b); -bool sortByScoreAsc(const BED &a, const BED &b); -bool sortByScoreDesc(const BED &a, const BED &b); -bool byChromThenStart(BED const &a, BED const &b); - - - -//************************************************ -// BedFile Class methods and elements -//************************************************ -class BedFile { - -public: - - // Constructor - BedFile(string &); - - // Destructor - ~BedFile(void); - - // Open a BED file for reading (creates an istream pointer) - void Open(void); - - // Close an opened BED file. - void Close(void); - - // Get the next BED entry in an opened BED file. - BedLineStatus GetNextBed (BED &bed, int &lineNum); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs - void loadBedFileIntoMap(); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs - void loadBedCovFileIntoMap(); - - // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs - void loadBedCovListFileIntoMap(); - - // load a BED file into a map keyed by chrom. value is vector of BEDs - void loadBedFileIntoMapNoBin(); - - // Given a chrom, start, end and strand for a single feature, - // search for all overlapping features in another BED file. - // Searches through each relevant genome bin on the same chromosome - // as the single feature. Note: Adapted from kent source "binKeeperFind" - void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); - - // return true if at least one overlap was found. otherwise, return false. - bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction = 0.0); - - // return true if at least one __reciprocal__ overlap was found. otherwise, return false. - bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, - bool forceStrand, float overlapFraction = 0.0); - - // Given a chrom, start, end and strand for a single feature, - // increment a the number of hits for each feature in B file - // that the feature overlaps - void countHits(const BED &a, bool forceStrand = false, bool countsOnly = false); - - // same as above, but has special logic that processes a set of - // BED "blocks" from a single entry so as to avoid over-counting - // each "block" of a single BAM/BED12 as distinct coverage. That is, - // if one read has four block, we only want to count the coverage as - // coming from one read, not four. - void countSplitHits(const vector<BED> &bedBlock, bool forceStrand = false, bool countsOnly = false); - - // Given a chrom, start, end and strand for a single feature, - // increment a the number of hits for each feature in B file - // that the feature overlaps - void countListHits(const BED &a, int index, bool forceStrand); - - // the bedfile with which this instance is associated - string bedFile; - unsigned int bedType; // 3-6, 12 for BED - // 9 for GFF - bool isZeroBased; - - // Main data structires used by BEDTools - masterBedCovMap bedCovMap; - masterBedCovListMap bedCovListMap; - masterBedMap bedMap; - masterBedMapNoBin bedMapNoBin; - -private: - - // data - bool _isGff; - bool _isVcf; - bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF) - FileType _fileType; // what is the file type? (BED? GFF? VCF?) - istream *_bedStream; - string _bedLine; - vector<string> _bedFields; - - void setZeroBased(bool zeroBased); - void setGff (bool isGff); - void setVcf (bool isVcf); - void setFileType (FileType type); - void setBedType (int colNums); - - /****************************************************** - Private definitions to circumvent linker issues with - templated member functions. - *******************************************************/ - - /* - parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) - */ - template <typename T> - inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { - - //char *p2End, *p3End, *p4End, *p5End; - //long l2, l3, l4, l5; - unsigned int numFields = lineVector.size(); - - // bail out if we have a blank line - if (numFields == 0) { - return BED_BLANK; - } - - if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { - - if (numFields >= 3) { - // line parsing for all lines after the first non-header line - if (_typeIsKnown == true) { - switch(_fileType) { - case BED_FILETYPE: - if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - case VCF_FILETYPE: - if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - case GFF_FILETYPE: - if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - default: - printf("ERROR: file type encountered. Exiting\n"); - exit(1); - } - } - // line parsing for first non-header line: figure out file contents - else { - // it's BED format if columns 2 and 3 are integers - if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { - setGff(false); - setZeroBased(true); - setFileType(BED_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's VCF, assuming the second column is numeric and there are at least 8 fields. - else if (isInteger(lineVector[1]) && numFields >= 8) { - setGff(false); - setVcf(true); - setZeroBased(false); - setFileType(VCF_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. - else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { - setGff(true); - setZeroBased(false); - setFileType(GFF_FILETYPE); - setBedType(numFields); // we now expect numFields columns in each line - if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; - } - else { - cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << - "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; - exit(1); - } - } - } - else { - cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; - exit(1); - } - } - else { - lineNum--; - return BED_HEADER; - } - // default - return BED_INVALID; - } - - - /* - parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) - */ - template <typename T> - inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - - // process as long as the number of fields in this - // line matches what we expect for this file. - if (numFields == this->bedType) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()); - bed.end = atoi(lineVector[2].c_str()); - - // handle starts == end (e.g., insertions in reference genome) - if (bed.start == bed.end) { - bed.start--; - bed.end++; - bed.zeroLength = true; - } - - if (this->bedType == 4) { - bed.name = lineVector[3]; - } - else if (this->bedType == 5) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; - } - else if (this->bedType == 6) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - } - else if (this->bedType > 6) { - bed.name = lineVector[3]; - bed.score = lineVector[4]; - bed.strand = lineVector[5]; - for (unsigned int i = 6; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); - } - } - else if (this->bedType != 3) { - cerr << "Error: unexpected number of fields at line: " << lineNum - << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - - // sanity checks. - if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { - return true; - } - else if (bed.start > bed.end) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; - exit(1); - } - } - else if (numFields == 1) { - cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 3) && (numFields != 0)) { - cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; - } - - - /* - parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) - */ - template <typename T> - inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - if (numFields == this->bedType) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based - bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. - bed.strand = "+"; - // construct the name from the ref and alt alleles. - // if it's an annotated variant, add the rsId as well. - bed.name = lineVector[3] + "/" + lineVector[4]; - if (lineVector[2] != ".") { - bed.name += "_" + lineVector[2]; - } - - if (this->bedType > 2) { - for (unsigned int i = 2; i < numFields; ++i) - bed.otherFields.push_back(lineVector[i]); - } - - if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { - return true; - } - else if (bed.start > bed.end) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; - exit(1); - } - } - else if (numFields == 1) { - cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 2) && (numFields != 0)) { - cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; - } - - - - /* - parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) - */ - template <typename T> - inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { - if (numFields == this->bedType) { - if (this->bedType >= 8 && _isGff) { - bed.chrom = lineVector[0]; - bed.start = atoi(lineVector[3].c_str()); - bed.end = atoi(lineVector[4].c_str()); - bed.name = lineVector[2]; - bed.score = lineVector[5]; - bed.strand = lineVector[6].c_str(); - bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED - bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED - // handle the optional 9th field. - if (this->bedType == 9) - bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED - - // handle starts == end (e.g., insertions in reference genome) - if (bed.start == bed.end) { - bed.end++; - bed.zeroLength = true; - } - // GFF uses 1-based starts, covert to zero-based - bed.start--; - } - else { - cerr << "Error: unexpected number of fields at line: " << lineNum << - ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl; - exit(1); - } - if (bed.start > bed.end) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; - exit(1); - } - else if ( (bed.start < 0) || (bed.end < 0) ) { - cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; - exit(1); - } - else return true; - } - else if (numFields == 1) { - cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; - exit(1); - } - else if ((numFields != this->bedType) && (numFields != 0)) { - cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; - exit(1); - } - else if ((numFields < 8) && (numFields != 0)) { - cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; - exit(1); - } - return false; - } - - -public: - - /* - reportBedTab - - Writes the _original_ BED entry with a TAB - at the end of the line. - Works for BED3 - BED6. - */ - template <typename T> - inline void reportBedTab(const T &bed) { - - // if it is azeroLength feature, we need to - // correct the start and end coords to what they were - // in the original file - CHRPOS start = bed.start; - CHRPOS end = bed.end; - if (bed.zeroLength) { - if (_isGff == false) - start++; - end--; - } - - // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - // GFF - else if (_isGff == true) { - // "GFF-8" - if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); - } - // "GFF-9" - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } - } - } - - - - /* - reportBedNewLine - - Writes the _original_ BED entry with a NEWLINE - at the end of the line. - Works for BED3 - BED6. - */ - template <typename T> - inline void reportBedNewLine(const T &bed) { - - // if it is azeroLength feature, we need to - // correct the start and end coords to what they were - // in the original file - CHRPOS start = bed.start; - CHRPOS end = bed.end; - if (bed.zeroLength) { - if (_isGff == false) - start++; - end--; - } - - //BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - printf("\n"); - } - // GFF - else if (_isGff == true) { - // "GFF-8" - if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); - } - // "GFF-9" - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } - } - } - - - - /* - reportBedRangeNewLine - - Writes a custom start->end for a BED entry - with a NEWLINE at the end of the line. - - Works for BED3 - BED6. - */ - template <typename T> - inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { - - // if it is azeroLength feature, we need to - // correct the start and end coords to what they were - // in the original file - if (bed.zeroLength) { - start = bed.start + 1; - end = bed.end - 1; - } - - // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - } - // GFF - else if (_isGff == true) { - // "GFF-8" - if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); - } - // "GFF-9" - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } - } - } - - - - /* - reportBedRangeTab - - Writes a custom start->end for a BED entry - with a TAB at the end of the line. - - Works for BED3 - BED6. - */ - template <typename T> - inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { - - // if it is azeroLength feature, we need to - // correct the start and end coords to what they were - // in the original file - if (bed.zeroLength) { - start = bed.start + 1; - end = bed.end - 1; - } - - // BED - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); - } - else if (this->bedType == 4) { - printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); - } - else if (this->bedType == 5) { - printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str()); - } - else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - } - else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand.c_str()); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); - } - printf("\n"); - } - } - // VCF - else if (_isGff == false && _isVcf == true) { - printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); - - vector<string>::const_iterator othIt = bed.otherFields.begin(); - vector<string>::const_iterator othEnd = bed.otherFields.end(); - for ( ; othIt != othEnd; ++othIt) { - printf("%s\t", othIt->c_str()); - } - printf("\n"); - } - // GFF - else if (_isGff == true) { - // "GFF-9" - if (this->bedType == 8) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str()); - } - // "GFF-8" - else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), - bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand.c_str(), - bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); - } - } - } - - - /* - reportNullBedTab - */ - void reportNullBedTab() { - - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\t"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\t"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\t"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t.\t"); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf(".\t"); - } - } - } - else if (_isGff == true && _isVcf == false) { - if (this->bedType == 8) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t"); - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); - } - } - } - - - /* - reportNullBedTab - */ - void reportNullBedNewLine() { - - if (_isGff == false && _isVcf == false) { - if (this->bedType == 3) { - printf (".\t-1\t-1\n"); - } - else if (this->bedType == 4) { - printf (".\t-1\t-1\t.\n"); - } - else if (this->bedType == 5) { - printf (".\t-1\t-1\t.\t-1\n"); - } - else if (this->bedType == 6) { - printf (".\t-1\t-1\t.\t-1\t.\n"); - } - else if (this->bedType > 6) { - printf (".\t-1\t-1\t.\t-1\t."); - for (unsigned int i = 6; i < this->bedType; ++i) { - printf("\t."); - } - printf("\n"); - } - } - else if (_isGff == true && _isVcf == false) { - if (this->bedType == 8) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n"); - } - else if (this->bedType == 9) { - printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); - } - } - } - - -}; - -#endif /* BEDFILE_H */ diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 023a5f5862ab87b41fa181080fbfc4afb193eb4e..31e61788b80c7413164944e566fc6f4cdadfaf20 100644 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -106,11 +106,10 @@ void BedFilePE::reportBedPETab(const BEDPE &a) { printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, a.chrom2.c_str(), a.start2, a.end2, a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); + vector<uint16_t>::const_iterator othIt = a.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = a.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); + printf("\t%s", a.fields[*othIt].c_str()); } printf("\t"); } @@ -149,11 +148,10 @@ void BedFilePE::reportBedPENewLine(const BEDPE &a) { printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, a.chrom2.c_str(), a.start2, a.end2, a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - - vector<string>::const_iterator othIt = a.otherFields.begin(); - vector<string>::const_iterator othEnd = a.otherFields.end(); + vector<uint16_t>::const_iterator othIt = a.other_idxs.begin(); + vector<uint16_t>::const_iterator othEnd = a.other_idxs.end(); for ( ; othIt != othEnd; ++othIt) { - printf("\t%s", othIt->c_str()); + printf("\t%s", a.fields[*othIt].c_str()); } printf("\n"); } @@ -193,7 +191,7 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co if ((lineNum == 1) && (lineVector.size() >= 6)) { this->bedType = lineVector.size(); - + bed.fields = lineVector; if (this->bedType == 6) { bed.chrom1 = lineVector[0]; bed.start1 = atoi(lineVector[1].c_str()); @@ -263,7 +261,7 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co bed.strand2 = lineVector[9]; for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); + bed.other_idxs.push_back(i); } return true; } @@ -287,6 +285,8 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co } else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + bed.fields = lineVector; + if (this->bedType == 6) { bed.chrom1 = lineVector[0]; bed.start1 = atoi(lineVector[1].c_str()); @@ -354,9 +354,8 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co bed.strand1 = lineVector[8]; bed.strand2 = lineVector[9]; - for (unsigned int i = 10; i < lineVector.size(); ++i) { - bed.otherFields.push_back(lineVector[i]); + bed.other_idxs.push_back(i); } return true; } @@ -513,7 +512,8 @@ void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, bedEntry1->bed.name = bedpeEntry.name; bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory bedEntry1->bed.strand = bedpeEntry.strand1; - bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory + bedEntry1->bed.fields = bedpeEntry.fields; // only store the fields in end1 to save memory + bedEntry1->bed.other_idxs = bedpeEntry.other_idxs; // only store the other_idxs in end1 to save memory bedEntry1->lineNum = lineNum; bedEntry1->mate = bedEntry2; // keep a pointer to end2 diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index 00dedbefcdc6ea6f51a0ddd2f20add356fd4f0c5..4c4087cf95db76d5e81dcdb9f3738adab438fdf8 100644 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -35,7 +35,10 @@ struct BEDPE { string strand1; string strand2; - vector<string> otherFields; + // all of the original fields in the record + vector<string> fields; + // indices of the "other" fields + vector<uint16_t> other_idxs; };