diff --git a/Makefile b/Makefile index b6643ae6bd6124c5bbf975e3af7b1501d4a9328d..cb22e919fc74d38f6ba7bce285ef04b7626dad20 100644 --- a/Makefile +++ b/Makefile @@ -61,6 +61,7 @@ UTIL_SUBDIRS = $(SRC_DIR)/utils/lineFileUtilities \ $(SRC_DIR)/utils/tabFile \ $(SRC_DIR)/utils/BamTools \ $(SRC_DIR)/utils/BamTools-Ancillary \ + $(SRC_DIR)/utils/BlockedIntervals \ $(SRC_DIR)/utils/Fasta \ $(SRC_DIR)/utils/VectorOps \ $(SRC_DIR)/utils/genomeFile diff --git a/src/bamToBed/Makefile b/src/bamToBed/Makefile index c97dbb6d609469357290623a307e52d5573cdbca..38d1986da1c4ef6a9e77a56514cbfe3a3417809f 100644 --- a/src/bamToBed/Makefile +++ b/src/bamToBed/Makefile @@ -10,7 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/version/ @@ -19,7 +19,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ # ---------------------------------- SOURCES= bamToBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=BamAncillary.o +_EXT_OBJECTS=BamAncillary.o BlockedIntervals.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= bamToBed @@ -35,6 +35,7 @@ $(BUILT_OBJECTS): $(SOURCES) $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamAncillary/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ clean: @echo "Cleaning up." diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index ba7421d79a64a08176f484f6dbf4a462a168337a..344be8622634eec92fad740a74b4664dc6880a22 100644 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -11,7 +11,7 @@ ******************************************************************************/ #include "api/BamReader.h" #include "api/BamAux.h" -#include "BamAncillary.h" +#include "BlockedIntervals.h" #include "bedFile.h" #include "version.h" using namespace BamTools; @@ -289,50 +289,10 @@ void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) { } -void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { - - int currPosition = 0; - int blockLength = 0; - - // Rip through the CIGAR ops and figure out if there is more - // than one block for this alignment - vector<CigarOp>::const_iterator cigItr = cigar.begin(); - vector<CigarOp>::const_iterator cigEnd = cigar.end(); - for (; cigItr != cigEnd; ++cigItr) { - switch (cigItr->Type) { - case ('M') : - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('I') : break; - case ('S') : break; - case ('D') : break; - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('P') : break; - case ('N') : - blockStarts.push_back(currPosition + cigItr->Length); - blockLengths.push_back(blockLength); - currPosition += cigItr->Length; - blockLength = 0; - case ('H') : break; // for 'H' - do nothing, move to next op - default : - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - // add the kast block and set the - // alignment end (i.e., relative to the start) - blockLengths.push_back(blockLength); - alignmentEnd = currPosition; -} - - string BuildCigarString(const vector<CigarOp> &cigar) { stringstream cigarString; - for (size_t i = 0; i < cigar.size(); ++i) { - //cerr << cigar[i].Type << " " << cigar[i].Length << endl; switch (cigar[i].Type) { case ('M') : case ('I') : @@ -449,16 +409,18 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista // Report each chunk of the BAM alignment as a discrete BED entry // For example 10M100N10M would be reported as two seprate BED entries of length 10 else { + // parse the CIGAR string and figure out the alignment blocks vector<BED> bedBlocks; - // Load the alignment blocks in bam into the bedBlocks vector. - // Don't trigger a new block when a "D" (deletion) CIGAR op is found. - getBamBlocks(bam, refs, bedBlocks, false); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, - bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); + string chrom = refs.at(bam.RefID).RefName; + // extract the block starts and lengths from the CIGAR string + GetBamBlocks(bam, chrom, bedBlocks); + + unsigned int i; + for (i = 0; i < bedBlocks.size(); ++i) { + BED curr = bedBlocks[i]; + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", + chrom.c_str(), curr.start, curr.end, + name.c_str(), bam.MapQuality, strand.c_str()); } } } @@ -476,14 +438,11 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist if (bam.IsSecondMate()) name += "/2"; // parse the CIGAR string and figure out the alignment blocks - unsigned int alignmentEnd; - vector<int> blockLengths; - vector<int> blockStarts; - blockStarts.push_back(0); - + vector<BED> bedBlocks; + string chrom = refs.at(bam.RefID).RefName; + CHRPOS alignmentEnd = bam.GetEndPosition(); // extract the block starts and lengths from the CIGAR string - ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); - alignmentEnd += bam.Position; + GetBamBlocks(bam, chrom, bedBlocks); // write BED6 portion if (useEditDistance == false && bamTag == "") { @@ -514,20 +473,20 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist } // write the colors, etc. - printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size()); + printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) bedBlocks.size()); // now write the lengths portion unsigned int b; - for (b = 0; b < blockLengths.size() - 1; ++b) { - printf("%d,", blockLengths[b]); + for (b = 0; b < bedBlocks.size() - 1; ++b) { + printf("%d,", bedBlocks[b].end - bedBlocks[b].start); } - printf("%d\t", blockLengths[b]); + printf("%d\t", bedBlocks[b].end - bedBlocks[b].start); // now write the starts portion - for (b = 0; b < blockStarts.size() - 1; ++b) { - printf("%d,", blockStarts[b]); + for (b = 0; b < bedBlocks.size() - 1; ++b) { + printf("%d,", bedBlocks[b].start - bam.Position); } - printf("%d\n", blockStarts[b]); + printf("%d\n", bedBlocks[b].start - bam.Position); } diff --git a/src/bed12ToBed6/Makefile b/src/bed12ToBed6/Makefile index c5917207fd75b9cba0a8c5104fd6bdb49a5709fd..a5ff81cfbe4e5ea294c6c309530ab0b86b5c3afa 100644 --- a/src/bed12ToBed6/Makefile +++ b/src/bed12ToBed6/Makefile @@ -9,6 +9,8 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/gzstream/ \ -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/version/ # ---------------------------------- @@ -16,7 +18,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ # ---------------------------------- SOURCES= bed12ToBed6.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o +_EXT_OBJECTS=bedFile.o BlockedIntervals.o lineFileUtilities.o gzstream.o fileType.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= bed12ToBed6 @@ -35,6 +37,7 @@ $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ clean: @echo "Cleaning up." diff --git a/src/bed12ToBed6/bed12ToBed6.cpp b/src/bed12ToBed6/bed12ToBed6.cpp index 02dc483cac854b079fdc3a779aa53b8cdbd7b3ba..ea2f801282743a88f157962d038334dd77711577 100644 --- a/src/bed12ToBed6/bed12ToBed6.cpp +++ b/src/bed12ToBed6/bed12ToBed6.cpp @@ -10,6 +10,7 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include "lineFileUtilities.h" +#include "BlockedIntervals.h" #include "bedFile.h" #include "version.h" #include <vector> @@ -140,7 +141,7 @@ void ProcessBed(istream &bedInput, BedFile *bed) { if (bed->_status == BED_VALID) { bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - splitBedIntoBlocks(bedEntry, bedBlocks); + GetBedBlocks(bedEntry, bedBlocks); for (int i = 0; i < (int) bedBlocks.size(); ++i) { if (addBlockNums == false) { diff --git a/src/coverageBed/Makefile b/src/coverageBed/Makefile index ceef4b1e699840b69538c6804b62027f143cc035..dd43a5ee7da494f5d07ccdefe4352ec97456f04b 100644 --- a/src/coverageBed/Makefile +++ b/src/coverageBed/Makefile @@ -11,15 +11,15 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/version/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= coverageMain.cpp coverageBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o BamAncillary.o -EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +# _EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o BlockedIntervals.o +# EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= coverageBed @@ -32,13 +32,13 @@ $(BUILT_OBJECTS): $(SOURCES) @echo " * compiling" $(*F).cpp @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) -$(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ +# $(EXT_OBJECTS): +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ +# @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ clean: @echo "Cleaning up." diff --git a/src/coverageBed/coverageBed.cpp b/src/coverageBed/coverageBed.cpp index 220303898b66b349e4b8fbbb412988a9d2dccd9c..dfd3a199ddc84be80a4a22de6ce1428f2e95827e 100644 --- a/src/coverageBed/coverageBed.cpp +++ b/src/coverageBed/coverageBed.cpp @@ -62,7 +62,7 @@ void BedCoverage::CollectCoverageBed() { // split the BED into discrete blocksand process each independently. else { bedVector bedBlocks; - splitBedIntoBlocks(a, bedBlocks); + GetBedBlocks(a, bedBlocks); // use countSplitHits to avoid over-counting each split chunk // as distinct read coverage. _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly); @@ -117,7 +117,7 @@ void BedCoverage::CollectCoverageBam(string bamFile) { bedVector bedBlocks; // since we are counting coverage, we do want to split blocks when a // deletion (D) CIGAR op is encountered (hence the true for the last parm) - getBamBlocks(bam, refs, bedBlocks, true); + GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true); // use countSplitHits to avoid over-counting each split chunk // as distinct read coverage. _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly); diff --git a/src/coverageBed/coverageBed.h b/src/coverageBed/coverageBed.h index 42ad65063877f1f5b1b3ea967db075f4e0690b24..a3b665691df4e2cebd80c4b3568770ecafb93a85 100644 --- a/src/coverageBed/coverageBed.h +++ b/src/coverageBed/coverageBed.h @@ -16,7 +16,7 @@ #include "api/BamReader.h" #include "api/BamAux.h" -#include "BamAncillary.h" +#include "BlockedIntervals.h" using namespace BamTools; #include <vector> diff --git a/src/genomeCoverageBed/Makefile b/src/genomeCoverageBed/Makefile index 3324ec9187eae8412f62a6bd81a0ff0f9cc58081..546d7be7e0e859819cea57c4e3219635582a6dc5 100644 --- a/src/genomeCoverageBed/Makefile +++ b/src/genomeCoverageBed/Makefile @@ -11,7 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/version/ # ---------------------------------- # define our source and object files @@ -37,7 +37,7 @@ $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/genomeFile @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index af5dd25c7f1dc7d9e4f24d474a65e476e578632e..a06ac20425f07b5fe4d254cdebcdf1c96229ae17 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -172,7 +172,7 @@ void BedGenomeCoverage::CoverageBed() { if (_obeySplits == true) { bedVector bedBlocks; // vec to store the discrete BED "blocks" - splitBedIntoBlocks(a, bedBlocks); + GetBedBlocks(a, bedBlocks); AddBlockedCoverage(bedBlocks); } else if (_only_5p_end) { @@ -247,7 +247,7 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { bedVector bedBlocks; // since we are counting coverage, we do want to split blocks when a // deletion (D) CIGAR op is encountered (hence the true for the last parm) - getBamBlocks(bam, refs, bedBlocks, true); + GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true); AddBlockedCoverage(bedBlocks); } else if (_only_5p_end) { diff --git a/src/genomeCoverageBed/genomeCoverageBed.h b/src/genomeCoverageBed/genomeCoverageBed.h index 5e6ef4b48a11e57a4f32166e2c003b9e85ea0b90..f2a877fd097a0a86bdc381e687ebd86151ecda16 100644 --- a/src/genomeCoverageBed/genomeCoverageBed.h +++ b/src/genomeCoverageBed/genomeCoverageBed.h @@ -12,7 +12,7 @@ Licenced under the GNU General Public License 2.0 license. #include "bedFile.h" #include "genomeFile.h" -#include "BamAncillary.h" +#include "BlockedIntervals.h" #include "api/BamReader.h" #include "api/BamAux.h" using namespace BamTools; diff --git a/src/intersectBed/Makefile b/src/intersectBed/Makefile index 357f823e9e0c2189a22c7bfb5c70290ff7ee387f..c96a433e4f9ae22816e01743c35149adfe21104d 100644 --- a/src/intersectBed/Makefile +++ b/src/intersectBed/Makefile @@ -11,7 +11,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/fileType/ \ -I$(UTILITIES_DIR)/BamTools/include \ - -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/BlockedIntervals \ -I$(UTILITIES_DIR)/chromsweep \ -I$(UTILITIES_DIR)/version/ @@ -20,7 +20,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ # ---------------------------------- SOURCES= intersectMain.cpp intersectBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=bedFile.o lineFileUtilities.o BamAncillary.o gzstream.o fileType.o chromsweep.o +_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o gzstream.o fileType.o chromsweep.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= intersectBed @@ -37,7 +37,7 @@ $(EXT_OBJECTS): @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/ diff --git a/src/intersectBed/intersectBed.cpp b/src/intersectBed/intersectBed.cpp index edb3356c5977d09a99264ab2206db1e37e0654c3..7f777ad7f06239fe5c9f68fd2e007b80506a948d 100644 --- a/src/intersectBed/intersectBed.cpp +++ b/src/intersectBed/intersectBed.cpp @@ -17,47 +17,6 @@ Helper functions ************************************/ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { - // // how many overlaps are there b/w the bed and the set of hits? - // CHRPOS s, e; - // int overlapBases; - // int numOverlaps = 0; - // bool hitsFound = false; - // int aLength = (a.end - a.start); // the length of a in b.p. - // - // // loop through the hits and report those that meet the user's criteria - // vector<BED>::const_iterator h = hits.begin(); - // vector<BED>::const_iterator hitsEnd = hits.end(); - // for (; h != hitsEnd; ++h) { - // s = max(a.start, h->start); - // e = min(a.end, h->end); - // overlapBases = (e - s); // the number of overlapping bases b/w a and b - // - // // is there enough overlap relative to the user's request? (default ~ 1bp) - // if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - // // Report the hit if the user doesn't care about reciprocal overlap between A and B. - // if (_reciprocal == false) { - // hitsFound = true; - // numOverlaps++; - // if (_printable == true) - // ReportOverlapDetail(overlapBases, a, *h, s, e); - // } - // // we require there to be sufficient __reciprocal__ overlap - // else { - // int bLength = (h->end - h->start); - // float bOverlap = ( (float) overlapBases / (float) bLength ); - // if (bOverlap >= _overlapFraction) { - // hitsFound = true; - // numOverlaps++; - // if (_printable == true) - // ReportOverlapDetail(overlapBases, a, *h, s, e); - // } - // } - // } - // } - // // report the summary of the overlaps if requested. - // ReportOverlapSummary(a, numOverlaps); - // // were hits found for this BED feature? - // return hitsFound; bool hitsFound = false; if (_printable == true) { vector<BED>::const_iterator h = hits.begin(); @@ -220,7 +179,7 @@ void BedIntersect::IntersectBed() { // split the BED12 into blocks and look for overlaps in each discrete block else { bedVector bedBlocks; // vec to store the discrete BED "blocks" - splitBedIntoBlocks(a,bedBlocks); + GetBedBlocks(a,bedBlocks); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); @@ -322,7 +281,7 @@ void BedIntersect::IntersectBam(string bamFile) { bool overlapFoundForBlock; bedVector bedBlocks; // vec to store the discrete BED "blocks" from a // we don't want to split on "D" ops, hence the "false" - getBamBlocks(bam, refs, bedBlocks, false); + GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); @@ -354,7 +313,7 @@ void BedIntersect::IntersectBam(string bamFile) { // look for overlaps only within each block. else { bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - getBamBlocks(bam, refs, bedBlocks, false); + GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); diff --git a/src/intersectBed/intersectBed.h b/src/intersectBed/intersectBed.h index 65b58ad5308cf9802366db3fe443ffa6ec3f8703..fa079a9ad678bcf14518ab482b2f77f77e894aeb 100644 --- a/src/intersectBed/intersectBed.h +++ b/src/intersectBed/intersectBed.h @@ -17,7 +17,7 @@ #include "api/BamReader.h" #include "api/BamWriter.h" #include "api/BamAux.h" -#include "BamAncillary.h" +#include "BlockedIntervals.h" using namespace BamTools; diff --git a/src/utils/BlockedIntervals/BlockedIntervals.cpp b/src/utils/BlockedIntervals/BlockedIntervals.cpp new file mode 100644 index 0000000000000000000000000000000000000000..abd4221ff0c059791bb7b1b52f2a485cd759e527 --- /dev/null +++ b/src/utils/BlockedIntervals/BlockedIntervals.cpp @@ -0,0 +1,105 @@ +/***************************************************************************** + BlockedIntervals.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "BlockedIntervals.h" + + +void GetBamBlocks(const BamAlignment &bam, + const string &chrom, + vector<BED> &bedBlocks, + bool breakOnDeletionOps) +{ + vector<int> starts; + vector<int> lengths; + starts.push_back(0); + + string strand; + bam.IsReverseStrand() ? strand = "-" : strand = "+"; + CHRPOS currPosition = bam.Position; + int blockLength = 0; + + // Rip through the CIGAR ops and figure out if there is more + // than one block for this alignment + vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); + vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); + for (; cigItr != cigEnd; ++cigItr) { + switch (cigItr->Type) { + case ('M') : + blockLength += cigItr->Length; + case ('I') : break; + case ('S') : break; + case ('D') : + if (!breakOnDeletionOps) + blockLength += cigItr->Length; + else { + bedBlocks.push_back( BED(chrom, currPosition, currPosition + blockLength, + bam.Name, ToString(bam.MapQuality), strand) ); + currPosition += cigItr->Length + blockLength; + blockLength = 0; + } + case ('P') : break; + case ('N') : + bedBlocks.push_back( BED(chrom, currPosition, currPosition + blockLength, + bam.Name, ToString(bam.MapQuality), strand) ); + currPosition += cigItr->Length + blockLength; + blockLength = 0; + case ('H') : break; // for 'H' - do nothing, move to next op + default : + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + bedBlocks.push_back( BED(chrom, currPosition, currPosition + blockLength, + bam.Name, ToString(bam.MapQuality), strand) ); +} + + +void GetBedBlocks(const BED &bed, bedVector &bedBlocks) { + + if (bed.fields.size() < 6) { + cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns." << endl; + exit(1); + } + + int blockCount = atoi(bed.fields[9].c_str()); + if ( blockCount <= 0 ) { + cerr << "Input error: found interval having <= 0 blocks." << endl; + exit(1); + } + else if ( blockCount == 1 ) { + //take a short-cut for single blocks + bedBlocks.push_back(bed); + } + else { + // get the comma-delimited strings for the BED12 block starts and block ends. + string blockSizes(bed.fields[10]); + string blockStarts(bed.fields[11]); + + vector<int> sizes; + vector<int> starts; + Tokenize(blockSizes, sizes, ","); + Tokenize(blockStarts, starts, ","); + + if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { + cerr << "Input error: found interval with block-counts not matching starts/sizes on line." << endl; + exit(1); + } + + // add each BED block to the bedBlocks vector + for (UINT i = 0; i < (UINT) blockCount; ++i) { + CHRPOS blockStart = bed.start + starts[i]; + CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; + BED currBedBlock(bed.chrom, blockStart, blockEnd, + bed.name, bed.score, bed.strand, bed.fields, bed.other_idxs); + bedBlocks.push_back(currBedBlock); + } + } +} \ No newline at end of file diff --git a/src/utils/BlockedIntervals/BlockedIntervals.h b/src/utils/BlockedIntervals/BlockedIntervals.h new file mode 100644 index 0000000000000000000000000000000000000000..b4bae76e70d39ed7d57e45d4e8eb7c66e310751b --- /dev/null +++ b/src/utils/BlockedIntervals/BlockedIntervals.h @@ -0,0 +1,29 @@ +/***************************************************************************** + BlockedIntervals.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#include <vector> +#include "bedFile.h" +#include "api/BamAlignment.h" + +using namespace std; +using namespace BamTools; + +/* + Using the CIGAR string, break a BAM alignment + into discrete alignment blocks. +*/ +void GetBamBlocks(const BamAlignment &bam, + const string &chrom, + vector<BED> &bedBlocks, + bool breakOnDeletionOps = false); + +/* break a BED12 record into discrete BED6 blocks. */ +void GetBedBlocks(const BED &bed, bedVector &bedBlocks); \ No newline at end of file diff --git a/src/utils/BlockedIntervals/Makefile b/src/utils/BlockedIntervals/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3d64f1af4410159ad05786e454d6fb5266b95531 --- /dev/null +++ b/src/utils/BlockedIntervals/Makefile @@ -0,0 +1,35 @@ +OBJ_DIR = ../../../obj/ +BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../ + +INCLUDES = -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= BlockedIntervals.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o fileType.o gzstream.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) -L$(BT_ROOT)/lib + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean \ No newline at end of file diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 27e4a31fedf20dab6c10babeadd56781fa5b0af9..f1000bf12f884d298c619d25af9c96b9b2066c86 100644 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -12,52 +12,6 @@ #include "bedFile.h" -/************************************************ -Helper functions -*************************************************/ -void splitBedIntoBlocks(const BED &bed, bedVector &bedBlocks) { - - if (bed.fields.size() < 6) { - cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns." << endl; - exit(1); - } - - int blockCount = atoi(bed.fields[9].c_str()); - if ( blockCount <= 0 ) { - cerr << "Input error: found interval having <= 0 blocks." << endl; - exit(1); - } - else if ( blockCount == 1 ) { - //take a short-cut for single blocks - bedBlocks.push_back(bed); - } - else { - // get the comma-delimited strings for the BED12 block starts and block ends. - string blockSizes(bed.fields[10]); - string blockStarts(bed.fields[11]); - - vector<int> sizes; - vector<int> starts; - Tokenize(blockSizes, sizes, ","); - Tokenize(blockStarts, starts, ","); - - if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) { - cerr << "Input error: found interval with block-counts not matching starts/sizes on line." << endl; - exit(1); - } - - // add each BED block to the bedBlocks vector - for (UINT i = 0; i < (UINT) blockCount; ++i) { - CHRPOS blockStart = bed.start + starts[i]; - CHRPOS blockEnd = bed.start + starts[i] + sizes[i]; - BED currBedBlock(bed.chrom, blockStart, blockEnd, - bed.name, bed.score, bed.strand, bed.fields, bed.other_idxs); - bedBlocks.push_back(currBedBlock); - } - } -} - - /*********************************************** Sorting comparison functions ************************************************/