Commit cee8949d authored by Aaron's avatar Aaron
Browse files

refactored intersect; better -bed support for split BAM alignments

parent 49071c57
......@@ -12,6 +12,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/chromsweep \
-I$(UTILITIES_DIR)/version/
......@@ -20,7 +21,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
# ----------------------------------
SOURCES= intersectMain.cpp intersectBed.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o gzstream.o fileType.o chromsweep.o
_EXT_OBJECTS=bedFile.o lineFileUtilities.o BlockedIntervals.o BamAncillary.o gzstream.o fileType.o chromsweep.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= intersectBed
......@@ -38,6 +39,7 @@ $(EXT_OBJECTS):
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BlockedIntervals/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/
@$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/chromsweep/
......
......@@ -16,7 +16,6 @@
Helper functions
************************************/
bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
bool hitsFound = false;
if (_printable == true) {
vector<BED>::const_iterator h = hits.begin();
......@@ -96,17 +95,8 @@ bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
}
bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) {
// find the hits that overlap with the full span of the blocked BED
_bedB->allHits(a.chrom, a.start, a.end, a.strand,
hits, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
// break a into discrete blocks, as we need to
// measure overlap with the individual blocks, not the full span.
bedVector a_blocks;
GetBedBlocks(a, a_blocks);
bool BedIntersect::FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks,
const vector<BED> &hits, bool a_is_bam) {
int a_footprint = GetTotalBlockLength(a_blocks);
// container to store the set of raw hits
// that actually overlap the A blocks
......@@ -159,7 +149,11 @@ bool BedIntersect::FindBlockedOverlaps(const BED &a, vector<BED> &hits) {
}
}
}
return processHits(a, valid_hits);
if (!a_is_bam) {
return processHits(a, valid_hits);
}
else
return !valid_hits.empty();
}
......@@ -242,8 +236,19 @@ void BedIntersect::IntersectBed() {
if (_obeySplits == false)
FindOverlaps(a, hits);
// split the BED12 into blocks and look for overlaps in each discrete block
else
FindBlockedOverlaps(a, hits);
else {
// find the hits that overlap with the full span of the blocked BED
_bedB->allHits(a.chrom, a.start, a.end, a.strand,
hits, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
// break a into discrete blocks, as we need to
// measure overlap with the individual blocks, not the full span.
bedVector a_blocks;
GetBedBlocks(a, a_blocks);
// find the overlaps between the block in A and B
// last parm is false as a is not a BAM entry
FindBlockedOverlaps(a, a_blocks, hits, false);
}
hits.clear();
}
}
......@@ -261,8 +266,11 @@ void BedIntersect::IntersectBed() {
while (sweep.Next(hit_set)) {
if (_obeySplits == false)
processHits(hit_set.first, hit_set.second);
else
FindBlockedOverlaps(hit_set.first, hit_set.second);
else {
bedVector a_blocks;
GetBedBlocks(hit_set.first, a_blocks);
FindBlockedOverlaps(hit_set.first, a_blocks, hit_set.second, false);
}
}
}
}
......@@ -279,19 +287,15 @@ void BedIntersect::IntersectBam(string bamFile) {
// using BAM output.
if (_bamOutput == false) {
_bedA = new BedFile(_bedAFile);
_bedA->bedType = 6;
_bedA->bedType = 12;
}
// open the BAM file
BamReader reader;
BamWriter writer;
reader.Open(bamFile);
// get header & reference information
string bamHeader = reader.GetHeaderText();
RefVector refs = reader.GetReferenceData();
// open a BAM output to stdout if we are writing BAM
if (_bamOutput == true) {
// set compression mode
......@@ -301,93 +305,55 @@ void BedIntersect::IntersectBam(string bamFile) {
// open our BAM writer
writer.Open("stdout", bamHeader, refs);
}
vector<BED> hits;
// reserve some space
hits.reserve(100);
BamAlignment bam;
// get each set of alignments for each pair.
while (reader.GetNextAlignment(bam)) {
if (bam.IsMapped()) {
BED a;
a.chrom = refs.at(bam.RefID).RefName;
a.start = bam.Position;
a.end = bam.GetEndPosition(false, false);
// build the name field from the BAM alignment.
a.name = bam.Name;
if (bam.IsFirstMate()) a.name += "/1";
if (bam.IsSecondMate()) a.name += "/2";
a.score = ToString(bam.MapQuality);
a.strand = "+";
if (bam.IsReverseStrand()) a.strand = "-";
if (_bamOutput == true) {
bool overlapsFound = false;
// treat the BAM alignment as a single "block"
if (_obeySplits == false) {
overlapsFound = _bedB->anyHits(a.chrom, a.start, a.end,
a.strand, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
}
// split the BAM alignment into discrete blocks and
// look for overlaps only within each block.
else {
bool overlapFoundForBlock;
bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
// we don't want to split on "D" ops, hence the "false"
GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks);
vector<BED>::const_iterator bedItr = bedBlocks.begin();
vector<BED>::const_iterator bedEnd = bedBlocks.end();
for (; bedItr != bedEnd; ++bedItr) {
overlapFoundForBlock = _bedB->anyHits(bedItr->chrom, bedItr->start, bedItr->end,
bedItr->strand, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
if (overlapFoundForBlock == true)
overlapsFound = true;
}
}
if (overlapsFound == true) {
if (_noHit == false)
writer.SaveAlignment(bam);
}
else {
if (_noHit == true) {
writer.SaveAlignment(bam);
}
}
}
else {
// treat the BAM alignment as a single BED "block"
if (_obeySplits == false) {
FindOverlaps(a, hits);
hits.clear();
}
// split the BAM alignment into discrete BED blocks and
// look for overlaps only within each block.
else {
bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks);
vector<BED>::const_iterator bedItr = bedBlocks.begin();
vector<BED>::const_iterator bedEnd = bedBlocks.end();
for (; bedItr != bedEnd; ++bedItr) {
FindOverlaps(*bedItr, hits);
hits.clear();
}
}
}
}
// BAM IsMapped() is false
else if (_noHit == true) {
if ((!bam.IsMapped()) && (_noHit == true))
writer.SaveAlignment(bam);
else if (!bam.IsMapped())
continue;
// break alignment into discrete blocks,
bedVector bed_blocks;
string chrom = refs.at(bam.RefID).RefName;
GetBamBlocks(bam, chrom, bed_blocks);
// create a basic BED entry from the BAM alignment
BED bed;
MakeBedFromBam(bam, chrom, bed_blocks, bed);
bool overlapsFound = false;
if ((_bamOutput == true) && (_obeySplits == false))
{
overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end,
bed.strand, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
}
else if ( ((_bamOutput == true) && (_obeySplits == true)) ||
((_bamOutput == false) && (_obeySplits == true)) )
{
// find the hits that overlap with the full span of the blocked BED
_bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
hits, _sameStrand, _diffStrand,
_overlapFraction, _reciprocal);
// find the overlaps between the block in A and B
overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
}
else if ((_bamOutput == false) && (_obeySplits == false))
{
FindOverlaps(bed, hits);
}
// save the BAM alignment if overlap reqs. were met
if (_bamOutput == true) {
if ((overlapsFound == true) && (_noHit == false))
writer.SaveAlignment(bam);
else if ((overlapsFound == false) && (_noHit == true))
writer.SaveAlignment(bam);
}
hits.clear();
}
// close the relevant BAM files.
......
......@@ -18,6 +18,7 @@
#include "api/BamWriter.h"
#include "api/BamAux.h"
#include "BlockedIntervals.h"
#include "BamAncillary.h"
using namespace BamTools;
......@@ -87,11 +88,12 @@ private:
bool processHits(const BED &a, const vector<BED> &hits);
bool FindOverlaps(const BED &a, vector<BED> &hits);
bool FindBlockedOverlaps(const BED &a, vector<BED> &hits);
bool FindBlockedOverlaps(const BED &a, const vector<BED> &a_blocks,
const vector<BED> &hits, bool a_is_bam);
void ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e);
void ReportOverlapSummary(const BED &a, const int &numOverlapsFound);
};
......
......@@ -73,4 +73,43 @@ namespace BamTools {
blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) );
}
}
void MakeBedFromBam(const BamAlignment &bam, const string &chrom,
const bedVector &blocks, BED &bed)
{
bed.chrom = chrom;
bed.start = bam.Position;
bed.end = bam.GetEndPosition(false, false);
// build the name field from the BAM alignment.
bed.name = bam.Name;
if (bam.IsFirstMate()) bed.name += "/1";
else if (bam.IsSecondMate()) bed.name += "/2";
bed.score = ToString(bam.MapQuality);
bed.strand = "+";
if (bam.IsReverseStrand()) bed.strand = "-";
bed.fields.push_back(bed.chrom);
bed.fields.push_back(ToString(bed.start));
bed.fields.push_back(ToString(bed.end));
bed.fields.push_back(bed.name);
bed.fields.push_back(bed.score);
bed.fields.push_back(bed.strand);
bed.fields.push_back(ToString(bam.Position));
bed.fields.push_back(ToString(bed.end));
bed.fields.push_back("0,0,0");
bed.fields.push_back(ToString(blocks.size()));
ostringstream block_lengths, block_starts;
for (size_t i = 0; i < blocks.size(); ++i) {
block_lengths << blocks[i].end - blocks[i].start << ",";
block_starts << blocks[i].start - bam.Position << ",";
}
bed.fields.push_back(block_lengths.str());
bed.fields.push_back(block_starts.str());
// identify which indices relate to the "other"
// (i,e. non-BED6) fields
for (size_t i = 5; i <= 11; ++i)
bed.other_idxs.push_back(i);
}
}
......@@ -16,4 +16,7 @@
namespace BamTools {
void getBamBlocks(const BamAlignment &bam, const RefVector &refs,
vector<BED> &blocks, bool includeDeletions = true);
void MakeBedFromBam(const BamAlignment &bam, const string &chrom,
const bedVector &blocks, BED &bed);
}
......@@ -14,7 +14,7 @@
void GetBamBlocks(const BamAlignment &bam,
const string &chrom,
vector<BED> &bedBlocks,
bedVector &bedBlocks,
bool breakOnDeletionOps)
{
vector<int> starts;
......
......@@ -22,7 +22,7 @@ using namespace BamTools;
*/
void GetBamBlocks(const BamAlignment &bam,
const string &chrom,
vector<BED> &bedBlocks,
bedVector &bedBlocks,
bool breakOnDeletionOps = false);
/* break a BED12 record into discrete BED6 blocks. */
......
......@@ -798,7 +798,6 @@ public:
*/
template <typename T>
inline void reportBedTab(const T &bed) {
// if it is azeroLength feature, we need to
// correct the start and end coords to what they were
// in the original file
......@@ -888,7 +887,6 @@ public:
start++;
end--;
}
//BED
if (_isGff == false && _isVcf == false) {
if (this->bedType == 3) {
......@@ -960,7 +958,6 @@ public:
*/
template <typename T>
inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
// if it is azeroLength feature, we need to
// correct the start and end coords to what they were
// in the original file
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment