Skip to content
Snippets Groups Projects
Commit cf64c2f2 authored by Aaron's avatar Aaron
Browse files

Fixed issue #55: mergeBed now reports 1-based starts for GFF files. Also,

greatly simplified the mergeBed algorithm.  MUCH more elegant now.
parent 2507d4b0
No related branches found
Tags v0.1.2
No related merge requests found
......@@ -13,13 +13,19 @@
#include "mergeBed.h"
void ReportMergedNames(const map<string, bool> &names) {
unsigned int n = 0;
printf("\t");
map<string, bool>::const_iterator nameItr = names.begin();
map<string, bool>::const_iterator nameEnd = names.end();
for (; nameItr != nameEnd; ++nameItr) {
if (n < (names.size() - 1)) {cout << nameItr->first << ";";}
else {cout << nameItr->first;}
if (n < (names.size() - 1)) {
cout << nameItr->first << ";";
}
else {
cout << nameItr->first;
}
n++;
}
}
......@@ -51,6 +57,46 @@ BedMerge::~BedMerge(void) {
}
// ===============================================
// Convenience method for reporting merged blocks
// ================================================
void BedMerge::Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount) {
if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
if (_numEntries == false && _reportNames == false) {
printf("\n");
}
else if (_numEntries) {
printf("\t%d\n", mergeCount);
}
else if (_reportNames) {
ReportMergedNames(names);
printf("\n");
}
}
// =========================================================
// Convenience method for reporting merged blocks by strand
// =========================================================
void BedMerge::ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand) {
if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
if (_numEntries == false && _reportNames == false) {
printf("\t%s\n", strand.c_str());
}
else if (_numEntries) {
printf("\t%d\t%s\n", mergeCount, strand.c_str());
}
else if (_reportNames) {
ReportMergedNames(names);
printf("\t%s\n", strand.c_str());
}
}
// =====================================================
// = Merge overlapping BED entries into a single entry =
// =====================================================
......@@ -61,110 +107,41 @@ void BedMerge::MergeBed() {
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) {
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1;
map<string, bool> names;
// loop through the BED entries for this chromosome
// and look for overlaps
for (curr = 0; curr < bedList.size(); ++curr) {
// make sure prev points to an actual element
if (prev < 0) {
prev = curr;
continue;
}
// Is there an overlap between the current and previous entries?
if ( overlaps(bedList[prev].start, bedList[prev].end,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap befor the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl;
}
}
else {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
if ((int) bedItr->start > end) {
if (start >= 0) {
Report(chrom, start, end, names, mergeCount);
// reset
mergeCount = 1;
names.clear();
}
// reset things for the next overlapping "block"
OIP = false;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << endl;
start = bedItr->start;
end = bedItr->end;
names[bedItr->name] = true;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << endl;
end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
}
}
else {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << endl;
}
else {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << endl;
}
if (start >= 0) {
Report(chrom, start, end, names, mergeCount);
}
}
}
......@@ -183,7 +160,9 @@ void BedMerge::MergeBedStranded() {
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
vector<BED> bedList = m->second;
// make a list of the two strands to merge separately.
......@@ -194,120 +173,41 @@ void BedMerge::MergeBedStranded() {
// do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) {
CHRPOS minStart = INT_MAX;
CHRPOS maxEnd = 0;
bool OIP = false; // OIP = Overlap In Progress. Lame, I realize.
int prev = -1;
unsigned int curr = 0;
int mergeCount = 1;
int numOnStrand = 0;
map<string, bool> names;
// loop through the BED entries for this chromosome
// and look for overlaps
for (curr = 0; curr < bedList.size(); ++curr) {
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = bedList.begin();
vector<BED>::const_iterator bedEnd = bedList.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit
// is not on the current strand.
if (bedList[curr].strand != strands[s]) {
continue; // continue force the next iteration of the for loop.
}
else {
numOnStrand++;
}
// make sure prev points to an actual element on the
// current strand
if (prev < 0) {
if (bedList[curr].strand == strands[s]) {
prev = curr;
}
continue;
}
if ( overlaps(bedList[prev].start, bedList[prev].end,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
OIP = true;
mergeCount++;
minStart = min(bedList[prev].start, min(minStart, bedList[curr].start));
maxEnd = max(bedList[prev].end, max(maxEnd, bedList[curr].end));
names[bedList[prev].name] = true;
names[bedList[curr].name] = true;
}
else if ( overlaps(minStart, maxEnd,
bedList[curr].start, bedList[curr].end) >= _maxDistance) {
mergeCount++;
minStart = min(minStart, bedList[curr].start);
maxEnd = max(maxEnd, bedList[curr].end);
names[bedList[curr].name] = true;
}
else {
// was there an overlap before the current entry broke it?
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
}
}
else {
if ((_numEntries) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
if (bedItr->strand != strands[s]) { continue; }
else { numOnStrand++; }
if ((int) bedItr->start > end) {
if (start >= 0) {
ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
// reset
mergeCount = 1;
names.clear();
}
// reset things for the next overlapping "block"
OIP = false;
mergeCount = 1;
minStart = INT_MAX;
maxEnd = 0;
names.clear();
// add the name of the current element in prep for the next block
names[bedList[curr].name] = true;
}
prev = curr;
}
// clean up based on the last entry for the current chromosome
if (OIP) {
if (_numEntries) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if (_reportNames) {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t";
ReportMergedNames(names);
cout << "\t" << strands[s] << endl;
start = bedItr->start;
end = bedItr->end;
names[bedItr->name] = true;
}
else {
cout << bedList[prev].chrom << "\t" << minStart << "\t" << maxEnd << "\t" << strands[s] << endl;
end = bedItr->end;
mergeCount++;
names[bedItr->name] = true;
}
}
else {
if ((_numEntries) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << mergeCount << "\t" << strands[s] << endl;
}
else if ((_reportNames) && (numOnStrand > 0)) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << bedList[prev].name << "\t" << strands[s] << endl;
}
else if (numOnStrand > 0) {
cout << bedList[prev].chrom << "\t" << bedList[prev].start << "\t" << bedList[prev].end << "\t" << strands[s] << endl;
}
if (start >= 0) {
ReportStranded(chrom, start, end, names, mergeCount, strands[s]);
}
}
}
......
......@@ -47,4 +47,6 @@ private:
// instance of a bed file class.
BedFile *_bed;
void Report(string chrom, int start, int end, const map<string, bool> &names, int mergeCount);
void ReportStranded(string chrom, int start, int end, const map<string, bool> &names, int mergeCount, string strand);
};
......@@ -466,17 +466,12 @@ void BedFile::countListHits(const BED &a, int index, bool forceStrand) {
}
}
void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }
void BedFile::setGff (bool gff) {
if (gff == true) this->_isGff = true;
else this->_isGff = false;
}
void BedFile::setGff (bool gff) { this->_isGff = gff; }
void BedFile::setVcf (bool vcf) {
if (vcf == true) this->_isVcf = true;
else this->_isVcf = false;
}
void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }
void BedFile::setFileType (FileType type) {
......
......@@ -396,6 +396,7 @@ public:
string bedFile;
unsigned int bedType; // 3-6, 12 for BED
// 9 for GFF
bool isZeroBased;
// Main data structires used by BEDTools
masterBedCovMap bedCovMap;
......@@ -412,6 +413,7 @@ private:
FileType _fileType; // what is the file type? (BED? GFF? VCF?)
istream *_bedStream;
void setZeroBased(bool zeroBased);
void setGff (bool isGff);
void setVcf (bool isVcf);
void setFileType (FileType type);
......@@ -459,6 +461,7 @@ private:
// it's BED format if columns 2 and 3 are integers
if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
setGff(false);
setZeroBased(true);
setFileType(BED_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......@@ -467,6 +470,7 @@ private:
else if (isInteger(lineVector[1]) && numFields >= 8) {
setGff(false);
setVcf(true);
setZeroBased(false);
setFileType(VCF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......@@ -474,6 +478,7 @@ private:
// it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total.
else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
setGff(true);
setZeroBased(false);
setFileType(GFF_FILETYPE);
setBedType(numFields); // we now expect numFields columns in each line
if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment