Skip to content
Snippets Groups Projects
Commit 025efbd1 authored by Aaron's avatar Aaron
Browse files

Added r. layer's cluster algorithm -cluster

parent fae3a320
No related branches found
No related tags found
No related merge requests found
...@@ -39,7 +39,7 @@ public: ...@@ -39,7 +39,7 @@ public:
coord_type(START), coord_type(START),
coord(0) coord(0)
{} {}
IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) : IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) :
source_index(_index), source_index(_index),
coord_type(_type), coord_type(_type),
...@@ -54,7 +54,20 @@ public: ...@@ -54,7 +54,20 @@ public:
bool operator< ( const IntervalItem& other ) const bool operator< ( const IntervalItem& other ) const
{ {
return this->coord > other.coord; if (this->coord > other.coord) {
return true;
}
else if (this->coord < other.coord) {
return false;
}
else {
if (this->coord_type == END) {
return false;
}
else {
return true;
}
}
} }
}; };
......
...@@ -58,84 +58,6 @@ MultiIntersectBed::~MultiIntersectBed() { ...@@ -58,84 +58,6 @@ MultiIntersectBed::~MultiIntersectBed() {
} }
void MultiIntersectBed::MultiIntersect() {
OpenFiles();
// Add the first interval from each file
for(size_t i = 0;i < input_files.size(); ++i)
LoadNextItem(i);
// Chromosome loop - once per chromosome
do {
// Find the first chromosome to use
current_chrom = DetermineNextChrom();
// Populate the queue with initial values from all files
// (if they belong to the correct chromosome)
for(size_t i = 0; i < input_files.size(); ++i)
AddInterval(i);
CHRPOS current_start = ConsumeNextCoordinate();
// User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
if (print_empty_regions && current_start > 0)
PrintEmptyCoverage(0,current_start);
// Intervals loop - until all intervals (of current chromosome) from all files are used.
do {
CHRPOS current_end = queue.top().coord;
PrintCoverage(current_start, current_end);
current_start = ConsumeNextCoordinate();
} while (!queue.empty());
// User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
// print a dummy empty coverage
if (print_empty_regions) {
CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
if (current_start < chrom_size)
PrintEmptyCoverage(current_start, chrom_size);
}
} while (!AllFilesDone());
}
CHRPOS MultiIntersectBed::ConsumeNextCoordinate() {
assert(!queue.empty());
CHRPOS new_position = queue.top().coord;
do {
IntervalItem item = queue.top();
UpdateInformation(item);
queue.pop();
} while (!queue.empty() && queue.top().coord == new_position);
return new_position;
}
void MultiIntersectBed::UpdateInformation(const IntervalItem &item) {
// Update the depth coverage for this file
// Which coordinate is it - start or end?
switch (item.coord_type)
{
case START:
current_depth[item.source_index] = 1;
current_non_zero_inputs++;
break;
case END:
//Read the next interval from this file
AddInterval(item.source_index);
current_depth[item.source_index] = 0;
current_non_zero_inputs--;
break;
default:
assert(0);
}
}
void MultiIntersectBed::AddInterval(int index) { void MultiIntersectBed::AddInterval(int index) {
assert(static_cast<unsigned int>(index) < input_files.size()); assert(static_cast<unsigned int>(index) < input_files.size());
...@@ -234,12 +156,6 @@ void MultiIntersectBed::LoadNextItem(int index) { ...@@ -234,12 +156,6 @@ void MultiIntersectBed::LoadNextItem(int index) {
BedFile *file = input_files[index]; BedFile *file = input_files[index];
BED merged_bed; BED merged_bed;
int lineNum = 0; int lineNum = 0;
//
// TO DO: Do the mergeing on the fly. How best to do this?
//
// IDEA: Implement a Merge class with GetNextMerge element.
//
while (file->GetNextMergedBed(merged_bed, lineNum)) while (file->GetNextMergedBed(merged_bed, lineNum))
{ {
current_item[index] = merged_bed; current_item[index] = merged_bed;
...@@ -291,3 +207,110 @@ void MultiIntersectBed::CloseFiles() { ...@@ -291,3 +207,110 @@ void MultiIntersectBed::CloseFiles() {
} }
input_files.clear(); input_files.clear();
} }
void MultiIntersectBed::MultiIntersect() {
OpenFiles();
// Add the first interval from each file
for(size_t i = 0;i < input_files.size(); ++i)
LoadNextItem(i);
// Chromosome loop - once per chromosome
IntervalItem curr_point;
do {
// Find the first chromosome to use
current_chrom = DetermineNextChrom();
// Populate the queue with initial values from all files
// (if they belong to the correct chromosome)
for(size_t i = 0; i < input_files.size(); ++i)
AddInterval(i);
CHRPOS prev_start = 0;
do {
// get the current point in the queue
curr_point = queue.top();
// if we have moved, report the interval
if (curr_point.coord > prev_start) {
PrintCoverage(prev_start, curr_point.coord);
}
// update depending on the type of interval we encountered.
if (curr_point.coord_type == START) {
current_depth[curr_point.source_index] = 1;
current_non_zero_inputs++;
}
else {
//Read the next interval from this file
AddInterval(curr_point.source_index);
current_depth[curr_point.source_index] = 0;
current_non_zero_inputs--;
}
// reset for the next point
prev_start = curr_point.coord;
queue.pop();
} while (!queue.empty());
// want empty regions, and the last coordinate is not the last coordinate of the chromosome
if (print_empty_regions) {
CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
if (curr_point.coord < chrom_size)
PrintEmptyCoverage(curr_point.coord, chrom_size);
}
} while (!AllFilesDone());
}
void MultiIntersectBed::Cluster() {
OpenFiles();
// Add the first interval from each file
for(size_t i = 0;i < input_files.size(); ++i)
LoadNextItem(i);
// Chromosome loop - once per chromosome
IntervalItem curr_point;
short last_direction = 0;
do {
// Find the first chromosome to use
current_chrom = DetermineNextChrom();
// Populate the queue with initial values from all files
// (if they belong to the correct chromosome)
for(size_t i = 0; i < input_files.size(); ++i)
AddInterval(i);
CHRPOS prev_start = 0;
do {
// get the current point in the queue
curr_point = queue.top();
if (curr_point.coord_type == START) {
current_depth[curr_point.source_index] = 1;
current_non_zero_inputs++;
// reset for the next point
prev_start = curr_point.coord;
last_direction = 1;
}
else {
if (last_direction == 1) {
PrintCoverage(prev_start, curr_point.coord);
}
//Read the next interval from this file
AddInterval(curr_point.source_index);
current_depth[curr_point.source_index] = 0;
current_non_zero_inputs--;
last_direction = -1;
}
queue.pop();
} while (!queue.empty());
// want empty regions, and the last coordinate is not the last coordinate of the chromosome
if (print_empty_regions) {
CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
if (curr_point.coord < chrom_size)
PrintEmptyCoverage(curr_point.coord, chrom_size);
}
} while (!AllFilesDone());
}
...@@ -56,6 +56,9 @@ public: ...@@ -56,6 +56,9 @@ public:
// Combines all interval files // Combines all interval files
void MultiIntersect(); void MultiIntersect();
//
void Cluster();
// Print the header line: chrom/start/end + name of each bedgraph file. // Print the header line: chrom/start/end + name of each bedgraph file.
void PrintHeader(); void PrintHeader();
...@@ -95,19 +98,6 @@ private: ...@@ -95,19 +98,6 @@ private:
*/ */
bool AllFilesDone(); bool AllFilesDone();
/*
Extract the next coordinate from the queue, and updates the current coverage information.
If multiple interval share the same coordinate values, all of them are handled.
If an END coordinate is consumed, the next interval (from the corresponding file) is read.
*/
CHRPOS ConsumeNextCoordinate();
/*
Updates the coverage information based on the given item.
Item can be a START coordinate or an END coordiante.
*/
void UpdateInformation(const IntervalItem &item);
/* /*
prints chrom/start/end and the current depth coverage values of all the files. prints chrom/start/end and the current depth coverage values of all the files.
*/ */
......
...@@ -49,6 +49,7 @@ int main(int argc, char* argv[]) ...@@ -49,6 +49,7 @@ int main(int argc, char* argv[])
bool haveFiller = true; bool haveFiller = true;
bool printHeader = false; bool printHeader = false;
bool printEmptyRegions = false; bool printEmptyRegions = false;
bool cluster = false;
bool showHelp = false; bool showHelp = false;
string genomeFile; string genomeFile;
string basePath; string basePath;
...@@ -127,6 +128,9 @@ int main(int argc, char* argv[]) ...@@ -127,6 +128,9 @@ int main(int argc, char* argv[])
else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { else if(PARAMETER_CHECK("-empty", 6, parameterLength)) {
printEmptyRegions = true; printEmptyRegions = true;
} }
else if(PARAMETER_CHECK("-cluster", 8, parameterLength)) {
cluster = true;
}
else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { else if(PARAMETER_CHECK("-examples", 9, parameterLength)) {
ShowHelp(); ShowHelp();
ShowExamples(); ShowExamples();
...@@ -155,7 +159,10 @@ int main(int argc, char* argv[]) ...@@ -155,7 +159,10 @@ int main(int argc, char* argv[])
MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue);
if (printHeader) if (printHeader)
mbi.PrintHeader(); mbi.PrintHeader();
mbi.MultiIntersect(); if (!cluster)
mbi.MultiIntersect();
else
mbi.Cluster();
} }
void ShowHelp(void) { void ShowHelp(void) {
...@@ -173,6 +180,8 @@ void ShowHelp(void) { ...@@ -173,6 +180,8 @@ void ShowHelp(void) {
cerr << "Options: " << endl; cerr << "Options: " << endl;
cerr << "\t-cluster\t\t" << "Invoke Ryan's algorithm." << endl << endl;
cerr << "\t-header\t\t" << "Print a header line." << endl; cerr << "\t-header\t\t" << "Print a header line." << endl;
cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment