From 025efbd175e213e99bc85754a578474b89fa3225 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 7 Nov 2011 19:54:58 -0500 Subject: [PATCH] Added r. layer's cluster algorithm -cluster --- src/multiIntersectBed/intervalItem.h | 17 +- src/multiIntersectBed/multiIntersectBed.cpp | 191 ++++++++++-------- src/multiIntersectBed/multiIntersectBed.h | 16 +- .../multiIntersectBedMain.cpp | 11 +- 4 files changed, 135 insertions(+), 100 deletions(-) diff --git a/src/multiIntersectBed/intervalItem.h b/src/multiIntersectBed/intervalItem.h index a63e0ce4..54b15d6d 100644 --- a/src/multiIntersectBed/intervalItem.h +++ b/src/multiIntersectBed/intervalItem.h @@ -39,7 +39,7 @@ public: coord_type(START), coord(0) {} - + IntervalItem(int _index, COORDINATE_TYPE _type, CHRPOS _coord) : source_index(_index), coord_type(_type), @@ -54,7 +54,20 @@ public: bool operator< ( const IntervalItem& other ) const { - return this->coord > other.coord; + if (this->coord > other.coord) { + return true; + } + else if (this->coord < other.coord) { + return false; + } + else { + if (this->coord_type == END) { + return false; + } + else { + return true; + } + } } }; diff --git a/src/multiIntersectBed/multiIntersectBed.cpp b/src/multiIntersectBed/multiIntersectBed.cpp index e4eb5e10..8058af56 100644 --- a/src/multiIntersectBed/multiIntersectBed.cpp +++ b/src/multiIntersectBed/multiIntersectBed.cpp @@ -58,84 +58,6 @@ MultiIntersectBed::~MultiIntersectBed() { } -void MultiIntersectBed::MultiIntersect() { - OpenFiles(); - - // Add the first interval from each file - for(size_t i = 0;i < input_files.size(); ++i) - LoadNextItem(i); - - // Chromosome loop - once per chromosome - do { - // Find the first chromosome to use - current_chrom = DetermineNextChrom(); - - // Populate the queue with initial values from all files - // (if they belong to the correct chromosome) - for(size_t i = 0; i < input_files.size(); ++i) - AddInterval(i); - - CHRPOS current_start = ConsumeNextCoordinate(); - - // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage - if (print_empty_regions && current_start > 0) - PrintEmptyCoverage(0,current_start); - - // Intervals loop - until all intervals (of current chromosome) from all files are used. - do { - CHRPOS current_end = queue.top().coord; - PrintCoverage(current_start, current_end); - current_start = ConsumeNextCoordinate(); - } while (!queue.empty()); - - // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome - // print a dummy empty coverage - if (print_empty_regions) { - CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); - if (current_start < chrom_size) - PrintEmptyCoverage(current_start, chrom_size); - } - - } while (!AllFilesDone()); -} - - -CHRPOS MultiIntersectBed::ConsumeNextCoordinate() { - assert(!queue.empty()); - - CHRPOS new_position = queue.top().coord; - do { - IntervalItem item = queue.top(); - UpdateInformation(item); - queue.pop(); - } while (!queue.empty() && queue.top().coord == new_position); - - return new_position; -} - - -void MultiIntersectBed::UpdateInformation(const IntervalItem &item) { - // Update the depth coverage for this file - - // Which coordinate is it - start or end? - switch (item.coord_type) - { - case START: - current_depth[item.source_index] = 1; - current_non_zero_inputs++; - break; - case END: - //Read the next interval from this file - AddInterval(item.source_index); - current_depth[item.source_index] = 0; - current_non_zero_inputs--; - break; - default: - assert(0); - } -} - - void MultiIntersectBed::AddInterval(int index) { assert(static_cast<unsigned int>(index) < input_files.size()); @@ -234,12 +156,6 @@ void MultiIntersectBed::LoadNextItem(int index) { BedFile *file = input_files[index]; BED merged_bed; int lineNum = 0; - // - // TO DO: Do the mergeing on the fly. How best to do this? - // - // IDEA: Implement a Merge class with GetNextMerge element. - // - while (file->GetNextMergedBed(merged_bed, lineNum)) { current_item[index] = merged_bed; @@ -291,3 +207,110 @@ void MultiIntersectBed::CloseFiles() { } input_files.clear(); } + + +void MultiIntersectBed::MultiIntersect() { + OpenFiles(); + + // Add the first interval from each file + for(size_t i = 0;i < input_files.size(); ++i) + LoadNextItem(i); + + // Chromosome loop - once per chromosome + IntervalItem curr_point; + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i = 0; i < input_files.size(); ++i) + AddInterval(i); + + CHRPOS prev_start = 0; + do { + // get the current point in the queue + curr_point = queue.top(); + // if we have moved, report the interval + if (curr_point.coord > prev_start) { + PrintCoverage(prev_start, curr_point.coord); + } + // update depending on the type of interval we encountered. + if (curr_point.coord_type == START) { + current_depth[curr_point.source_index] = 1; + current_non_zero_inputs++; + } + else { + //Read the next interval from this file + AddInterval(curr_point.source_index); + current_depth[curr_point.source_index] = 0; + current_non_zero_inputs--; + } + // reset for the next point + prev_start = curr_point.coord; + queue.pop(); + } while (!queue.empty()); + + // want empty regions, and the last coordinate is not the last coordinate of the chromosome + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (curr_point.coord < chrom_size) + PrintEmptyCoverage(curr_point.coord, chrom_size); + } + } while (!AllFilesDone()); +} + + +void MultiIntersectBed::Cluster() { + OpenFiles(); + + // Add the first interval from each file + for(size_t i = 0;i < input_files.size(); ++i) + LoadNextItem(i); + + // Chromosome loop - once per chromosome + IntervalItem curr_point; + short last_direction = 0; + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i = 0; i < input_files.size(); ++i) + AddInterval(i); + + CHRPOS prev_start = 0; + do { + // get the current point in the queue + curr_point = queue.top(); + if (curr_point.coord_type == START) { + current_depth[curr_point.source_index] = 1; + current_non_zero_inputs++; + // reset for the next point + prev_start = curr_point.coord; + last_direction = 1; + } + else { + if (last_direction == 1) { + PrintCoverage(prev_start, curr_point.coord); + } + //Read the next interval from this file + AddInterval(curr_point.source_index); + current_depth[curr_point.source_index] = 0; + current_non_zero_inputs--; + last_direction = -1; + } + queue.pop(); + } while (!queue.empty()); + + // want empty regions, and the last coordinate is not the last coordinate of the chromosome + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (curr_point.coord < chrom_size) + PrintEmptyCoverage(curr_point.coord, chrom_size); + } + } while (!AllFilesDone()); +} + + diff --git a/src/multiIntersectBed/multiIntersectBed.h b/src/multiIntersectBed/multiIntersectBed.h index 1e22954e..b8497f84 100644 --- a/src/multiIntersectBed/multiIntersectBed.h +++ b/src/multiIntersectBed/multiIntersectBed.h @@ -56,6 +56,9 @@ public: // Combines all interval files void MultiIntersect(); + + // + void Cluster(); // Print the header line: chrom/start/end + name of each bedgraph file. void PrintHeader(); @@ -95,19 +98,6 @@ private: */ bool AllFilesDone(); - /* - Extract the next coordinate from the queue, and updates the current coverage information. - If multiple interval share the same coordinate values, all of them are handled. - If an END coordinate is consumed, the next interval (from the corresponding file) is read. - */ - CHRPOS ConsumeNextCoordinate(); - - /* - Updates the coverage information based on the given item. - Item can be a START coordinate or an END coordiante. - */ - void UpdateInformation(const IntervalItem &item); - /* prints chrom/start/end and the current depth coverage values of all the files. */ diff --git a/src/multiIntersectBed/multiIntersectBedMain.cpp b/src/multiIntersectBed/multiIntersectBedMain.cpp index 5248315e..5b4b627b 100644 --- a/src/multiIntersectBed/multiIntersectBedMain.cpp +++ b/src/multiIntersectBed/multiIntersectBedMain.cpp @@ -49,6 +49,7 @@ int main(int argc, char* argv[]) bool haveFiller = true; bool printHeader = false; bool printEmptyRegions = false; + bool cluster = false; bool showHelp = false; string genomeFile; string basePath; @@ -127,6 +128,9 @@ int main(int argc, char* argv[]) else if(PARAMETER_CHECK("-empty", 6, parameterLength)) { printEmptyRegions = true; } + else if(PARAMETER_CHECK("-cluster", 8, parameterLength)) { + cluster = true; + } else if(PARAMETER_CHECK("-examples", 9, parameterLength)) { ShowHelp(); ShowExamples(); @@ -155,7 +159,10 @@ int main(int argc, char* argv[]) MultiIntersectBed mbi(cout, inputFiles, inputTitles, printEmptyRegions, genomeFile, noCoverageValue); if (printHeader) mbi.PrintHeader(); - mbi.MultiIntersect(); + if (!cluster) + mbi.MultiIntersect(); + else + mbi.Cluster(); } void ShowHelp(void) { @@ -173,6 +180,8 @@ void ShowHelp(void) { cerr << "Options: " << endl; + cerr << "\t-cluster\t\t" << "Invoke Ryan's algorithm." << endl << endl; + cerr << "\t-header\t\t" << "Print a header line." << endl; cerr << "\t\t\t(chrom/start/end + names of each file)." << endl << endl; -- GitLab