From 2eeee4c39052ae960bc433152d6549e13798f7da Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 5 Nov 2012 06:48:10 -0500 Subject: [PATCH] [OMM] add reldist source... --- src/reldist/Makefile | 39 +++++++++ src/reldist/reldist.cpp | 154 ++++++++++++++++++++++++++++++++++++ src/reldist/reldist.h | 74 +++++++++++++++++ src/reldist/reldistMain.cpp | 136 +++++++++++++++++++++++++++++++ 4 files changed, 403 insertions(+) create mode 100644 src/reldist/Makefile create mode 100644 src/reldist/reldist.cpp create mode 100644 src/reldist/reldist.h create mode 100644 src/reldist/reldistMain.cpp diff --git a/src/reldist/Makefile b/src/reldist/Makefile new file mode 100644 index 00000000..6b7f9410 --- /dev/null +++ b/src/reldist/Makefile @@ -0,0 +1,39 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BlockedIntervals \ + -I$(UTILITIES_DIR)/BamTools-Ancillary \ + -I$(UTILITIES_DIR)/chromsweep \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= reldistMain.cpp reldist.cpp reldist.h +OBJECTS= reldistMain.o reldist.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= reldist + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/reldistMain.o $(OBJ_DIR)/reldist.o + +.PHONY: clean diff --git a/src/reldist/reldist.cpp b/src/reldist/reldist.cpp new file mode 100644 index 00000000..699d857c --- /dev/null +++ b/src/reldist/reldist.cpp @@ -0,0 +1,154 @@ +/***************************************************************************** + reldist.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "reldist.h" + +/* + Constructor +*/ +RelativeDistance::RelativeDistance(string bedAFile, + string bedBFile, + bool summary) +{ + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _summary = summary; + _tot_queries = 0; + CalculateRelativeDistance(); +} + + +/* + Destructor +*/ +RelativeDistance::~RelativeDistance(void) { +} + + +void RelativeDistance::LoadMidpoints() { + + _bedB = new BedFile(_bedBFile); + + BED bed; + _bedB->Open(); + while (_bedB->GetNextBed(bed)) { + CHRPOS midpoint = (int) (bed.end + bed.start) / 2; + _db_midpoints[bed.chrom].push_back(midpoint); + } + + map<string, vector<CHRPOS> >::const_iterator midItr = _db_midpoints.begin(); + map<string, vector<CHRPOS> >::const_iterator midEnd = _db_midpoints.end(); + for (; midItr != midEnd; ++midItr) + { + sort(_db_midpoints[midItr->first].begin(), + _db_midpoints[midItr->first].end()); + } +} + + +void RelativeDistance::ReportDistanceSummary() +{ + cout << "reldist\t" + << "count\t" + << "total\t" + << "fraction\n"; + + map<float, size_t>::const_iterator freqItr = _reldists.begin(); + map<float, size_t>::const_iterator freqEnd = _reldists.end(); + for (; freqItr != freqEnd; ++freqItr) + { + printf("%.2f\t%lu\t%lu\t%.3f\n", + freqItr->first, + freqItr->second, + _tot_queries, + (float) freqItr->second / (float) _tot_queries); + } +} + + +void RelativeDistance::UpdateDistanceSummary(float rel_dist) +{ + _tot_queries++; + // round the relative distance to two decimal places. + float rounded_rel_dist = floorf(rel_dist * 100) / 100; + _reldists[rounded_rel_dist]++; +} + + +void RelativeDistance::CalculateRelativeDistance() +{ + LoadMidpoints(); + + vector<CHRPOS>::iterator low; + size_t low_idx, high_idx; + float rel_dist; + + _bedA = new BedFile(_bedAFile); + + BED bed; + _bedA->Open(); + while (_bedA->GetNextBed(bed)) { + + if (_bedA->_status != BED_VALID) + continue; + + vector<CHRPOS> *chrom_mids = &_db_midpoints[bed.chrom]; + // binary search the current query's midpoint among + // the database midpoints + int midpoint = (int) (bed.end + bed.start) / 2; + low = lower_bound(chrom_mids->begin(), + chrom_mids->end(), + midpoint); + + // grab the indicies for the database midpoints that are left and + // right of the query's midpoint. + low_idx = low - chrom_mids->begin() - 1; + high_idx = low_idx + 1; + + // make sure we don't run off the boundaries of the database's + // midpoint vector + if (low_idx != chrom_mids->size() - 1) + { + // grab the database midpoints that are left and right of + // the query's midpoint. + int left = (*chrom_mids)[low_idx]; + int right = (*chrom_mids)[high_idx]; + + // ? + if (left > midpoint) + continue; + + // calculate the relative distance between the query's midpoint + // and the two nearest database midpoints. + size_t left_dist = abs(midpoint-left); + size_t right_dist = abs(midpoint-right); + rel_dist = (float) min(left_dist, right_dist) + / + (float) (right-left); + + if (!_summary) + { + _bedA->reportBedTab(bed); + printf("%.3f\n", rel_dist); + } + else { + UpdateDistanceSummary(rel_dist); + } + } + } + + // report the "histogram" of distances. + if (_summary) + ReportDistanceSummary(); +} + + diff --git a/src/reldist/reldist.h b/src/reldist/reldist.h new file mode 100644 index 00000000..a1ea421a --- /dev/null +++ b/src/reldist/reldist.h @@ -0,0 +1,74 @@ +/***************************************************************************** + reldist.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef RELDIST_H +#define RELDIST_H + +#include "bedFile.h" +#include "chromsweep.h" +#include "api/BamReader.h" +#include "api/BamAux.h" +#include "BlockedIntervals.h" +#include "BamAncillary.h" +using namespace BamTools; + + +#include <vector> +#include <algorithm> +#include <iostream> +#include <fstream> +#include <stdlib.h> +#include <math.h> +using namespace std; + + + +class RelativeDistance { + +public: + + // constructor + RelativeDistance(string bedAFile, + string bedBFile, + bool _summary); + + // destructor + ~RelativeDistance(void); + +private: + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + string _bedAFile; + string _bedBFile; + + map<string, vector<CHRPOS> > _db_midpoints; + map<float, size_t> _reldists; + size_t _tot_queries; + + // instance of a bed file class. + BedFile *_bedA, *_bedB; + bool _summary; + + //------------------------------------------------ + // private methods + //------------------------------------------------ + void LoadMidpoints(); + void CalculateRelativeDistance(); + void UpdateDistanceSummary(float rel_dist); + void ReportDistanceSummary(); + + + +}; + +#endif /* RELDIST_H */ diff --git a/src/reldist/reldistMain.cpp b/src/reldist/reldistMain.cpp new file mode 100644 index 00000000..82a178ad --- /dev/null +++ b/src/reldist/reldistMain.cpp @@ -0,0 +1,136 @@ +/***************************************************************************** + reldistMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "reldist.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools reldist" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void reldist_help(void); + +int reldist_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedAFile; + string bedBFile; + bool summary = true; + + // input arguments + bool haveBedA = false; + bool haveBedB = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) reldist_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-a", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedA = true; + bedAFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-b", 2, parameterLength)) { + if ((i+1) < argc) { + haveBedB = true; + bedBFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-detail", 7, parameterLength)) { + summary = false; + } + else { + cerr << endl + << "*****ERROR: Unrecognized parameter: " + << argv[i] + << " *****" + << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBedA || !haveBedB) { + cerr << endl + << "*****" + << endl + << "*****ERROR: Need -a and -b files. " + << endl + << "*****" + << endl; + showHelp = true; + } + + + if (!showHelp) { + + RelativeDistance *rd = new RelativeDistance(bedAFile, + bedBFile, + summary); + delete rd; + return 0; + } + else { + reldist_help(); + return 0; + } +} + +void reldist_help(void) { + + cerr << "\nTool: bedtools reldist" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Calculate the relative distance distribution " + << "b/w two feature files." + << endl << endl; + + cerr << "Usage: " + << PROGRAM_NAME + << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" + << endl << endl; + + cerr << "Options: " << endl; + + + cerr << "\t-detail\t" << "Instead of a summary, report the relative" + << "\t\t distance for each interval in A" + << endl << endl; + // end the program here + exit(1); + +} -- GitLab