From a3c9136d22a8edcc58cec818c8452494887b132e Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Sat, 11 Feb 2012 22:14:31 -0500 Subject: [PATCH] new tool: bedtools random --- Makefile | 1 + scripts/makeBashScripts.py | 1 + src/bedtools.cpp | 3 + src/randomBed/Makefile | 32 +++++++ src/randomBed/randomBed.cpp | 78 ++++++++++++++++ src/randomBed/randomBed.h | 55 ++++++++++++ src/randomBed/randomBedMain.cpp | 152 ++++++++++++++++++++++++++++++++ 7 files changed, 322 insertions(+) create mode 100644 src/randomBed/Makefile create mode 100644 src/randomBed/randomBed.cpp create mode 100644 src/randomBed/randomBed.h create mode 100644 src/randomBed/randomBedMain.cpp diff --git a/Makefile b/Makefile index a16294af..3e674de5 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/nucBed \ $(SRC_DIR)/pairToBed \ $(SRC_DIR)/pairToPair \ + $(SRC_DIR)/randomBed \ $(SRC_DIR)/shuffleBed \ $(SRC_DIR)/slopBed \ $(SRC_DIR)/sortBed \ diff --git a/scripts/makeBashScripts.py b/scripts/makeBashScripts.py index 45647106..77d2c48c 100644 --- a/scripts/makeBashScripts.py +++ b/scripts/makeBashScripts.py @@ -34,6 +34,7 @@ def main(): 'overlap': 'getOverlap', 'pairtobed': 'pairToBed', 'pairtopair': 'pairToPair', + 'random': 'randomBed', 'shuffle': 'shuffleBed', 'slop': 'slopBed', 'sort': 'sortBed', diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 8c1adbeb..765db2a0 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -59,6 +59,7 @@ int multiintersect_main(int argc, char* argv[]);// int nuc_main(int argc, char* argv[]);// int pairtobed_main(int argc, char* argv[]);// int pairtopair_main(int argc, char* argv[]);// +int random_main(int argc, char* argv[]); // int shuffle_main(int argc, char* argv[]); // int slop_main(int argc, char* argv[]); // int sort_main(int argc, char* argv[]); // @@ -92,6 +93,7 @@ int main(int argc, char *argv[]) else if (sub_cmd == "slop") return slop_main(argc-1, argv+1); else if (sub_cmd == "flank") return flank_main(argc-1, argv+1); else if (sub_cmd == "sort") return sort_main(argc-1, argv+1); + else if (sub_cmd == "random") return random_main(argc-1, argv+1); else if (sub_cmd == "shuffle") return shuffle_main(argc-1, argv+1); else if (sub_cmd == "annotate") return annotate_main(argc-1, argv+1); @@ -184,6 +186,7 @@ int bedtools_help(void) cout << " slop " << "Adjust the size of intervals.\n"; cout << " flank " << "Create new intervals from the flanks of existing intervals.\n"; cout << " sort " << "Order the intervals in a file.\n"; + cout << " random " << "Generate random intervals in a genome.\n"; cout << " shuffle " << "Randomly redistrubute intervals in a genome.\n"; cout << " annotate " << "Annotate coverage of features from multiple files.\n"; diff --git a/src/randomBed/Makefile b/src/randomBed/Makefile new file mode 100644 index 00000000..3a9ee898 --- /dev/null +++ b/src/randomBed/Makefile @@ -0,0 +1,32 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/version/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= randomBedMain.cpp randomBed.cpp randomBed.h +OBJECTS= randomBedMain.o randomBed.o +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) + + +all: $(BUILT_OBJECTS) + +.PHONY: all + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/randomBedMain.o $(OBJ_DIR)/randomBed.o + +.PHONY: clean diff --git a/src/randomBed/randomBed.cpp b/src/randomBed/randomBed.cpp new file mode 100644 index 00000000..72f50352 --- /dev/null +++ b/src/randomBed/randomBed.cpp @@ -0,0 +1,78 @@ +/***************************************************************************** + randomBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "randomBed.h" + + +BedRandom::BedRandom(string &genomeFile, uint32_t numToGenerate, int seed, + bool haveSeed, uint32_t length) { + _genomeFile = genomeFile; + _numToGenerate = numToGenerate; + _length = length; + _haveSeed = haveSeed; + _seed = seed; + + // use the supplied seed for the random + // number generation if given. else, + // roll our own. + if (_haveSeed) { + _seed = seed; + srand(seed); + } + else { + // thanks to Rob Long for the tip. + _seed = (unsigned)time(0)+(unsigned)getpid(); + srand(_seed); + } + Generate(); +} + + +BedRandom::~BedRandom(void) +{} + + +void BedRandom::Generate() +{ + _genome = new GenomeFile(_genomeFile); + uint32_t genomeSize = _genome->getGenomeSize(); + + string chrom; + uint32_t start; + uint32_t end; + char strand; + uint32_t chromSize; + uint32_t numGenerated = 0; + while (numGenerated < _numToGenerate) + { + do + { + // we need to combine two consective calls to rand() + // because RAND_MAX is 2^31 (2147483648), whereas + // mammalian genomes are obviously much larger. + uint32_t randStart = ((rand() << 31) | rand()) % genomeSize; + // use the above randomStart (e.g., for human 0..3.1billion) + // to identify the chrom and start on that chrom. + pair<string, int> location = _genome->projectOnGenome(randStart); + chrom = location.first; + start = location.second; + end = start + _length; + chromSize = _genome->getChromSize(location.first); + // keep looking if we have exceeded the end of the chrom. + } while (end > chromSize); + numGenerated++; + // flip a coin for strand + (rand() / double(RAND_MAX)) > 0.5 ? strand = '+' : strand = '-'; + printf("%s\t%d\t%d\t%d\t%d\t%c\n", chrom.c_str(), start, end, numGenerated, end-start, strand); + } +} + + diff --git a/src/randomBed/randomBed.h b/src/randomBed/randomBed.h new file mode 100644 index 00000000..f922a38e --- /dev/null +++ b/src/randomBed/randomBed.h @@ -0,0 +1,55 @@ +/***************************************************************************** + randomBed.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "genomeFile.h" + +#include <vector> +#include <iostream> +#include <fstream> +#include <map> +#include <cstdlib> +#include <ctime> +#include <sys/time.h> +#include <unistd.h> +#include <sys/types.h> +#include <algorithm> // for binary search +using namespace std; + +const int MAX_TRIES = 1000000; + +//************************************************ +// Class methods and elements +//************************************************ +class BedRandom { + +public: + + // constructor + BedRandom(string &genomeFile, uint32_t numToGenerate, int seed, + bool haveSeed, uint32_t length); + + // destructor + ~BedRandom(void); + +private: + + string _genomeFile; + int _seed; + bool _haveSeed; + + GenomeFile *_genome; + uint32_t _length; + uint32_t _numToGenerate; + + // methods + void Generate(); + +}; diff --git a/src/randomBed/randomBedMain.cpp b/src/randomBed/randomBedMain.cpp new file mode 100644 index 00000000..122b4dec --- /dev/null +++ b/src/randomBed/randomBedMain.cpp @@ -0,0 +1,152 @@ +/***************************************************************************** + randomBedMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "randomBed.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools random" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void random_help(void); + +int random_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + bool haveGenome = false; + bool haveSeed = false; + int seed = -1; + int length = 100; + int numToGenerate = 1000000; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) random_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-seed", 5, parameterLength)) { + if ((i+1) < argc) { + haveSeed = true; + seed = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-l", 2, parameterLength)) { + if ((i+1) < argc) { + length = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-n", 2, parameterLength)) { + if ((i+1) < argc) { + numToGenerate = atoi(argv[i + 1]); + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + BedRandom *br = new BedRandom(genomeFile, numToGenerate, seed, haveSeed, length); + delete br; + return 0; + } + else { + random_help(); + } + return 0; +} + +void random_help(void) { + + cerr << "\nTool: bedtools random (aka randomBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Generate random intervals among a genome." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -g <genome>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-l\t" << "The length of the intervals to generate." << endl; + cerr << "\t\t- Default = 100." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-n\t" << "The number of intervals to generate." << endl; + cerr << "\t\t- Default = 1,000,000." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-n\t" << "Supply an integer seed for the shuffling." << endl; + cerr << "\t\t- By default, the seed is chosen automatically." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "\t-seed\t" << "Supply an integer seed for the shuffling." << endl; + cerr << "\t\t- By default, the seed is chosen automatically." << endl; + cerr << "\t\t- (INTEGER)" << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\t <chromName><TAB><chromSize>" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + + cerr << "Tips: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + + // end the program here + exit(1); +} -- GitLab