From a3c9136d22a8edcc58cec818c8452494887b132e Mon Sep 17 00:00:00 2001
From: Aaron <aaronquinlan@gmail.com>
Date: Sat, 11 Feb 2012 22:14:31 -0500
Subject: [PATCH] new tool: bedtools random

---
 Makefile                        |   1 +
 scripts/makeBashScripts.py      |   1 +
 src/bedtools.cpp                |   3 +
 src/randomBed/Makefile          |  32 +++++++
 src/randomBed/randomBed.cpp     |  78 ++++++++++++++++
 src/randomBed/randomBed.h       |  55 ++++++++++++
 src/randomBed/randomBedMain.cpp | 152 ++++++++++++++++++++++++++++++++
 7 files changed, 322 insertions(+)
 create mode 100644 src/randomBed/Makefile
 create mode 100644 src/randomBed/randomBed.cpp
 create mode 100644 src/randomBed/randomBed.h
 create mode 100644 src/randomBed/randomBedMain.cpp

diff --git a/Makefile b/Makefile
index a16294af..3e674de5 100644
--- a/Makefile
+++ b/Makefile
@@ -41,6 +41,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
 		  $(SRC_DIR)/nucBed \
 		  $(SRC_DIR)/pairToBed \
 		  $(SRC_DIR)/pairToPair \
+		  $(SRC_DIR)/randomBed \
 		  $(SRC_DIR)/shuffleBed \
 		  $(SRC_DIR)/slopBed \
 		  $(SRC_DIR)/sortBed \
diff --git a/scripts/makeBashScripts.py b/scripts/makeBashScripts.py
index 45647106..77d2c48c 100644
--- a/scripts/makeBashScripts.py
+++ b/scripts/makeBashScripts.py
@@ -34,6 +34,7 @@ def main():
                  'overlap': 'getOverlap', 
                  'pairtobed': 'pairToBed', 
                  'pairtopair': 'pairToPair', 
+                 'random': 'randomBed', 
                  'shuffle': 'shuffleBed', 
                  'slop': 'slopBed', 
                  'sort': 'sortBed', 
diff --git a/src/bedtools.cpp b/src/bedtools.cpp
index 8c1adbeb..765db2a0 100644
--- a/src/bedtools.cpp
+++ b/src/bedtools.cpp
@@ -59,6 +59,7 @@ int multiintersect_main(int argc, char* argv[]);//
 int nuc_main(int argc, char* argv[]);//
 int pairtobed_main(int argc, char* argv[]);//
 int pairtopair_main(int argc, char* argv[]);//
+int random_main(int argc, char* argv[]); //
 int shuffle_main(int argc, char* argv[]); //
 int slop_main(int argc, char* argv[]); //
 int sort_main(int argc, char* argv[]); //
@@ -92,6 +93,7 @@ int main(int argc, char *argv[])
     else if (sub_cmd == "slop")        return slop_main(argc-1, argv+1);
     else if (sub_cmd == "flank")       return flank_main(argc-1, argv+1);
     else if (sub_cmd == "sort")        return sort_main(argc-1, argv+1);
+    else if (sub_cmd == "random")      return random_main(argc-1, argv+1);
     else if (sub_cmd == "shuffle")     return shuffle_main(argc-1, argv+1);
     else if (sub_cmd == "annotate")    return annotate_main(argc-1, argv+1);
 
@@ -184,6 +186,7 @@ int bedtools_help(void)
     cout  << "    slop          "  << "Adjust the size of intervals.\n";
     cout  << "    flank         "  << "Create new intervals from the flanks of existing intervals.\n";
     cout  << "    sort          "  << "Order the intervals in a file.\n";
+    cout  << "    random        "  << "Generate random intervals in a genome.\n";
     cout  << "    shuffle       "  << "Randomly redistrubute intervals in a genome.\n";
     cout  << "    annotate      "  << "Annotate coverage of features from multiple files.\n";
     
diff --git a/src/randomBed/Makefile b/src/randomBed/Makefile
new file mode 100644
index 00000000..3a9ee898
--- /dev/null
+++ b/src/randomBed/Makefile
@@ -0,0 +1,32 @@
+UTILITIES_DIR = ../utils/
+OBJ_DIR = ../../obj/
+BIN_DIR = ../../bin/
+
+# -------------------
+# define our includes
+# -------------------
+INCLUDES = -I$(UTILITIES_DIR)/genomeFile/ \
+           -I$(UTILITIES_DIR)/BamTools/include \
+           -I$(UTILITIES_DIR)/version/
+
+# ----------------------------------
+# define our source and object files
+# ----------------------------------
+SOURCES= randomBedMain.cpp randomBed.cpp randomBed.h
+OBJECTS= randomBedMain.o randomBed.o
+BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
+
+
+all: $(BUILT_OBJECTS)
+
+.PHONY: all
+
+$(BUILT_OBJECTS): $(SOURCES)
+	@echo "  * compiling" $(*F).cpp
+	@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
+
+clean:
+	@echo "Cleaning up."
+	@rm -f $(OBJ_DIR)/randomBedMain.o $(OBJ_DIR)/randomBed.o
+
+.PHONY: clean
diff --git a/src/randomBed/randomBed.cpp b/src/randomBed/randomBed.cpp
new file mode 100644
index 00000000..72f50352
--- /dev/null
+++ b/src/randomBed/randomBed.cpp
@@ -0,0 +1,78 @@
+/*****************************************************************************
+  randomBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "randomBed.h"
+
+
+BedRandom::BedRandom(string &genomeFile, uint32_t numToGenerate, int seed,
+                       bool haveSeed, uint32_t length) {
+    _genomeFile        = genomeFile;
+    _numToGenerate     = numToGenerate;
+    _length            = length;
+    _haveSeed          = haveSeed;
+    _seed              = seed;
+
+    // use the supplied seed for the random
+    // number generation if given.  else,
+    // roll our own.
+    if (_haveSeed) {
+        _seed = seed;
+        srand(seed);
+    }
+    else {
+        // thanks to Rob Long for the tip.
+        _seed = (unsigned)time(0)+(unsigned)getpid();
+        srand(_seed);
+    }
+    Generate();
+}
+
+
+BedRandom::~BedRandom(void) 
+{}
+
+
+void BedRandom::Generate() 
+{
+    _genome  = new GenomeFile(_genomeFile);
+    uint32_t genomeSize  = _genome->getGenomeSize();
+    
+    string chrom;
+    uint32_t start;
+    uint32_t end;
+    char strand;
+    uint32_t chromSize;
+    uint32_t numGenerated = 0;
+    while (numGenerated < _numToGenerate)
+    {
+        do 
+        {
+            // we need to combine two consective calls to rand()
+            // because RAND_MAX is 2^31 (2147483648), whereas
+            // mammalian genomes are obviously much larger.
+            uint32_t randStart = ((rand() << 31) | rand()) % genomeSize;
+            // use the above randomStart (e.g., for human 0..3.1billion) 
+            // to identify the chrom and start on that chrom.
+            pair<string, int> location = _genome->projectOnGenome(randStart);
+            chrom     = location.first;
+            start     = location.second;
+            end       = start + _length;
+            chromSize = _genome->getChromSize(location.first);
+        // keep looking if we have exceeded the end of the chrom.
+        } while (end > chromSize);
+        numGenerated++;
+        // flip a coin for strand
+        (rand() / double(RAND_MAX)) > 0.5 ? strand = '+' : strand = '-';
+        printf("%s\t%d\t%d\t%d\t%d\t%c\n", chrom.c_str(), start, end, numGenerated, end-start, strand);
+    }
+}
+
+
diff --git a/src/randomBed/randomBed.h b/src/randomBed/randomBed.h
new file mode 100644
index 00000000..f922a38e
--- /dev/null
+++ b/src/randomBed/randomBed.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+  randomBed.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "genomeFile.h"
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <cstdlib>
+#include <ctime>
+#include <sys/time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <algorithm>  // for binary search
+using namespace std;
+
+const int MAX_TRIES = 1000000;
+
+//************************************************
+// Class methods and elements
+//************************************************
+class BedRandom {
+
+public:
+
+    // constructor
+    BedRandom(string &genomeFile, uint32_t numToGenerate, int seed,
+               bool haveSeed, uint32_t length);
+
+    // destructor
+    ~BedRandom(void);
+
+private:
+
+    string _genomeFile;
+    int _seed;
+    bool _haveSeed;
+
+    GenomeFile *_genome;
+    uint32_t _length;
+    uint32_t _numToGenerate;
+    
+    // methods
+    void Generate();
+
+};
diff --git a/src/randomBed/randomBedMain.cpp b/src/randomBed/randomBedMain.cpp
new file mode 100644
index 00000000..122b4dec
--- /dev/null
+++ b/src/randomBed/randomBedMain.cpp
@@ -0,0 +1,152 @@
+/*****************************************************************************
+  randomBedMain.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "randomBed.h"
+#include "version.h"
+
+using namespace std;
+
+// define our program name
+#define PROGRAM_NAME "bedtools random"
+
+
+// define our parameter checking macro
+#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+
+// function declarations
+void random_help(void);
+
+int random_main(int argc, char* argv[]) {
+
+    // our configuration variables
+    bool showHelp = false;
+
+    // input files
+    string bedFile = "stdin";
+    string genomeFile;
+
+    bool haveGenome       = false;
+    bool haveSeed         = false;
+    int seed              = -1;
+    int length            = 100;
+    int numToGenerate     = 1000000;
+
+    for(int i = 1; i < argc; i++) {
+        int parameterLength = (int)strlen(argv[i]);
+
+        if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
+        (PARAMETER_CHECK("--help", 5, parameterLength))) {
+            showHelp = true;
+        }
+    }
+
+    if(showHelp) random_help();
+
+    // do some parsing (all of these parameters require 2 strings)
+    for(int i = 1; i < argc; i++) {
+
+        int parameterLength = (int)strlen(argv[i]);
+
+        if(PARAMETER_CHECK("-g", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                haveGenome = true;
+                genomeFile = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-seed", 5, parameterLength)) {
+            if ((i+1) < argc) {
+                haveSeed = true;
+                seed = atoi(argv[i + 1]);
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-l", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                length = atoi(argv[i + 1]);
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-n", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                numToGenerate = atoi(argv[i + 1]);
+                i++;
+            }
+        }
+        else {
+          cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
+            showHelp = true;
+        }
+    }
+
+    // make sure we have both input files
+    if (!haveGenome) {
+      cerr << endl << "*****" << endl << "*****ERROR: Need a genome (-g) file. " << endl << "*****" << endl;
+      showHelp = true;
+    }
+
+    if (!showHelp) {
+        BedRandom *br = new BedRandom(genomeFile, numToGenerate, seed, haveSeed, length);
+        delete br;
+        return 0;
+    }
+    else {
+        random_help();
+    }
+    return 0;
+}
+
+void random_help(void) {
+
+    cerr << "\nTool:    bedtools random (aka randomBed)" << endl;
+    cerr << "Version: " << VERSION << "\n";    
+    cerr << "Summary: Generate random intervals among a genome." << endl << endl;
+
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -g <genome>" << endl << endl;
+
+    cerr << "Options: " << endl;
+
+    cerr << "\t-l\t"                << "The length of the intervals to generate." << endl;
+    cerr                            << "\t\t- Default = 100." << endl;
+    cerr                            << "\t\t- (INTEGER)" << endl << endl;
+
+    cerr << "\t-n\t"                << "The number of intervals to generate." << endl;
+    cerr                            << "\t\t- Default = 1,000,000." << endl;
+    cerr                            << "\t\t- (INTEGER)" << endl << endl;
+
+    cerr << "\t-n\t"                << "Supply an integer seed for the shuffling." << endl;
+    cerr                            << "\t\t- By default, the seed is chosen automatically." << endl;
+    cerr                            << "\t\t- (INTEGER)" << endl << endl;
+
+    cerr << "\t-seed\t"             << "Supply an integer seed for the shuffling." << endl;
+    cerr                            << "\t\t- By default, the seed is chosen automatically." << endl;
+    cerr                            << "\t\t- (INTEGER)" << endl << endl;
+
+    cerr << "Notes: " << endl;
+    cerr << "\t(1)  The genome file should tab delimited and structured as follows:" << endl;
+    cerr << "\t     <chromName><TAB><chromSize>" << endl << endl;
+    cerr << "\tFor example, Human (hg19):" << endl;
+    cerr << "\tchr1\t249250621" << endl;
+    cerr << "\tchr2\t243199373" << endl;
+    cerr << "\t..." << endl;
+    cerr << "\tchr18_gl000207_random\t4262" << endl << endl;
+
+
+    cerr << "Tips: " << endl;
+    cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
+    cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
+    cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
+    cerr << "\t\"select chrom, size from hg19.chromInfo\"  > hg19.genome" << endl << endl;
+
+
+    // end the program here
+    exit(1);
+}
-- 
GitLab