Commit 5e3cb5f5 authored by Aaron's avatar Aaron
Browse files

Make windows from intervals in BED files.

parent e8c5c14e
......@@ -6,6 +6,10 @@ BIN_DIR = ../../bin/
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/version/
......
/*****************************************************************************
windowMaker.cpp
windowMaker.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "windowMaker.h"
// Fixed-size constructor.
//
// Records the window size and step, selects the FIXED_WINDOW_SIZE method and
// immediately generates the windows: the input named by fileName is read as a
// genome file when input_file_type is GENOME_FILE and as a BED file otherwise.
//
//   fileName         path of the genome or BED file supplying the intervals
//   input_file_type  GENOME_FILE or BED_FILE
//   size             length of each window
//   step             distance between the starts of consecutive windows
//
// _count is unused by this method and is zeroed.
WindowMaker::WindowMaker(string &fileName, INPUT_FILE_TYPE input_file_type, uint32_t size, uint32_t step)
    : _size(size)
    , _step(step)
    , _count(0)
    , _window_method(FIXED_WINDOW_SIZE)
{
    if (input_file_type == GENOME_FILE)
        MakeWindowsFromGenome(fileName);
    else
        MakeWindowsFromBED(fileName);
}
// Fixed-count constructor.
//
// Stores the requested number of windows per interval, selects the
// FIXED_WINDOW_COUNT method and generates the windows right away from the
// genome file or BED file named by fileName. _size and _step are not used by
// this method and are zeroed.
WindowMaker::WindowMaker(string &fileName, INPUT_FILE_TYPE input_file_type, uint32_t count)
    : _size(0), _step(0), _count(count), _window_method(FIXED_WINDOW_COUNT)
{
    switch (input_file_type) {
        case GENOME_FILE:
            MakeWindowsFromGenome(fileName);
            break;
        default:
            // every non-genome input type is read as a BED file
            MakeWindowsFromBED(fileName);
            break;
    }
}
// Destructor: performs no cleanup.
WindowMaker::~WindowMaker() {
}
void WindowMaker::MakeWindows() {
void WindowMaker::MakeWindowsFromGenome(const string& genomeFileName) {
GenomeFile *_genome = new GenomeFile(genomeFileName);
// get a list of the chroms in the user's genome
vector<string> chromList = _genome->getChromList();
vector<string> chromList = _genome->getChromList();
// process each chrom in the genome
for (size_t c = 0; c < chromList.size(); ++c) {
string chrom = chromList[c];
uint32_t chrom_size = _genome->getChromSize(chrom);
for (uint32_t start = 0; start <= chrom_size; start += _step) {
if ((start + _size) <= chrom_size) {
cout << chrom << "\t" << start << "\t" << start + _size << endl;
}
else if (start < chrom_size) {
cout << chrom << "\t" << start << "\t" << chrom_size << endl;
}
}
BED bed(chrom,0,_genome->getChromSize(chrom));
MakeBEDWindow(bed);
}
}
// Generates windows for every valid record of a BED file.
//
// Opens the file, reads it record by record, forwards each record whose
// reader status is BED_VALID to MakeBEDWindow(), and closes the file once
// GetNextBed() reports that nothing more can be read.
//
//   bedFileName  path of the BED file to read
void WindowMaker::MakeWindowsFromBED(string& bedFileName) {
    BedFile reader(bedFileName);
    reader.Open();

    BED record;
    while (reader.GetNextBed(record)) {
        // records with any other status are ignored
        if (reader._status != BED_VALID)
            continue;
        MakeBEDWindow(record);
    }

    reader.Close();
}
// Splits one interval into windows using the method chosen at construction:
// fixed-size windows for FIXED_WINDOW_SIZE, a fixed number of windows for
// anything else.
void WindowMaker::MakeBEDWindow(const BED& interval)
{
    switch (_window_method) {
        case FIXED_WINDOW_SIZE:
            MakeFixedSizeWindow(interval);
            break;
        default:
            MakeFixedCountWindow(interval);
            break;
    }
}
// Writes fixed-size windows covering one interval to stdout.
//
// Window starts begin at interval.start and advance by _step. A window that
// fits entirely inside the interval is written as
// "chrom<TAB>start<TAB>start+_size"; a window that would run past
// interval.end is clipped to end at interval.end. Nothing is written for a
// start position equal to interval.end unless _size is zero.
//
//   interval  the chromosome or BED interval to split
void WindowMaker::MakeFixedSizeWindow(const BED& interval) {
    // A zero step never advances the start position; without this guard the
    // loop below would spin forever.
    if (_step == 0)
        return;

    // 64-bit positions so that start + _size and start += _step cannot wrap
    // around for intervals near the top of the 32-bit range.
    const uint64_t intervalEnd = interval.end;

    for (uint64_t start = interval.start; start <= intervalEnd; start += _step) {
        const uint64_t end = start + _size;
        if (end <= intervalEnd) {
            cout << interval.chrom << "\t" << start << "\t" << end << endl;
        }
        else if (start < intervalEnd) {
            // last, partial window: clip it to the interval end
            cout << interval.chrom << "\t" << start << "\t" << intervalEnd << endl;
        }
    }
}
// Writes windows for one interval to stdout, sized so that the interval is
// split into at most _count windows.
//
// The window size is ceil(interval_size / _count); every window has that size
// except possibly the last, which is clipped to interval.end. Each window is
// written as "chrom<TAB>start<TAB>end".
//
//   interval  the chromosome or BED interval to split
void WindowMaker::MakeFixedCountWindow(const BED& interval) {
    // Nothing to do for an empty or inverted interval (the unsigned
    // subtraction below would otherwise underflow), and a zero window count
    // would divide by zero.
    if (_count == 0 || interval.end <= interval.start)
        return;

    // 64-bit positions so that start + window_size cannot wrap around.
    const uint64_t interval_start = interval.start;
    const uint64_t interval_end   = interval.end;
    const uint64_t interval_size  = interval_end - interval_start;

    // integer version of ceil(interval_size/_count); always >= 1 here
    const uint64_t window_size = (interval_size - 1) / _count + 1;

    // Strictly less-than: a start equal to interval_end would produce a
    // zero-length window whenever window_size divides interval_size evenly.
    for (uint64_t start = interval_start; start < interval_end; start += window_size) {
        const uint64_t end = min(start + window_size, interval_end);
        cout << interval.chrom << "\t" << start << "\t" << end << endl;
    }
}
\ No newline at end of file
/*****************************************************************************
windowMaker.h
windowMaker.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "genomeFile.h"
#include "bedFile.h"
using namespace std;
......@@ -20,18 +21,33 @@ using namespace std;
// Splits genomic intervals into windows and writes them to stdout.
//
// The intervals come either from a genome file (one interval per chromosome)
// or from a BED file (one interval per record). Windows are produced as a
// side effect of construction; the two constructors select between
// fixed-size and fixed-count windowing.
class WindowMaker {

public:

    // Kind of file the intervals are read from.
    enum INPUT_FILE_TYPE {
        GENOME_FILE,
        BED_FILE
    };

    // How each interval is divided into windows.
    enum WINDOW_METHOD {
        FIXED_WINDOW_SIZE,
        FIXED_WINDOW_COUNT
    };

    // constructor: divide each interval into `count` windows
    WindowMaker(string &fileName, INPUT_FILE_TYPE input_file_type, uint32_t count);

    // constructor: divide each interval into windows of `size`, advancing by `step`
    WindowMaker(string &fileName, INPUT_FILE_TYPE input_file_type, uint32_t size, uint32_t step);

    // destructor
    ~WindowMaker(void);

    // Make windows for every chromosome in the given genome file.
    void MakeWindowsFromGenome(const string& genomeFileName);

    // Make windows for every valid record in the given BED file.
    void MakeWindowsFromBED(string& bedFileName);

private:
    uint32_t _size;                 // window length (fixed-size method)
    uint32_t _step;                 // distance between window starts (fixed-size method)
    uint32_t _count;                // windows per interval (fixed-count method)
    WINDOW_METHOD _window_method;   // method selected by the constructor

    // Dispatch one interval to the configured windowing method.
    void MakeBEDWindow(const BED& interval);
    void MakeFixedSizeWindow(const BED& interval);
    void MakeFixedCountWindow(const BED& interval);
};
\ No newline at end of file
/*****************************************************************************
windowMakerMain.cpp
windowMakerMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "windowMaker.h"
#include "version.h"
......@@ -30,15 +30,19 @@ int windowmaker_main(int argc, char* argv[]) {
bool showHelp = false;
// input files
string genomeFile;
string inputFile;
WindowMaker::INPUT_FILE_TYPE inputFileType = WindowMaker::GENOME_FILE;
// parms
uint32_t size = 0;
uint32_t step = 0;
uint32_t count = 0;
bool haveGenome = false;
bool haveSize = false;
bool haveBed = false;
bool haveSize = false;
bool haveCount = false;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
......@@ -58,7 +62,16 @@ int windowmaker_main(int argc, char* argv[]) {
if(PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) < argc) {
haveGenome = true;
genomeFile = argv[i + 1];
inputFile = argv[i + 1];
inputFileType = WindowMaker::GENOME_FILE;
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBed = true;
inputFile = argv[i + 1];
inputFileType = WindowMaker::BED_FILE;
i++;
}
}
......@@ -76,19 +89,42 @@ int windowmaker_main(int argc, char* argv[]) {
i++;
}
}
else if(PARAMETER_CHECK("-n", 2, parameterLength)) {
if ((i+1) < argc) {
haveCount = true;
count = atoi(argv[i + 1]);
i++;
}
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveGenome || !haveSize) {
cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome file) and -w (window size). " << endl << "*****" << endl;
showHelp = true;
if (!haveGenome && !haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome file) or -b (BED file) for interval source. " << endl << "*****" << endl;
showHelp = true;
}
if (haveGenome && haveBed) {
cerr << endl << "*****" << endl << "*****ERROR: Can't combine -g (genome file) and -b (BED file). Please use one or the other." << endl << "*****" << endl;
showHelp = true;
}
if (!haveSize && !haveCount) {
cerr << endl << "*****" << endl << "*****ERROR: Need -w (window size) or -n (number of windows). " << endl << "*****" << endl;
showHelp = true;
}
if (haveSize && haveCount) {
cerr << endl << "*****" << endl << "*****ERROR: Can't combine -w (window size) and -n (number of windows). Please use one or the other. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
WindowMaker *wm = new WindowMaker(genomeFile, size, step);
WindowMaker *wm = NULL;
if (haveCount)
wm = new WindowMaker(inputFile, inputFileType, count);
if (haveSize)
wm = new WindowMaker(inputFile, inputFileType, size, step);
delete wm;
}
else {
......@@ -99,21 +135,43 @@ int windowmaker_main(int argc, char* argv[]) {
void windowmaker_help(void) {
cerr << "\nTool: bedtools makewindows" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "\nTool: bedtools makewindows" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Makes adjacent and/or sliding windows across a genome." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -g <genome> -w <window_size>" << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] [-g <genome> OR -b <bed>]" << endl;
cerr << " [ -w <window_size> OR -n <number of windows> ]" << endl << endl;
cerr << "Input Options: " << endl;
cerr << "\t-g <genome>" << endl;
cerr << "\t\tGenome file size (see notes below)." << endl;
cerr << "\t\tWindows will be created for each chromosome in the file." << endl << endl;
cerr << "\t-b <bed>" << endl;
cerr << "\t\tBED file (with chrom,start,end fields)." << endl;
cerr << "\t\tWindows will be created for each interval in the file." << endl << endl;
cerr << "Options: " << endl;
cerr << "Windows Output Options: " << endl;
cerr << "\t-w <window_size>" << endl;
cerr << "\t\tDivide each input interval (either a chromosome or a BED interval)" << endl;
cerr << "\t\tto fixed-sized windows (i.e. same number of nucleotide in each window)." << endl;
cerr << "\t\tCan be combined with -s <step_size>" << endl << endl;
cerr << "\t-s <step_size>" << endl;
cerr << "\t\tStep size: i.e., how many base pairs to step before" << endl;
cerr << "\t\tcreating a new window. Used to create \"sliding\" windows." << endl;
cerr << "\t\t- Defaults to window size (non-sliding windows)." << endl << endl;
cerr << "\t-n <number_of_windows>" << endl;
cerr << "\t\tDivide each input interval (either a chromosome or a BED interval)" << endl;
cerr << "\t\tto fixed number of windows (i.e. same number of windows, with" << endl;
cerr << "\t\tvarying window sizes)." << endl << endl;
cerr << "\t-s\t" << "Step size: i.e., how many base pairs to step before" << endl;
cerr << "\t\tcreating a new window. Used to create \"sliding\" windows." << endl;
cerr << "\t\t- Defaults to -w (non-sliding windows)." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
cerr << "\tFor example, Human (hg19):" << endl;
cerr << "\tchr1\t249250621" << endl;
cerr << "\tchr2\t243199373" << endl;
......@@ -124,8 +182,59 @@ void windowmaker_help(void) {
cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;
cerr << "Examples: " << endl;
cerr << " # Divide the human genome into windows of 1MB:" << endl;
cerr << " $ " << PROGRAM_NAME << " -g hg19.txt -w 1000000" << endl;
cerr << " chr1 0 1000000" << endl;
cerr << " chr1 1000000 2000000" << endl;
cerr << " chr1 2000000 3000000" << endl;
cerr << " chr1 3000000 4000000" << endl;
cerr << " chr1 4000000 5000000" << endl;
cerr << " ..." << endl;
cerr << endl;
cerr << " # Divide the human genome into sliding (=overlapping) windows of 1MB, with 500KB overlap:" << endl;
cerr << " $ " << PROGRAM_NAME << " -g hg19.txt -w 1000000 -s 500000" << endl;
cerr << " chr1 0 1000000" << endl;
cerr << " chr1 500000 1500000" << endl;
cerr << " chr1 1000000 2000000" << endl;
cerr << " chr1 1500000 2500000" << endl;
cerr << " chr1 2000000 3000000" << endl;
cerr << " ..." << endl;
cerr << endl;
cerr << " # Divide each chromosome in human genome to 1000 windows of equal size:" << endl;
cerr << " $ " << PROGRAM_NAME << " -g hg19.txt -n 1000" << endl;
cerr << " chr1 0 249251" << endl;
cerr << " chr1 249251 498502" << endl;
cerr << " chr1 498502 747753" << endl;
cerr << " chr1 747753 997004" << endl;
cerr << " chr1 997004 1246255" << endl;
cerr << " ..." << endl;
cerr << endl;
cerr << " # Divide each interval in the given BED file into 10 equal-sized windows:" << endl;
cerr << " $ cat input.bed" << endl;
cerr << " chr5 60000 70000" << endl;
cerr << " chr5 73000 90000" << endl;
cerr << " chr5 100000 101000" << endl;
cerr << " $ " << PROGRAM_NAME << " -b input.bed -n 10" << endl;
cerr << " chr5 60000 61000" << endl;
cerr << " chr5 61000 62000" << endl;
cerr << " chr5 62000 63000" << endl;
cerr << " chr5 63000 64000" << endl;
cerr << " chr5 64000 65000" << endl;
cerr << " ..." << endl;
cerr << endl;
cerr << endl;
exit(1);
}
}
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment