diff --git a/src/shuffleBed/Makefile b/src/shuffleBed/Makefile index 28af18583215cf029d4f94bc84312523cef6208b..2f945715cff8508e0167776f96e6b658064f4493 100644 --- a/src/shuffleBed/Makefile +++ b/src/shuffleBed/Makefile @@ -6,6 +6,7 @@ BIN_DIR = ../../bin/ # define our includes # ------------------- INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/bedFilePE/ \ -I$(UTILITIES_DIR)/genomeFile/ \ -I$(UTILITIES_DIR)/lineFileUtilities/ \ -I$(UTILITIES_DIR)/gzstream/ \ diff --git a/src/shuffleBed/shuffleBed.cpp b/src/shuffleBed/shuffleBed.cpp index e32abdb2c660f6af2b9ec7febbbfebf651e676c4..ac6f6228a44e6fa4cd7ad684252ac3706d2fe874 100644 --- a/src/shuffleBed/shuffleBed.cpp +++ b/src/shuffleBed/shuffleBed.cpp @@ -13,9 +13,12 @@ #include "shuffleBed.h" -BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile, - bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom, - float overlapFraction, int seed, bool chooseChrom) { +BedShuffle::BedShuffle(string &bedFile, string &genomeFile, + string &excludeFile, string &includeFile, + bool haveSeed, bool haveExclude, + bool haveInclude, bool sameChrom, + float overlapFraction, int seed, + bool chooseChrom, bool isBedpe) { _bedFile = bedFile; _genomeFile = genomeFile; @@ -27,6 +30,7 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, _overlapFraction = overlapFraction; _haveSeed = haveSeed; _chooseChrom = chooseChrom; + _isBedpe = isBedpe; // use the supplied seed for the random @@ -41,8 +45,12 @@ BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, _seed = (unsigned)time(0)+(unsigned)getpid(); srand(_seed); } + + if (_isBedpe == false) + _bed = new BedFile(bedFile); + else + _bedpe = new BedFilePE(bedFile); - _bed = new BedFile(bedFile); _genome = new GenomeFile(genomeFile); _chroms = _genome->getChromList(); _numChroms = _genome->getNumberOfChroms(); @@ -81,49 +89,129 @@ BedShuffle::~BedShuffle(void) { void BedShuffle::Shuffle() { - BED bedEntry; - _bed->Open(); - while (_bed->GetNextBed(bedEntry)) { - if (_bed->_status == BED_VALID) { - ChooseLocus(bedEntry); - _bed->reportBedNewLine(bedEntry); + + if (_isBedpe == false) { + BED bedEntry; + _bed->Open(); + while (_bed->GetNextBed(bedEntry)) { + if (_bed->_status == BED_VALID) { + ChooseLocus(bedEntry); + _bed->reportBedNewLine(bedEntry); + } } + _bed->Close(); + } + // BEDPE input + else { + int lineNum = 0; // current input line number + BedLineStatus status; + + BEDPE bedEntry; + _bedpe->Open(); + while ((status = _bedpe->GetNextBedPE(bedEntry, lineNum)) != BED_INVALID) + { + if (status == BED_VALID) { + ChoosePairedLocus(bedEntry); + _bedpe->reportBedPENewLine(bedEntry); + } + } + _bedpe->Close(); } - _bed->Close(); } void BedShuffle::ShuffleWithExclusions() { - BED bedEntry; - _bed->Open(); - while (_bed->GetNextBed(bedEntry)) { - if (_bed->_status == BED_VALID) { - // keep looking as long as the chosen - // locus happens to overlap with regions - // that the user wishes to exclude. - int tries = 0; - bool haveOverlap = false; - do - { - // choose a new locus - ChooseLocus(bedEntry); - haveOverlap = _exclude->anyHits(bedEntry.chrom, bedEntry.start, bedEntry.end, - bedEntry.strand, false, false, _overlapFraction, false); - tries++; - } while ((haveOverlap == true) && (tries <= MAX_TRIES)); + if (_isBedpe == false) { + BED bedEntry; + _bed->Open(); + while (_bed->GetNextBed(bedEntry)) { + if (_bed->_status == BED_VALID) { + // keep looking as long as the chosen + // locus happens to overlap with regions + // that the user wishes to exclude. + int tries = 0; + bool haveOverlap = false; + do + { + // choose a new locus + ChooseLocus(bedEntry); + haveOverlap = _exclude->anyHits(bedEntry.chrom, + bedEntry.start, + bedEntry.end, + bedEntry.strand, + false, false, + _overlapFraction, false); + tries++; + } while ((haveOverlap == true) && (tries <= MAX_TRIES)); - if (tries > MAX_TRIES) { - cerr << "Error, line " << _bed->_lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; + if (tries > MAX_TRIES) { + cerr << "Error, line " << _bed->_lineNum + << ": tried " << MAX_TRIES + << " potential loci for entry, but could not avoid " + << "excluded regions. Ignoring entry and moving on." + << endl; } + else { + _bed->reportBedNewLine(bedEntry); + } } - else { - _bed->reportBedNewLine(bedEntry); + } + _bed->Close(); + } + // BEDPE input + else { + int lineNum = 0; // current input line number + BedLineStatus status; + + BEDPE bedEntry; + _bedpe->Open(); + while ((status = _bedpe->GetNextBedPE(bedEntry, lineNum)) != BED_INVALID) + { + if (status == BED_VALID) { + // keep looking as long as the chosen + // locus happens to overlap with regions + // that the user wishes to exclude. + int tries = 0; + bool haveOverlap1 = false; + bool haveOverlap2 = false; + do + { + // choose a new locus + ChoosePairedLocus(bedEntry); + haveOverlap1 = _exclude->anyHits(bedEntry.chrom1, + bedEntry.start1, + bedEntry.end1, + bedEntry.strand1, + false, false, + _overlapFraction, false); + + haveOverlap2 = _exclude->anyHits(bedEntry.chrom2, + bedEntry.start2, + bedEntry.end2, + bedEntry.strand2, + false, false, + _overlapFraction, false); + tries++; + } while (((haveOverlap1 == true) || (haveOverlap2 == true)) + && (tries <= MAX_TRIES)); + + if (tries > MAX_TRIES) { + cerr << "Error, line " << _bed->_lineNum + << ": tried " << MAX_TRIES + << " potential loci for entry, but could not avoid " + << "excluded regions. Ignoring entry and moving on." + << endl; + } + else { + _bedpe->reportBedPENewLine(bedEntry); + } + } } + _bedpe->Close(); } - _bed->Close(); } @@ -198,6 +286,58 @@ void BedShuffle::ChooseLocus(BED &bedEntry) { } +void BedShuffle::ChoosePairedLocus(BEDPE &b) { + + CHRPOS foot1_len = b.end1 - b.start1; + CHRPOS foot2_len = b.end2 - b.start2; + CHRPOS length = b.end2 - b.start1; + + if (b.chrom1 == b.chrom2) + { + CHRPOS chromSize; + do + { + uint32_t randStart = ((rand() << 31) | rand()) % _genomeSize; + pair<string, int> location = _genome->projectOnGenome(randStart); + b.chrom1 = location.first; + b.chrom2 = location.first; + b.start1 = location.second; + b.end1 = b.start1 + foot1_len; + b.end2 = b.start1 + length; + b.start2 = b.end2 - foot2_len; + chromSize = _genome->getChromSize(location.first); + } while ((b.end1 > chromSize) || + (b.start2 > chromSize) || + (b.end2 > chromSize)); + // keep looking if we have exceeded the end of the chrom. + } + else + { + CHRPOS chromSize1, chromSize2; + do + { + uint32_t rand1Start = ((rand() << 31) | rand()) % _genomeSize; + uint32_t rand2Start = ((rand() << 31) | rand()) % _genomeSize; + pair<string, int> location1 = _genome->projectOnGenome(rand1Start); + pair<string, int> location2 = _genome->projectOnGenome(rand2Start); + + b.chrom1 = location1.first; + b.chrom2 = location2.first; + b.start1 = location1.second; + b.start2 = location2.second; + + b.end1 = b.start1 + foot1_len; + b.end2 = b.start2 + foot2_len; + chromSize1 = _genome->getChromSize(location1.first); + chromSize2 = _genome->getChromSize(location2.first); + + } while ((b.end1 > chromSize1) || + (b.end2 > chromSize2)); + // keep looking if we have exceeded the end of the chrom. + } +} + + void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) { string chrom = bedEntry.chrom; @@ -238,4 +378,3 @@ void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) { ChooseLocusFromInclusionFile(bedEntry); } } - diff --git a/src/shuffleBed/shuffleBed.h b/src/shuffleBed/shuffleBed.h index 7500a045118d342018c91aa359bf625f78ba1380..648a34d4c5b474e77486f13a1b667e2a86b9db7b 100644 --- a/src/shuffleBed/shuffleBed.h +++ b/src/shuffleBed/shuffleBed.h @@ -10,6 +10,7 @@ Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include "bedFile.h" +#include "bedFilePE.h" #include "genomeFile.h" #include <vector> @@ -34,9 +35,12 @@ class BedShuffle { public: // constructor - BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile, - bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom, - float overlapFraction, int seed, bool chooseChrom); + BedShuffle(string &bedFile, string &genomeFile, + string &excludeFile, string &includeFile, + bool haveSeed, bool haveExclude, + bool haveInclude, bool sameChrom, + float overlapFraction, int seed, + bool chooseChrom, bool isBedpe); // destructor ~BedShuffle(void); @@ -54,10 +58,12 @@ private: bool _haveInclude; bool _haveSeed; bool _chooseChrom; + bool _isBedpe; // The BED file from which to compute coverage. BedFile *_bed; + BedFilePE *_bedpe; BedFile *_exclude; BedFile *_include; @@ -76,4 +82,6 @@ private: void ChooseLocus(BED &); void ChooseLocusFromInclusionFile(BED &); + + void ChoosePairedLocus(BEDPE &b); }; diff --git a/src/shuffleBed/shuffleBedMain.cpp b/src/shuffleBed/shuffleBedMain.cpp index 8a8d8b602b85d584ec246ff21c2b9e83b3c57183..8edb6a40b28e505adffd76124e2ea5f3589bef90 100644 --- a/src/shuffleBed/shuffleBedMain.cpp +++ b/src/shuffleBed/shuffleBedMain.cpp @@ -44,6 +44,7 @@ int shuffle_main(int argc, char* argv[]) { int seed = -1; bool sameChrom = false; bool chooseChrom = false; + bool isBedpe = false; for(int i = 1; i < argc; i++) { @@ -110,6 +111,9 @@ int shuffle_main(int argc, char* argv[]) { i++; } } + else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { + isBedpe = true; + } else { cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; @@ -128,9 +132,10 @@ int shuffle_main(int argc, char* argv[]) { } if (!showHelp) { - BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, includeFile, - haveSeed, haveExclude, haveInclude, sameChrom, - overlapFraction, seed, chooseChrom); + BedShuffle *bc = new BedShuffle(bedFile, genomeFile, excludeFile, + includeFile, haveSeed, haveExclude, + haveInclude, sameChrom, overlapFraction, + seed, chooseChrom, isBedpe); delete bc; return 0; } @@ -177,7 +182,9 @@ void shuffle_help(void) { cerr << "\t\tgenome (the default), first choose a chrom randomly, and then" << endl; cerr << "\t\tchoose a random start coordinate on that chrom. This leads" << endl; cerr << "\t\tto features being ~uniformly distributed among the chroms," << endl; - cerr << "\t\tas opposed to features being distribute as a function of chrom size." << endl << endl; + cerr << "\t\tas opposed to features being distribute as a function of chrom size." << endl << endl; + + cerr << "\t-bedpe\t" << "Indicate that the A file is in BEDPE format." << endl << endl; cerr << "Notes: " << endl;