diff --git a/src/mergeBed/mergeMain.cpp b/src/mergeBed/mergeMain.cpp index 84b8db972337584ff7297d3058849a07776b5144..0a2f6946c6a20e540c75aaa5dbf3cd8bf3c1eb51 100755 --- a/src/mergeBed/mergeMain.cpp +++ b/src/mergeBed/mergeMain.cpp @@ -48,7 +48,7 @@ int main(int argc, char* argv[]) { int parameterLength = (int)strlen(argv[i]); - if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if(PARAMETER_CHECK("-i", 2, parameterLength)) { haveBed = true; bedFile = argv[i + 1]; i++; @@ -57,25 +57,25 @@ int main(int argc, char* argv[]) { numEntries = true; i++; } - else if(PARAMETER_CHECK("-d", 2, parameterLength)) { - haveMaxDistance = true; - maxDistance = atoi(argv[i + 1]); - i++; - } + else if(PARAMETER_CHECK("-d", 2, parameterLength)) { + haveMaxDistance = true; + maxDistance = atoi(argv[i + 1]); + i++; + } else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; } } // make sure we have both input files if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; } - + if (!showHelp) { - BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance); + BedMerge *bm = new BedMerge(bedFile, numEntries, maxDistance); bm->MergeBed(); return 0; } @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { } void ShowHelp(void) { - + cerr << "===============================================" << endl; cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; @@ -94,7 +94,7 @@ void ShowHelp(void) { cerr << "Description: Merges overlapping bed entries into a sinle interval." << endl << endl; cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl; - + cerr << "OPTIONS: " << endl; cerr << "\t" << "-n\t\t\t" << "Report the number of BED entries that were merged. (=1 if no merging occured)" << endl << endl; cerr << "\t" << "-d\t\t\t" << "Maximum distance between features allowed for features to be merged. (Default=0)" << endl << endl; @@ -105,5 +105,5 @@ void ShowHelp(void) { // end the program here exit(1); - + } diff --git a/src/sortBed/sortBed.cpp b/src/sortBed/sortBed.cpp index 90b6f2f970ef8f2858d0123aa5535126b8c95b27..13551e0f2ca38b5cc2ea5024f1626b0dd7fcaf71 100755 --- a/src/sortBed/sortBed.cpp +++ b/src/sortBed/sortBed.cpp @@ -24,9 +24,32 @@ BedSort::BedSort(string &bedFile) { BedSort::~BedSort(void) { } -// -// Merge overlapping BED entries into a single entry -// + +/* + reportBed + + Writes the _original_ BED entry for A. + Works for BED3 - BED6. +*/ +void BedSort::reportBed(const BED &a) { + + if (bed->bedType == 3) { + cout << a.chrom << "\t" << a.start << "\t" << a.end; + } + else if (bed->bedType == 4) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name; + } + else if (bed->bedType == 5) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score; + } + else if (bed->bedType == 6) { + cout << a.chrom << "\t" << a.start << "\t" << a.end << "\t" + << a.name << "\t" << a.score << "\t" << a.strand; + } +} + void BedSort::SortBed() { // load the "B" bed file into a map so @@ -40,8 +63,169 @@ void BedSort::SortBed() { vector<BED> bedList = m->second; for (unsigned int i = 0; i < bedList.size(); ++i) { - cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + reportBed(bedList[i]); cout << "\n"; + ///cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + } + } +} + + +void BedSort::SortBedBySizeAsc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + vector<BED> masterList; + masterList.reserve(1000000); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // add the entries from this chromosome to the current list + for (unsigned int i = 0; i < m->second.size(); ++i) { + masterList.push_back(m->second[i]); } } + + // sort the master list by size (asc.) + sort(masterList.begin(), masterList.end(), sortBySizeAsc); + + // report the entries in ascending order + for (unsigned int i = 0; i < masterList.size(); ++i) { + reportBed(masterList[i]); cout << "\n"; + //cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl; + } +} + + +void BedSort::SortBedBySizeDesc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + vector<BED> masterList; + masterList.reserve(1000000); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + // add the entries from this chromosome to the current list + for (unsigned int i = 0; i < m->second.size(); ++i) { + masterList.push_back(m->second[i]); + } + } + + // sort the master list by size (asc.) + sort(masterList.begin(), masterList.end(), sortBySizeDesc); + + // report the entries in ascending order + for (unsigned int i = 0; i < masterList.size(); ++i) { + reportBed(masterList[i]); cout << "\n"; + //cout << masterList[i].chrom << "\t" << masterList[i].start << "\t" << masterList[i].end << endl; + } +} + +void BedSort::SortBedByChromThenSizeAsc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortBySizeAsc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + reportBed(bedList[i]); cout << "\n"; + //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + } + } +} + + +void BedSort::SortBedByChromThenSizeDesc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + + sort(bedList.begin(), bedList.end(), sortBySizeDesc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + reportBed(bedList[i]); cout << "\n"; + //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + } + } +} + + +void BedSort::SortBedByChromThenScoreAsc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + if (bed->bedType >= 5) { + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortByScoreAsc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + reportBed(bedList[i]); cout << "\n"; + //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + } + } + } + else { + cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; + exit(1); + } +} + + +void BedSort::SortBedByChromThenScoreDesc() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + bed->loadBedFileIntoMapNoBin(); + + if (bed->bedType >= 5) { + // loop through each chromosome and merge their BED entries + for (masterBedMapNoBin::iterator m = bed->bedMapNoBin.begin(); m != bed->bedMapNoBin.end(); ++m) { + + // bedList is already sorted by start position. + vector<BED> bedList = m->second; + sort(bedList.begin(), bedList.end(), sortByScoreDesc); + + for (unsigned int i = 0; i < bedList.size(); ++i) { + reportBed(bedList[i]); cout << "\n"; + //cout << bedList[i].chrom << "\t" << bedList[i].start << "\t" << bedList[i].end << endl; + } + } + } + else { + cerr << "Error: Requested a sort by score, but your BED file does not appear to be in BED 5 format or greater. Exiting." << endl; + exit(1); + } } diff --git a/src/sortBed/sortBed.h b/src/sortBed/sortBed.h index 5e46e290a0760fd99fb033ddbeaaf07a14cb4718..7f3fcb3b1e168837aebe5329720260794ba7d703 100755 --- a/src/sortBed/sortBed.h +++ b/src/sortBed/sortBed.h @@ -22,16 +22,24 @@ class BedSort { public: - // constructor - BedSort(string &); - - // destructor - ~BedSort(void); - - void SortBed(); - -private: + // constructor + BedSort(string &); + + // destructor + ~BedSort(void); + + // write BED to stdout + void reportBed(const BED &); + + void SortBed(); // the default. sorts by chrom (asc.) then by start (asc.) + void SortBedBySizeAsc(); + void SortBedBySizeDesc(); + void SortBedByChromThenSizeAsc(); + void SortBedByChromThenSizeDesc(); + void SortBedByChromThenScoreAsc(); + void SortBedByChromThenScoreDesc(); +private: string bedFile; // instance of a bed file class. diff --git a/src/sortBed/sortMain.cpp b/src/sortBed/sortMain.cpp index 362c60e8bb39c03d05fa9e7f915f7970a0a79918..e5d09f9c169dec895cadd98718edf4fdab6efbc6 100755 --- a/src/sortBed/sortMain.cpp +++ b/src/sortBed/sortMain.cpp @@ -23,7 +23,15 @@ int main(int argc, char* argv[]) { // input files string bedFile; bool haveBed = false; - + int sortChoices = 0; + + bool sortBySizeAsc = false; + bool sortBySizeDesc = false; + bool sortByChromThenSizeAsc = false; + bool sortByChromThenSizeDesc = false; + bool sortByChromThenScoreAsc = false; + bool sortByChromThenScoreDesc = false; + for(int i = 1; i < argc; i++) { int parameterLength = (int)strlen(argv[i]); @@ -45,26 +53,82 @@ int main(int argc, char* argv[]) { int parameterLength = (int)strlen(argv[i]); - if(argv[i]) { + if(PARAMETER_CHECK("-i", 2, parameterLength)) { haveBed = true; - bedFile = argv[i]; + bedFile = argv[i + 1]; + i++; + } + else if(PARAMETER_CHECK("-sizeA", 6, parameterLength)) { + sortBySizeAsc = true; + sortChoices++; + i++; + } + else if(PARAMETER_CHECK("-sizeD", 6, parameterLength)) { + sortBySizeDesc = true; + sortChoices++; + i++; + } + else if(PARAMETER_CHECK("-chrThenSizeA", 13, parameterLength)) { + sortByChromThenSizeAsc = true; + sortChoices++; i++; } + else if(PARAMETER_CHECK("-chrThenSizeD", 13, parameterLength)) { + sortByChromThenSizeDesc = true; + sortChoices++; + i++; + } + else if(PARAMETER_CHECK("-chrThenScoreA", 14, parameterLength)) { + sortByChromThenScoreAsc = true; + sortChoices++; + i++; + } + else if(PARAMETER_CHECK("-chrThenScoreD", 14, parameterLength)) { + sortByChromThenScoreDesc = true; + sortChoices++; + i++; + } else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; showHelp = true; } } // make sure we have both input files if (!haveBed) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; - showHelp = true; + cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file. " << endl << "*****" << endl; + showHelp = true; } - + if (sortChoices > 1) { + cerr << endl << "*****" << endl << "*****ERROR: Sorting options are mutually exclusive. Please choose just one. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { BedSort *bm = new BedSort(bedFile); - bm->SortBed(); + + if (sortBySizeAsc) { + bm->SortBedBySizeAsc(); + } + else if (sortBySizeDesc) { + bm->SortBedBySizeDesc(); + } + else if (sortByChromThenSizeAsc) { + bm->SortBedByChromThenSizeAsc(); + } + else if (sortByChromThenSizeDesc) { + bm->SortBedByChromThenSizeDesc(); + } + else if (sortByChromThenScoreAsc) { + bm->SortBedByChromThenScoreAsc(); + } + else if (sortByChromThenScoreDesc) { + bm->SortBedByChromThenScoreDesc(); + } + else { + bm->SortBed(); + } return 0; } else { @@ -73,17 +137,27 @@ int main(int argc, char* argv[]) { } void ShowHelp(void) { - + cerr << "===============================================" << endl; cerr << " " <<PROGRAM_NAME << " v" << VERSION << endl ; cerr << " Aaron Quinlan, Ph.D. (aaronquinlan@gmail.com) " << endl ; cerr << " Hall Laboratory, University of Virginia" << endl; cerr << "===============================================" << endl << endl; - cerr << "Description: Sorts a BED file by chrom, then by start position." << endl << endl; - cerr << "***NOTE: Only BED3 - BED6 formats allowed.***"<< endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " <input.bed>" << endl << endl; + cerr << "Description: Sorts a BED file in various and useful ways." << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <input.bed>" << endl << endl; + + cerr << "OPTIONS: " << endl; + cerr << "\t" << "-sizeA\t\t" << "Sort the BED file by feature size in ascending order. Sorts across all chromosomes." << endl << endl; + cerr << "\t" << "-sizeD\t\t" << "Sort the BED file by feature size in descending order. Sorts across all chromosomes." << endl << endl; + cerr << "\t" << "-chrThenSizeA\t" << "Sort the BED file by chrom (ascending), then feature size in ascending order." << endl << endl; + cerr << "\t" << "-chrThenSizeD\t" << "Sort the BED file by chrom (ascending), then feature size in descending order." << endl << endl; + cerr << "\t" << "-chrThenScoreA\t" << "Sort the BED file by chrom (ascending), then score in ascending order." << endl << endl; + cerr << "\t" << "-chrThenScoreD\t" << "Sort the BED file by chrom (ascending), then scor size in descending order." << endl << endl; + + cerr << "NOTES: " << endl; + cerr << "\t" << "-i stdin\t\t" << "Allows BED file A to be read from stdin. E.g.: cat a.bed | sortBed -i stdin" << endl << endl; + cerr << "\t***Only BED3 - BED6 formats allowed.***"<< endl << endl; // end the program here exit(1); - + } diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index 67ac0cb37ca3462849d6e719d4e71f6c386bab32..19929909fc56f4e72d750c616359ad3d98f4cfc0 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -68,29 +68,6 @@ int max(const int a, int b) { } } -//********************************************* -// Sorting functions -//********************************************* -bool sortByChrom(BED const & a, BED const & b){ - if (a.chrom < b.chrom) return true; - else return false; -}; - -bool sortByStart(const BED &a, const BED &b){ - if (a.start < b.start) return true; - else return false; -}; - -bool byChromThenStart(BED const & a, BED const & b){ - - if (a.chrom < b.chrom) return true; - else if (a.chrom > b.chrom) return false; - - if (a.start < b.start) return true; - else if (a.start >= b.start) return false; - -}; - //************************************************ // Exception checking //************************************************ @@ -120,7 +97,58 @@ static int getBin(int start, int end) return 0; } +//********************************************* +// Sorting functions +//********************************************* +bool sortByChrom(BED const & a, BED const & b){ + if (a.chrom < b.chrom) return true; + else return false; +}; + +bool sortByStart(const BED &a, const BED &b){ + if (a.start < b.start) return true; + else return false; +}; + +bool sortBySizeAsc(const BED &a, const BED &b){ + + unsigned int aLen = a.end - a.start; + unsigned int bLen = b.end - b.start; + + if (aLen < bLen) return true; + else return false; +}; + +bool sortBySizeDesc(const BED &a, const BED &b){ + + unsigned int aLen = a.end - a.start; + unsigned int bLen = b.end - b.start; + + if (aLen > bLen) return true; + else return false; +}; +bool sortByScoreAsc(const BED &a, const BED &b){ + if (a.score < b.score) return true; + else return false; +}; + +bool sortByScoreDesc(const BED &a, const BED &b){ + if (a.score > b.score) return true; + else return false; +}; + + +bool byChromThenStart(BED const & a, BED const & b){ + + if (a.chrom < b.chrom) return true; + else if (a.chrom > b.chrom) return false; + + if (a.start < b.start) return true; + else if (a.start >= b.start) return false; + + return false; +}; void BedFile::binKeeperFind(map<int, vector<BED>, std::less<int> > &bk, const int start, const int end, vector<BED> &hits) /* diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 5f72330bb2ecd965e96520276f7d5439c7cdd73b..d0009798c5dbb8c3347b6eb43ae174816fc14490 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -59,6 +59,15 @@ std::string ToString(const T & value) void Tokenize(const string& str, vector<string>& tokens); +// BED Sorting Methods +bool sortByChrom(BED const &, BED const &); +bool sortByStart(const BED &, const BED &); +bool sortBySizeAsc(const BED &, const BED &); +bool sortBySizeDesc(const BED &, const BED &); +bool sortByScoreAsc(const BED &, const BED &); +bool sortByScoreDesc(const BED &, const BED &); +bool byChromThenStart(BED const &, BED const &); + //************************************************* // Common typedefs //*************************************************