Skip to content
Snippets Groups Projects
Commit 60015931 authored by Aaron's avatar Aaron
Browse files

[ENC] Add -N (removeAny) option to subtract. Re-adding improvement from @brentp

parent b3e6fa27
No related branches found
No related tags found
No related merge requests found
...@@ -11,20 +11,21 @@ ...@@ -11,20 +11,21 @@
******************************************************************************/ ******************************************************************************/
#include "lineFileUtilities.h" #include "lineFileUtilities.h"
#include "subtractBed.h" #include "subtractBed.h"
#include <numeric>
/* /*
Constructor Constructor
*/ */
BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, BedSubtract::BedSubtract(string &bedAFile, string &bedBFile,
float overlapFraction, bool sameStrand, float overlapFraction, bool sameStrand,
bool diffStrand, bool removeAll) bool diffStrand, bool removeAll, bool removeAny)
: _bedAFile(bedAFile) : _bedAFile(bedAFile)
, _bedBFile(bedBFile) , _bedBFile(bedBFile)
, _overlapFraction(overlapFraction) , _overlapFraction(overlapFraction)
, _sameStrand(sameStrand) , _sameStrand(sameStrand)
, _diffStrand(diffStrand) , _diffStrand(diffStrand)
, _removeAll(removeAll) , _removeAll(removeAll)
, _removeAny(removeAny)
{ {
_bedA = new BedFile(bedAFile); _bedA = new BedFile(bedAFile);
_bedB = new BedFile(bedBFile); _bedB = new BedFile(bedBFile);
...@@ -54,7 +55,6 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { ...@@ -54,7 +55,6 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
vector<BED>::const_iterator h = hits.begin(); vector<BED>::const_iterator h = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end(); vector<BED>::const_iterator hitsEnd = hits.end();
for (; h != hitsEnd; ++h) { for (; h != hitsEnd; ++h) {
int s = max(a.start, h->start); int s = max(a.start, h->start);
int e = min(a.end, h->end); int e = min(a.end, h->end);
int overlapBases = (e - s); int overlapBases = (e - s);
...@@ -69,7 +69,7 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { ...@@ -69,7 +69,7 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
numOverlaps++; numOverlaps++;
numConsumedByB++; numConsumedByB++;
} }
else if ( overlap >= _overlapFraction ) { else if ( overlap >= _overlapFraction || _removeAny) {
numOverlaps++; numOverlaps++;
bOverlaps.push_back(*h); bOverlaps.push_back(*h);
} }
...@@ -80,7 +80,7 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { ...@@ -80,7 +80,7 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
// no overlap found, so just report A as-is. // no overlap found, so just report A as-is.
_bedA->reportBedNewLine(a); _bedA->reportBedNewLine(a);
} }
else if (numOverlaps == 1) { else if ((numOverlaps == 1) && (!_removeAny)) {
// one overlap found. only need to look at the single // one overlap found. only need to look at the single
// entry in bOverlaps. // entry in bOverlaps.
...@@ -116,14 +116,17 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { ...@@ -116,14 +116,17 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
} }
} }
} }
else if (numOverlaps > 1) { else if ((numOverlaps > 1) || _removeAny) {
// multiple overlapz found. look at all the hits // multiple overlapz found. look at all the hits
// and figure out which bases in A survived. then // and figure out which bases in A survived. then
// report the contigous intervals that survived. // report the contigous intervals that survived.
vector<bool> aKeep(a.end - a.start, true); vector<bool> aKeep(a.end - a.start, true);
if ((numConsumedByB == 0) && (_removeAll == false)) { if ((numConsumedByB == 0)) {
if(_removeAll){ return; }
// if there's any overlap, then we don't report.
// track the number of hit starts and ends at each position in A // track the number of hit starts and ends at each position in A
for (vector<BED>::iterator h = bOverlaps.begin(); for (vector<BED>::iterator h = bOverlaps.begin();
h != bOverlaps.end(); h != bOverlaps.end();
...@@ -136,6 +139,14 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { ...@@ -136,6 +139,14 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
aKeep[i-a.start-1] = false; aKeep[i-a.start-1] = false;
} }
} }
if (_removeAny){
int asum = std::accumulate(aKeep.rbegin(), aKeep.rend(), 0);
float afrac = 1.0 - (float)asum / aKeep.size();
if(afrac > _overlapFraction){ return ; }
for (unsigned int i = 0; i < aKeep.size(); ++i) {
aKeep[i] = true;
}
}
// report the remaining blocks. // report the remaining blocks.
for (unsigned int i = 0; i < aKeep.size(); ++i) { for (unsigned int i = 0; i < aKeep.size(); ++i) {
if (aKeep[i] == true) { if (aKeep[i] == true) {
......
...@@ -29,7 +29,7 @@ public: ...@@ -29,7 +29,7 @@ public:
// constructor // constructor
BedSubtract(string &bedAFile, string &bedBFile, BedSubtract(string &bedAFile, string &bedBFile,
float overlapFraction, bool sameStrand, float overlapFraction, bool sameStrand,
bool diffStrand, bool removeAll); bool diffStrand, bool removeAll, bool removeAny);
// destructor // destructor
~BedSubtract(void); ~BedSubtract(void);
...@@ -44,6 +44,7 @@ private: ...@@ -44,6 +44,7 @@ private:
bool _sameStrand; bool _sameStrand;
bool _diffStrand; bool _diffStrand;
bool _removeAll; bool _removeAll;
bool _removeAny;
// instances of bed file class. // instances of bed file class.
BedFile *_bedA, *_bedB; BedFile *_bedA, *_bedB;
......
...@@ -44,6 +44,7 @@ int subtract_main(int argc, char* argv[]) { ...@@ -44,6 +44,7 @@ int subtract_main(int argc, char* argv[]) {
bool sameStrand = false; bool sameStrand = false;
bool diffStrand = false; bool diffStrand = false;
bool removeAll = false; bool removeAll = false;
bool removeAny = false;
// check to see if we should print out some help // check to see if we should print out some help
if(argc <= 1) showHelp = true; if(argc <= 1) showHelp = true;
...@@ -94,6 +95,9 @@ int subtract_main(int argc, char* argv[]) { ...@@ -94,6 +95,9 @@ int subtract_main(int argc, char* argv[]) {
else if (PARAMETER_CHECK("-A", 2, parameterLength)) { else if (PARAMETER_CHECK("-A", 2, parameterLength)) {
removeAll = true; removeAll = true;
} }
else if (PARAMETER_CHECK("-N", 2, parameterLength)) {
removeAny = true;
}
else { else {
cerr << endl cerr << endl
<< "*****ERROR: Unrecognized parameter: " << "*****ERROR: Unrecognized parameter: "
...@@ -109,10 +113,14 @@ int subtract_main(int argc, char* argv[]) { ...@@ -109,10 +113,14 @@ int subtract_main(int argc, char* argv[]) {
<< endl << "*****" << endl; << endl << "*****" << endl;
showHelp = true; showHelp = true;
} }
if (removeAny && !haveFraction){
cerr << endl << "*****" << endl
<< "*****WARNING: -N is no different from -A without -f"
<< endl << "*****" << endl;
}
if (sameStrand && diffStrand) { if (sameStrand && diffStrand) {
cerr << endl << "*****" << endl cerr << endl << "*****" << endl
<< "*****ERROR: Request either -s OR -S, not both." << "*****ERROR: Request either -s OR -S, not both."
<< endl << "*****" << endl; << endl << "*****" << endl;
showHelp = true; showHelp = true;
} }
...@@ -121,7 +129,7 @@ int subtract_main(int argc, char* argv[]) { ...@@ -121,7 +129,7 @@ int subtract_main(int argc, char* argv[]) {
BedSubtract *bs = new BedSubtract(bedAFile, bedBFile, BedSubtract *bs = new BedSubtract(bedAFile, bedBFile,
overlapFraction, sameStrand, overlapFraction, sameStrand,
diffStrand, removeAll); diffStrand, removeAll, removeAny);
delete bs; delete bs;
return 0; return 0;
} }
...@@ -156,6 +164,9 @@ void subtract_help(void) { ...@@ -156,6 +164,9 @@ void subtract_help(void) {
cerr << "\t-A\t" << "Remove entire feature if any overlap. That is, by default," << endl; cerr << "\t-A\t" << "Remove entire feature if any overlap. That is, by default," << endl;
cerr << "\t\tonly subtract the portion of A that overlaps B. Here, if" << endl; cerr << "\t\tonly subtract the portion of A that overlaps B. Here, if" << endl;
cerr << "\t\tany overlap is found (or -f amount), the entire feature is removed." << endl << endl; cerr << "\t\tany overlap is found (or -f amount), the entire feature is removed." << endl << endl;
cerr << "\t-N\t" << "Same as -A except when used with -f, the amount is the sum" << endl;
cerr << "\t\tof all features (not any single feature)." << endl << endl;
// end the program here // end the program here
exit(1); exit(1);
......
chr1 0 10
chr1 0 2
chr1 4 6
...@@ -111,3 +111,23 @@ echo \ ...@@ -111,3 +111,23 @@ echo \
$BT subtract -a a.bed -b b.bed -A -f 0.1 > obs $BT subtract -a a.bed -b b.bed -A -f 0.1 > obs
check obs exp check obs exp
rm obs exp rm obs exp
###########################################################
# test -N with -f subtraction
###########################################################
echo " subtract.t9...\c"
echo \
"chr1 0 10" > exp
$BT subtract -a c.bed -b d.bed -N -f 0.4 > obs
check obs exp
rm obs exp
###########################################################
# test -N with -f subtraction
###########################################################
echo " subtract.t10...\c"
touch exp
$BT subtract -a c.bed -b d.bed -N -f 0.39 > obs
check obs exp
rm obs exp
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment