closestBed.cpp 9.23 KB
Newer Older
Aaron's avatar
Aaron committed
1
2
3
4
5
6
7
8
9
/*****************************************************************************
  closestBed.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

Aaron's avatar
Aaron committed
10
  Licenced under the GNU General Public License 2.0 license.
Aaron's avatar
Aaron committed
11
******************************************************************************/
12
#include "lineFileUtilities.h"
Aaron Quinlan's avatar
Aaron Quinlan committed
13
14
15
#include "closestBed.h"

const int MAXSLOP = 256000000;  // 2*MAXSLOP = 512 megabases.
16
17
                                // We don't want to keep looking if we
                                // can't find a nearby feature within 512 Mb.
Aaron Quinlan's avatar
Aaron Quinlan committed
18
19
20
21
const int SLOPGROWTH = 2048000;


/*
22
    Constructor
Aaron Quinlan's avatar
Aaron Quinlan committed
23
*/
Aaron's avatar
Aaron committed
24
BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
25
                       string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode,
26
                       bool ignoreOverlaps, bool ignoreUpstream, bool ignoreDownstream, bool printHeader) 
27
28
29
    : _bedAFile(bedAFile)
    , _bedBFile(bedBFile)
    , _tieMode(tieMode)
Aaron's avatar
Aaron committed
30
31
    , _sameStrand(sameStrand)
    , _diffStrand(diffStrand)
32
    , _reportDistance(reportDistance)
33
    , _signDistance(signDistance)
34
    , _strandedDistMode(_strandedDistMode)
35
    , _ignoreOverlaps(ignoreOverlaps)
36
37
    , _ignoreUpstream(ignoreUpstream)
    , _ignoreDownstream(ignoreDownstream)
38
    , _printHeader(printHeader)
39
40
41
{
    _bedA           = new BedFile(_bedAFile);
    _bedB           = new BedFile(_bedBFile);
42
    FindClosestBed();
Aaron Quinlan's avatar
Aaron Quinlan committed
43
44
}

45

Aaron Quinlan's avatar
Aaron Quinlan committed
46
/*
47
    Destructor
Aaron Quinlan's avatar
Aaron Quinlan committed
48
49
50
51
52
53
54
*/
BedClosest::~BedClosest(void) {
}


void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) {

55
56
57
58
    int slop = 0;  // start out just looking for overlaps
                   // within the current bin (~128Kb)

    // update the current feature's start and end
Aaron Quinlan's avatar
Aaron Quinlan committed
59

60
61
62
63
    CHRPOS aFudgeStart = 0;
    CHRPOS aFudgeEnd;
    int numOverlaps = 0;
    vector<BED> closestB;
64
65
    CHRPOS minDistance = INT_MAX;
    int32_t curDistance = INT_MAX;
66
    vector<int32_t> distances;
Aaron Quinlan's avatar
Aaron Quinlan committed
67

Aaron's avatar
Aaron committed
68
69
    // is there at least one feature in B on the same chrom
    // as the current A feature?
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
    if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) {

        while ((numOverlaps == 0) && (slop <= MAXSLOP)) {

            // add some slop (starting at 0 bases) to a in hopes
            // of finding a hit in B
            if ((static_cast<int>(a.start) - slop) > 0)
                aFudgeStart = a.start - slop;
            else
                aFudgeStart = 0;

            if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP))
                aFudgeEnd = a.end + slop;
            else
                aFudgeEnd = 2 * MAXSLOP;

            // THE HEAVY LIFTING
            // search for hits with the current slop added
88
89
            _bedB->allHits(a.chrom, aFudgeStart, aFudgeEnd, a.strand, 
                           hits, _sameStrand, _diffStrand, 0.0, false);
90
91
92
93
94
95
96
97
98

            vector<BED>::const_iterator h = hits.begin();
            vector<BED>::const_iterator hitsEnd = hits.end();
            for (; h != hitsEnd; ++h) {

                // do the actual features overlap?
                int s = max(a.start, h->start);
                int e = min(a.end, h->end);
                int overlapBases = (e - s);             // the number of overlapping bases b/w a and b
99
100
101
102
103
104
105
106
107
108
109

                // make sure we allow overlapping features.
                if ((overlapBases > 0) && (_ignoreOverlaps == true))
                    continue;
                else
                    numOverlaps++;

                // there is overlap. make sure we allow overlapping features ()
                if (overlapBases > 0) {
                    closestB.push_back(*h);
                    distances.push_back(0);
110
111
                }
                // the hit is to the "left" of A
112
                else if (h->end <= a.start) {
113
114
115
116
117
                    curDistance = a.start - h->end;
                    if (_signDistance) {
                        if ((_strandedDistMode == "ref")
                                || (_strandedDistMode == "a" && a.strand != "-")
                                || (_strandedDistMode == "b" && h->strand == "-")) {
118
                            // hit is "upstream" of A
119
120
                            if (_ignoreUpstream) {
                                numOverlaps--;
121
                                continue;
122
123
                            }
                            else {
124
                                curDistance = -curDistance;
125
                            }
126
                        }
127
128
                        else if (_ignoreDownstream) {
                            numOverlaps--;
129
                            continue;
130
                        }
131
132
133
134
135
                    }
                    
                    if (abs(curDistance) < minDistance) {
                        minDistance = abs(curDistance);
                        
136
137
138
                        closestB.clear();
                        closestB.push_back(*h);
                        distances.clear();
139
                        distances.push_back(curDistance);
140
                    }
141
142
                    else if (abs(curDistance) == minDistance) {
                        minDistance = abs(curDistance);
143
                        closestB.push_back(*h);
144
                        distances.push_back(curDistance);
145
146
147
                    }
                }
                // the hit is to the "right" of A
148
                else if (h->start >= a.end) {
149
150
151
152
                    curDistance = h->start - a.end;
                    if (_signDistance) {
                        if ((_strandedDistMode == "a" && a.strand == "-")
                                || (_strandedDistMode == "b" && h->strand != "-")) {
153
                            // hit is "upstream" of A
154
155
                            if (_ignoreUpstream) {
                                numOverlaps--;
156
                                continue;
157
158
                            }
                            else{
159
                                curDistance = -curDistance;
160
                            }
161
                        }
162
163
                        else if (_ignoreDownstream){
                            numOverlaps--;
164
                            continue;
165
                        }
166
167
                    }
                    if (abs(curDistance) < minDistance) {
168
                        minDistance = abs(curDistance);
169
170
171
                        closestB.clear();
                        closestB.push_back(*h);
                        distances.clear();
172
                        distances.push_back(curDistance);
173
                    }
174
                    else if (abs(curDistance) == minDistance) {
175
                        minDistance = abs(curDistance);
176
                        closestB.push_back(*h);
177
                        distances.push_back(curDistance);
178
179
180
181
182
183
184
185
186
187
188
189
190
191
                    }
                }
            }
            // if no overlaps were found, we'll widen the range
            // by SLOPGROWTH in each direction and search again.
            slop += SLOPGROWTH;
        }
    }
    // there is no feature in B on the same chromosome as A
    else {
        _bedA->reportBedTab(a);
        if (_reportDistance == true) {
            _bedB->reportNullBedTab();
            cout << -1 << endl;
192
193
        }
        else
194
195
            _bedB->reportNullBedNewLine();
    }
Aaron Quinlan's avatar
Aaron Quinlan committed
196

197
198
    // report the closest feature(s) in B to the current A feature.
    // obey the user's reporting request (_tieMode)
199
    if (numOverlaps > 0) {
200
        if (closestB.size() == 1 || (_tieMode == "first" && closestB.size() > 0)) {
201
202
203
204
            _bedA->reportBedTab(a);
            if (_reportDistance == true) {
                _bedB->reportBedTab(closestB[0]);
                cout << distances[0] << endl;
205
206
            }
            else
207
208
209
210
                _bedB->reportBedNewLine(closestB[0]);
        }
        else {
            if (_tieMode == "all") {
211
                size_t i = 0;
212
213
214
215
216
                for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) {
                    _bedA->reportBedTab(a);
                    if (_reportDistance == true) {
                        _bedB->reportBedTab(*b);
                        cout << distances[i++] <<endl;
217
218
                    }
                    else
219
220
221
                        _bedB->reportBedNewLine(*b);
                }
            }
222
            else if (_tieMode == "last" && closestB.size() > 0) {
223
224
225
226
                _bedA->reportBedTab(a);
                if (_reportDistance == true) {
                    _bedB->reportBedTab(closestB[closestB.size()-1]);
                    cout << distances[distances.size() - 1]<<endl;
227
228
229
                }
                else
                    _bedB->reportBedNewLine(closestB[closestB.size()-1]);
230
231
232
            }
        }
    }
Aaron Quinlan's avatar
Aaron Quinlan committed
233
234
}

235

236
void BedClosest::FindClosestBed() {
Aaron Quinlan's avatar
Aaron Quinlan committed
237

238
239
240
241
    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

242
    BED a;
243
244
245
246
    vector<BED> hits;                   // vector of potential hits
    hits.reserve(100);

    _bedA->Open();
247
248
249
250
    // report A's header first if asked.
    if (_printHeader == true) {
        _bedA->PrintHeader();
    }
251
    // process each entry in A in search of the closest feature in B
Aaron's avatar
Aaron committed
252
253
    while (_bedA->GetNextBed(a)) {
        if (_bedA->_status == BED_VALID) {
254
255
256
257
258
            FindWindowOverlaps(a, hits);
            hits.clear();
        }
    }
    _bedA->Close();
Aaron's avatar
Aaron committed
259
260
}
// END ClosestBed
Aaron Quinlan's avatar
Aaron Quinlan committed
261