intersectBed.cpp 13.4 KB
Newer Older
1
/*****************************************************************************
2
  intersectBed.cpp
3
4
5
6
7
8
9

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

Aaron's avatar
Aaron committed
10
  Licenced under the GNU General Public License 2.0 license.
11
******************************************************************************/
12
#include "lineFileUtilities.h"
Aaron Quinlan's avatar
Aaron Quinlan committed
13
14
#include "intersectBed.h"

15
16
17
/************************************
Helper functions
************************************/
18
bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
19

20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
    // // how many overlaps are there b/w the bed and the set of hits?
    // CHRPOS s, e;
    // int overlapBases;
    // int  numOverlaps = 0;
    // bool hitsFound   = false;
    // int aLength      = (a.end - a.start);   // the length of a in b.p.
    // 
    // // loop through the hits and report those that meet the user's criteria
    // vector<BED>::const_iterator h       = hits.begin();
    // vector<BED>::const_iterator hitsEnd = hits.end();
    // for (; h != hitsEnd; ++h) {
    //     s            = max(a.start, h->start);
    //     e            = min(a.end, h->end);
    //     overlapBases = (e - s);             // the number of overlapping bases b/w a and b
    // 
    //     // is there enough overlap relative to the user's request? (default ~ 1bp)
    //     if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
    //         // Report the hit if the user doesn't care about reciprocal overlap between A and B.
    //         if (_reciprocal == false) {
    //             hitsFound = true;
    //             numOverlaps++;
    //             if (_printable == true)
    //                 ReportOverlapDetail(overlapBases, a, *h, s, e);
    //         }
    //         // we require there to be sufficient __reciprocal__ overlap
    //         else {
    //             int bLength    = (h->end - h->start);
    //             float bOverlap = ( (float) overlapBases / (float) bLength );
    //             if (bOverlap >= _overlapFraction) {
    //                 hitsFound = true;
    //                 numOverlaps++;
    //                 if (_printable == true)
    //                     ReportOverlapDetail(overlapBases, a, *h, s, e);
    //             }
    //         }
    //     }
    // }
    // // report the summary of the overlaps if requested.
    // ReportOverlapSummary(a, numOverlaps);
    // // were hits found for this BED feature?
    // return hitsFound;
61
    bool hitsFound   = false;
62
    if (_printable == true) {
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
        // -wao the user wants to force the reporting of 0 overlap
        if (hits.size() == 0 && _writeAllOverlap) {
            _bedA->reportBedTab(a);
            _bedB->reportNullBedTab();
            printf("0\n");
        }
        else {
            vector<BED>::const_iterator h       = hits.begin();
            vector<BED>::const_iterator hitsEnd = hits.end();
            for (; h != hitsEnd; ++h) {
                CHRPOS s = max(a.start, h->start);
                CHRPOS e = min(a.end, h->end);
                int overlapBases = (e - s);
                ReportOverlapDetail(overlapBases, a, *h, s, e);
                hitsFound = true;
            }
79
80
        }
    }
81
82
83
    else {
        ReportOverlapSummary(a, hits.size());
    }
84
    return hitsFound;
85
86
}

Aaron Quinlan's avatar
Aaron Quinlan committed
87

88
/*
89
    Constructor
90
*/
91
92
BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
                           bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
93
                           float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand,
94
                           bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam,
95
                           bool sortedInput, bool printHeader) {
96
97
98
99
100
101
102
103
104
105
106

    _bedAFile            = bedAFile;
    _bedBFile            = bedBFile;
    _anyHit              = anyHit;
    _noHit               = noHit;
    _writeA              = writeA;
    _writeB              = writeB;
    _writeOverlap        = writeOverlap;
    _writeAllOverlap     = writeAllOverlap;
    _writeCount          = writeCount;
    _overlapFraction     = overlapFraction;
107
108
    _sameStrand          = sameStrand;
    _diffStrand          = diffStrand;
109
    _reciprocal          = reciprocal;
110
    _obeySplits          = obeySplits;
111
112
    _bamInput            = bamInput;
    _bamOutput           = bamOutput;
Aaron's avatar
Aaron committed
113
    _isUncompressedBam   = isUncompressedBam;
114
    _sortedInput         = sortedInput;
115
    _printHeader         = printHeader;
116

117
118
119
120
121
    // should we print each overlap, or does the user want summary information?
    _printable = true;
    if (_anyHit || _noHit || _writeCount)
        _printable = false;
        
122
123
124
125
    if (_bamInput == false)
        IntersectBed();
    else
        IntersectBam(bedAFile);
Aaron Quinlan's avatar
Aaron Quinlan committed
126
127
}

128

129
/*
130
    Destructor
131
*/
Aaron Quinlan's avatar
Aaron Quinlan committed
132
133
134
135
BedIntersect::~BedIntersect(void) {
}


136
bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
137
    bool hitsFound = false;
138
    
139
    // collect and report the sufficient hits
140
141
142
    _bedB->allHits(a.chrom, a.start, a.end, a.strand,
                   hits, _sameStrand, _diffStrand,
                   _overlapFraction, _reciprocal);
143
    hitsFound = processHits(a, hits);
144
    return hitsFound;
Aaron's avatar
Aaron committed
145
146
147
}


148
void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) {
149
150
151
152
    // default. simple intersection only
    if (_writeA == false && _writeB == false && _writeOverlap == false) {
        _bedA->reportBedRangeNewLine(a,s,e);
    }
153
    //  -wa -wb write the original A and B
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
    else if (_writeA == true && _writeB == true) {
        _bedA->reportBedTab(a);
        _bedB->reportBedNewLine(b);
    }
    // -wa write just the original A
    else if (_writeA == true) {
        _bedA->reportBedNewLine(a);
    }
    // -wb write the intersected portion of A and the original B
    else if (_writeB == true) {
        _bedA->reportBedRangeTab(a,s,e);
        _bedB->reportBedNewLine(b);
    }
    // -wo write the original A and B plus the no. of overlapping bases.
    else if (_writeOverlap == true) {
        _bedA->reportBedTab(a);
        _bedB->reportBedTab(b);
171
172
        if (b.zeroLength == false) printf("%d\n", overlapBases);
        else printf("0\n");
173
    }
Aaron's avatar
Aaron committed
174
}
175

Aaron's avatar
Aaron committed
176
177

void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
178
179
180
181
182
183
184
185
186
187
188
189
190
    // -u  just report the fact that there was >= 1 overlaps
    if (_anyHit && (numOverlapsFound >= 1)) {
        _bedA->reportBedNewLine(a);
    }
    // -c  report the total number of features overlapped in B
    else if (_writeCount) {
        _bedA->reportBedTab(a);
        printf("%d\n", numOverlapsFound);
    }
    // -v  report iff there were no overlaps
    else if (_noHit && (numOverlapsFound == 0)) {
        _bedA->reportBedNewLine(a);
    }
191
}
Aaron Quinlan's avatar
Aaron Quinlan committed
192

193

194
void BedIntersect::IntersectBed() {
Aaron Quinlan's avatar
Aaron Quinlan committed
195

196
197
198
199
200
201
202
203
204
205
206
    // create new BED file objects for A and B
    _bedA = new BedFile(_bedAFile);
    _bedB = new BedFile(_bedBFile);

    if (_sortedInput == false) {
        // load the "B" file into a map in order to
        // compare each entry in A to it in search of overlaps.
        _bedB->loadBedFileIntoMap();

        vector<BED> hits;
        hits.reserve(100);
Aaron's avatar
Aaron committed
207
        BED a;
208
209
210

        // open the "A" file, process each BED entry and searh for overlaps.
        _bedA->Open();
211
212
213
214
        // report A's header first if asked.
        if (_printHeader == true) {
            _bedA->PrintHeader();
        }
Aaron's avatar
Aaron committed
215
216
        while (_bedA->GetNextBed(a)) {
            if (_bedA->_status == BED_VALID) {
217
218
219
                // treat the BED as a single "block"
                if (_obeySplits == false) {
                    FindOverlaps(a, hits);
220
                    hits.clear();
221
222
223
224
                }
                // split the BED12 into blocks and look for overlaps in each discrete block
                else {
                    bedVector bedBlocks;  // vec to store the discrete BED "blocks"
225
                    splitBedIntoBlocks(a,bedBlocks);
226
227
228
229
230
231
232

                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        FindOverlaps(*bedItr, hits);
                        hits.clear();
                    }
233
                }
234
            }
235
        }
236
237
238
239
        _bedA->Close();
    }
    else {
        // use the chromsweep algorithm to detect overlaps on the fly.
240
241
242
243
        ChromSweep sweep = ChromSweep(_bedA, _bedB, 
                                      _sameStrand, _diffStrand, 
                                      _overlapFraction, _reciprocal,
                                      _printHeader);
Aaron's avatar
Aaron committed
244

245
        pair<BED, vector<BED> > hit_set;
Aaron's avatar
Aaron committed
246
        hit_set.second.reserve(10000);
247
248
249
        while (sweep.Next(hit_set)) {
            processHits(hit_set.first, hit_set.second);
        }
250
    }
Aaron Quinlan's avatar
Aaron Quinlan committed
251
252
253
}


254
255
void BedIntersect::IntersectBam(string bamFile) {

256
257
    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
258
    _bedB = new BedFile(_bedBFile);
259
260
    _bedB->loadBedFileIntoMap();

261
262
263
264
265
266
267
    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 6;
    }

268
269
270
    // open the BAM file
    BamReader reader;
    BamWriter writer;
271
    
272
273
274
    reader.Open(bamFile);

    // get header & reference information
275
276
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();
277
278
279

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
280
281
282
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
283
284
285
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
286
287
288
289
290
291
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

292
    
293
    BamAlignment bam;    
294
295
296
297
298
299
300
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
301
            a.end   = bam.GetEndPosition(false, false);
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);

            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                bool overlapsFound = false;
                // treat the BAM alignment as a single "block"
                if (_obeySplits == false) {
317
318
319
                    overlapsFound = _bedB->anyHits(a.chrom, a.start, a.end, 
                                                   a.strand, _sameStrand, _diffStrand,
                                                   _overlapFraction, _reciprocal);
320
321
322
323
                }
                // split the BAM alignment into discrete blocks and
                // look for overlaps only within each block.
                else {
324
                    bool overlapFoundForBlock;
325
326
                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                    // we don't want to split on "D" ops, hence the "false"
327
                    getBamBlocks(bam, refs, bedBlocks, false);
328

329
                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
330
331
                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
332
333
334
                        overlapFoundForBlock = _bedB->anyHits(bedItr->chrom, bedItr->start, bedItr->end, 
                                                              bedItr->strand, _sameStrand, _diffStrand,
                                                              _overlapFraction, _reciprocal);
335
                        if (overlapFoundForBlock == true)
336
                            overlapsFound = true;
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
                    }
                }
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true) {
                        writer.SaveAlignment(bam);
                    }
                }
            }
            else {
                // treat the BAM alignment as a single BED "block"
                if (_obeySplits == false) {
                    FindOverlaps(a, hits);
                    hits.clear();
                }
                // split the BAM alignment into discrete BED blocks and
                // look for overlaps only within each block.
                else {
                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
359
360
361
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
362
363
364
                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                    for (; bedItr != bedEnd; ++bedItr) {
                        FindOverlaps(*bedItr, hits);
365
                        hits.clear();
366
367
368
369
                    }
                }
            }
        }
370
371
372
373
        // BAM IsMapped() is false
        else if (_noHit == true) {
            writer.SaveAlignment(bam);
        }
374
375
376
377
378
379
380
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
381
382
}