annotateBed.cpp 7.82 KB
Newer Older
1
2
3
4
5
6
7
8
9
/*****************************************************************************
  annotateBed.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

10
  Licenced under the GNU General Public License 2.0 license.
11
12
13
14
******************************************************************************/
#include "lineFileUtilities.h"
#include "annotateBed.h"

Aaron's avatar
Aaron committed
15
// build
16
BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileNames,
Aaron's avatar
Aaron committed
17
18
19
20
21
            const vector<string> &annoTitles, bool forceStrand, bool reportCounts, bool reportBoth) :

    _mainFile(mainFile),
    _annoFileNames(annoFileNames),
    _annoTitles(annoTitles),
22
    _forceStrand(forceStrand),
Aaron's avatar
Aaron committed
23
24
25
26
    _reportCounts(reportCounts),
    _reportBoth(reportBoth)
{
    _bed = new BedFile(_mainFile);
27
}
Aaron's avatar
Aaron committed
28
29
30
31


// destroy and delete the open file pointers
BedAnnotate::~BedAnnotate(void) {
32
    delete _bed;
Aaron's avatar
Aaron committed
33
34
    CloseAnnoFiles();
}
35
36


Aaron's avatar
Aaron committed
37
void BedAnnotate::OpenAnnoFiles() {
38
39
40
41
42
    for (size_t i=0; i < _annoFileNames.size(); ++i) {
        BedFile *file = new BedFile(_annoFileNames[i]);
        file->Open();
        _annoFiles.push_back(file);
    }
Aaron's avatar
Aaron committed
43
44
45
46
}


void BedAnnotate::CloseAnnoFiles() {
47
48
49
50
51
    for (size_t i=0; i < _annoFiles.size(); ++i) {
        BedFile *file = _annoFiles[i];
        delete file;
        _annoFiles[i] = NULL;
    }
52
53
54
}


Aaron's avatar
Aaron committed
55
void BedAnnotate::PrintHeader() {
56
    // print a hash to indicate header and then write a tab
Aaron's avatar
Aaron committed
57
58
59
60
    // for each field in the main file.
    printf("#");
    for (size_t i = 0; i < _bed->bedType; ++i)
        printf("\t");
61

Aaron's avatar
Aaron committed
62
    // now print the label for each file.
63
64
65
66
67
68
69
70
71
72
    if (_reportBoth == false) {
        for (size_t i = 0; i < _annoTitles.size(); ++i)
            printf("%s\t", _annoTitles[i].c_str());
        printf("\n");
    }
    else {
        for (size_t i = 0; i < _annoTitles.size(); ++i)
            printf("%s_cnt\t%s_pct", _annoTitles[i].c_str(), _annoTitles[i].c_str());
        printf("\n");
    }
73
74
75
}


Aaron's avatar
Aaron committed
76
void BedAnnotate::InitializeMainFile() {
77
78
79
80
81
82
83
84
    // process each chromosome
    masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin();
    masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end();
    for (; chromItr != chromEnd; ++chromItr) {
        // for each chrom, process each bin
        binsToBedCovLists::iterator binItr = chromItr->second.begin();
        binsToBedCovLists::iterator binEnd = chromItr->second.end();
        for (; binItr != binEnd; ++binItr) {
Aaron's avatar
Aaron committed
85
            // initialize BEDCOVLIST in this chrom/bin
86
87
88
89
            vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin();
            vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end();
            for (; bedItr != bedEnd; ++bedItr) {
                // initialize the depthMaps, counts, etc. for each anno file.
Aaron's avatar
Aaron committed
90
91
92
93
94
95
                for (size_t i = 0; i < _annoFiles.size(); ++i) {
                    map<unsigned int, DEPTH> dummy;
                    bedItr->depthMapList.push_back(dummy);
                    bedItr->counts.push_back(0);
                    bedItr->minOverlapStarts.push_back(INT_MAX);
                }
96
97
            }
        }
Aaron's avatar
Aaron committed
98
    }
99
100
}

Aaron's avatar
Aaron committed
101
102

void BedAnnotate::AnnotateBed() {
103
104
105
106

    // load the "main" bed file into a map so
    // that we can easily compare each annoFile to it for overlaps
    _bed->loadBedCovListFileIntoMap();
Aaron's avatar
Aaron committed
107
    // open the annotations files for processing;
108
    OpenAnnoFiles();
Aaron's avatar
Aaron committed
109
110
    // initialize counters, depths, etc. for the main file
    InitializeMainFile();
111
112

    // annotate the main file with the coverage from the annotation files.
Aaron's avatar
Aaron committed
113
114
    for (size_t annoIndex = 0; annoIndex < _annoFiles.size(); ++annoIndex) {
        // grab the current annotation file.
115
        BedFile *anno = _annoFiles[annoIndex];
Aaron's avatar
Aaron committed
116
        int lineNum = 0;
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
        BED a, nullBed;
        BedLineStatus bedStatus;

        // process each entry in the current anno file
        while ((bedStatus = anno->GetNextBed(a, lineNum)) != BED_INVALID) {
            if (bedStatus == BED_VALID) {
                _bed->countListHits(a, annoIndex, _forceStrand);
                a = nullBed;
            }
        }
    }

    // report the annotations of the main file from the anno file.
    ReportAnnotations();
    // close the annotations files;
    CloseAnnoFiles();
Aaron's avatar
Aaron committed
133
}
134
135


Aaron's avatar
Aaron committed
136
void BedAnnotate::ReportAnnotations() {
137

Aaron's avatar
Aaron committed
138
139
140
141
    if (_annoTitles.size() > 0) {
        PrintHeader();
    }

142
143
144
145
146
147
148
149
150
    // process each chromosome
    masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin();
    masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end();
    for (; chromItr != chromEnd; ++chromItr) {
        // for each chrom, process each bin
        binsToBedCovLists::const_iterator binItr = chromItr->second.begin();
        binsToBedCovLists::const_iterator binEnd = chromItr->second.end();
        for (; binItr != binEnd; ++binItr) {
            // for each chrom & bin, compute and report
Aaron's avatar
Aaron committed
151
            // the observed coverage for each feature
152
153
154
155
156
157
            vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin();
            vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end();
            for (; bedItr != bedEnd; ++bedItr) {
                // print the main BED entry.
                _bed->reportBedTab(*bedItr);

Aaron's avatar
Aaron committed
158
159
                // now report the coverage from each annotation file.
                for (size_t i = 0; i < _annoFiles.size(); ++i) {
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
                    unsigned int totalLength = 0;
                    int zeroDepthCount = 0; // number of bases with zero depth
                    int depth          = 0; // tracks the depth at the current base

                    // the start is either the first base in the feature OR
                    // the leftmost position of an overlapping feature. e.g. (s = start):
                    // A    ----------
                    // B    s    ------------
                    int start          = min(bedItr->minOverlapStarts[i], bedItr->start);

                    map<unsigned int, DEPTH>::const_iterator depthItr;
                    map<unsigned int, DEPTH>::const_iterator depthEnd;

                    // compute the coverage observed at each base in the feature marching from start to end.
                    for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) {
                        // map pointer grabbing the starts and ends observed at this position
                        depthItr = bedItr->depthMapList[i].find(pos);
Aaron's avatar
Aaron committed
177
                        depthEnd = bedItr->depthMapList[i].end();
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197

                        // increment coverage if starts observed at this position.
                        if (depthItr != depthEnd)
                            depth += depthItr->second.starts;
                        // update zero depth
                        if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0))
                            zeroDepthCount++;
                        // decrement coverage if ends observed at this position.
                        if (depthItr != depthEnd)
                            depth = depth - depthItr->second.ends;
                    }
                    // Summarize the coverage for the current interval,
                    CHRPOS length     = bedItr->end - bedItr->start;
                    totalLength       += length;
                    int nonZeroBases   = (length - zeroDepthCount);
                    float fractCovered = (float) nonZeroBases / length;
                    if (_reportCounts == false && _reportBoth == false)
                        printf("%f\t", fractCovered);
                    else if (_reportCounts == true && _reportBoth == false)
                        printf("%d\t", bedItr->counts[i]);
Aaron's avatar
Aaron committed
198
                    else if (_reportCounts == false && _reportBoth == true)
199
200
201
                        printf("%d\t%f\t", bedItr->counts[i], fractCovered);
                }
                // print newline for next feature.
Aaron's avatar
Aaron committed
202
                printf("\n");
203
204
205
            }
        }
    }
206
207
208
}