bedFile.cpp

/*****************************************************************************
  bedFile.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

  Licensed under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"


/************************************************
Helper functions
*************************************************/
void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks) {

    if (bed.otherFields.size() < 6) {
        cerr << "Input error: Cannot split into blocks. Found interval with fewer than 12 columns on line " << lineNum << "." << endl;
        exit(1);
    }

    int blockCount = atoi(bed.otherFields[3].c_str());
    if ( blockCount <= 0 ) {
        cerr << "Input error: found interval having <= 0 blocks on line " << lineNum << "." << endl;
        exit(1);
    }
    else if ( blockCount == 1 ) {
        //take a short-cut for single blocks
        bedBlocks.push_back(bed);
    }
    else {
        // get the comma-delimited strings for the BED12 block starts and block ends.
        string blockSizes(bed.otherFields[4]);
        string blockStarts(bed.otherFields[5]);

        vector<int> sizes;
        vector<int> starts;
        Tokenize(blockSizes, sizes, ",");
        Tokenize(blockStarts, starts, ",");

        if ( sizes.size() != (size_t) blockCount || starts.size() != (size_t) blockCount ) {
            cerr << "Input error: found interval with block-counts not matching starts/sizes on line " << lineNum << "." << endl;
            exit(1);
        }

        // add each BED block to the bedBlocks vector
        for (UINT i = 0; i < (UINT) blockCount; ++i) {
            CHRPOS blockStart = bed.start + starts[i];
            CHRPOS blockEnd   = bed.start + starts[i] + sizes[i];
            BED currBedBlock(bed.chrom, blockStart, blockEnd, bed.name, bed.score, bed.strand, bed.otherFields);
            bedBlocks.push_back(currBedBlock);
        }
    }
}


/***********************************************
Sorting comparison functions
************************************************/
bool sortByChrom(BED const &a, BED const &b) {
    if (a.chrom < b.chrom) return true;
    else return false;
};

bool sortByStart(const BED &a, const BED &b) {
    if (a.start < b.start) return true;
    else return false;
};

bool sortBySizeAsc(const BED &a, const BED &b) {

    CHRPOS aLen = a.end - a.start;
    CHRPOS bLen = b.end - b.start;

    if (aLen < bLen) return true;
    else return false;
};

bool sortBySizeDesc(const BED &a, const BED &b) {

    CHRPOS aLen = a.end - a.start;
    CHRPOS bLen = b.end - b.start;

    if (aLen > bLen) return true;
    else return false;
};

bool sortByScoreAsc(const BED &a, const BED &b) {
    if (a.score < b.score) return true;
    else return false;
};

bool sortByScoreDesc(const BED &a, const BED &b) {
    if (a.score > b.score) return true;
    else return false;
};

bool byChromThenStart(BED const &a, BED const &b) {

    if (a.chrom < b.chrom) return true;
    else if (a.chrom > b.chrom) return false;

    if (a.start < b.start) return true;
    else if (a.start >= b.start) return false;

    return false;
};


/*******************************************
Class methods
*******************************************/

// Constructor
BedFile::BedFile(string &bedFile)
: bedFile(bedFile),
  _typeIsKnown(false)
{}

// Destructor
BedFile::~BedFile(void) {
}


void BedFile::Open(void) {
    
    _bedFields.reserve(12);
    
    if (bedFile == "stdin" || bedFile == "-") {
        _bedStream = &cin;
    }
    else {
        _bedStream = new ifstream(bedFile.c_str(), ios::in);
        
        if (isGzipFile(_bedStream) == true) {
            delete _bedStream;
            _bedStream = new igzstream(bedFile.c_str(), ios::in);
        }
        // bytes were consumed.  rewind.
        _bedStream->seekg(0);
        
        // can we open the file?
        if ( !(_bedStream->good()) ) {
            cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl;
            exit (1);
        }
    }
}


// Close the BED file
void BedFile::Close(void) {
    if (bedFile != "stdin") delete _bedStream;
}


BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) {

    // make sure there are still lines to process.
    // if so, tokenize, validate and return the BED entry.
    _bedFields.clear();
    if (_bedStream->good()) {
        // parse the bedStream pointer
        getline(*_bedStream, _bedLine);
        lineNum++;

        // split into a string vector.
        Tokenize(_bedLine, _bedFields);

        // load the BED struct as long as it's a valid BED entry.
        return parseLine(bed, _bedFields, lineNum);
    }

    // default if file is closed or EOF
    return BED_INVALID;
}


void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end,
                                 string strand, vector<BED> &hits, bool sameStrand, bool diffStrand) {

    BIN startBin, endBin;
    startBin = (start >> _binFirstShift);
    endBin = ((end-1) >> _binFirstShift);

    // loop through each bin "level" in the binning hierarchy
    for (BINLEVEL i = 0; i < _binLevels; ++i) {

        // loop through each bin at this level of the hierarchy
        BIN offset = _binOffsetsExtended[i];
        for (BIN j = (startBin+offset); j <= (endBin+offset); ++j)  {

            // loop through each feature in this chrom/bin and see if it overlaps
            // with the feature that was passed in.  if so, add the feature to
            // the list of hits.
            vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
            vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();

            for (; bedItr != bedEnd; ++bedItr) {
                // do we have sufficient overlap?
                if (overlaps(bedItr->start, bedItr->end, start, end) > 0) {
                    
                    bool strands_are_same = (strand == bedItr->strand);
                    
                    // test for necessary strandedness
                    if ( (sameStrand == false && diffStrand == false)
                         ||
                         (sameStrand == true && strands_are_same == true)
                         ||
                         (diffStrand == true && strands_are_same == false)
                       )
                    {
                        hits.push_back(*bedItr);
                    }
                }
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
}


bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
                                          bool sameStrand, bool diffStrand, float overlapFraction) {

    BIN startBin, endBin;
    startBin = (start   >> _binFirstShift);
    endBin   = ((end-1) >> _binFirstShift);

    CHRPOS aLength = (end - start);

    // loop through each bin "level" in the binning hierarchy
    for (BINLEVEL i = 0; i < _binLevels; ++i) {

        // loop through each bin at this level of the hierarchy
        BIN offset = _binOffsetsExtended[i];
        for (BIN j = (startBin+offset); j <= (endBin+offset); ++j)  {

            // loop through each feature in this chrom/bin and see if it overlaps
            // with the feature that was passed in.  if so, add the feature to
            // the list of hits.
            vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
            vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
            for (; bedItr != bedEnd; ++bedItr) {

                CHRPOS s = max(start, bedItr->start);
                CHRPOS e = min(end, bedItr->end);
                // the number of overlapping bases b/w a and b
                int overlapBases = (e - s);

                // do we have sufficient overlap?
                if ( (float) overlapBases / (float) aLength  >= overlapFraction) {
                    
                    bool strands_are_same = (strand == bedItr->strand);
                    
                    // test for necessary strandedness
                    if ( (sameStrand == false && diffStrand == false)
                         ||
                         (sameStrand == true && strands_are_same == true)
                         ||
                         (diffStrand == true && strands_are_same == false)
                       )
                    {
                        return true;
                    }
                }
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
    return false;
}


bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
                                                    bool sameStrand, bool diffStrand, float overlapFraction) {

    BIN startBin, endBin;
    startBin = (start >> _binFirstShift);
    endBin = ((end-1) >> _binFirstShift);

    CHRPOS aLength = (end - start);

    // loop through each bin "level" in the binning hierarchy
    for (BINLEVEL i = 0; i < _binLevels; ++i) {

        // loop through each bin at this level of the hierarchy
        BIN offset = _binOffsetsExtended[i];
        for (BIN j = (startBin+offset); j <= (endBin+offset); ++j)  {

            // loop through each feature in this chrom/bin and see if it overlaps
            // with the feature that was passed in.  if so, add the feature to
            // the list of hits.
            vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin();
            vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end();
            for (; bedItr != bedEnd; ++bedItr) {
                CHRPOS s = max(start, bedItr->start);
                CHRPOS e = min(end, bedItr->end);

                // the number of overlapping bases b/w a and b
                int overlapBases = (e - s);

                // do we have sufficient overlap?
                if ( (float) overlapBases / (float) aLength  >= overlapFraction) {
                    CHRPOS bLength = (bedItr->end - bedItr->start);
                    float bOverlap = ( (float) overlapBases / (float) bLength );
                    bool strands_are_same = (strand == bedItr->strand);
                    
                    // test for sufficient reciprocal overlap and strandedness
                    if ( (bOverlap >= overlapFraction) && 
                         ((sameStrand == false && diffStrand == false)
                             ||
                         (sameStrand == true && strands_are_same == true)
                             ||
                         (diffStrand == true && strands_are_same == false))
                    )
                    {
                        return true;
                    }
                }
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
    return false;
}


void BedFile::countHits(const BED &a, bool sameStrand, bool diffStrand, bool countsOnly) {

    BIN startBin, endBin;
    startBin = (a.start >> _binFirstShift);
    endBin = ((a.end-1) >> _binFirstShift);

    // loop through each bin "level" in the binning hierarchy
    for (BINLEVEL i = 0; i < _binLevels; ++i) {

        // loop through each bin at this level of the hierarchy
        BIN offset = _binOffsetsExtended[i];
        for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {

            // loop through each feature in this chrom/bin and see if it overlaps
            // with the feature that was passed in.  if so, add the feature to
            // the list of hits.
            vector<BEDCOV>::iterator bedItr = bedCovMap[a.chrom][j].begin();
            vector<BEDCOV>::iterator bedEnd = bedCovMap[a.chrom][j].end();
            for (; bedItr != bedEnd; ++bedItr) {
                
                bool strands_are_same = (a.strand == bedItr->strand);
                // skip the hit if not on the same strand (and we care)
                if ((sameStrand == true && strands_are_same == false) ||
                    (diffStrand == true && strands_are_same == true)
                   ) 
                {
                    continue;
                }
                else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) {

                    bedItr->count++;
                    if (countsOnly == false) {
                        if (a.zeroLength == false) {
                            bedItr->depthMap[a.start+1].starts++;
                            bedItr->depthMap[a.end].ends++;
                        }
                        else {
                            // correct for the fact that we artificially expanded the zeroLength feature
                            bedItr->depthMap[a.start+2].starts++;
                            bedItr->depthMap[a.end-1].ends++;                        
                        }

                        if (a.start < bedItr->minOverlapStart) {
                            bedItr->minOverlapStart = a.start;
                        }
                    }
                }
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
}


void BedFile::countSplitHits(const vector<BED> &bedBlocks, bool sameStrand, bool diffStrand, bool countsOnly) {

    // set to track the distinct B features that had coverage.
    // we'll update the counts of coverage for these features by one
    // at the end of this function to avoid over-counting.
    set< vector<BEDCOV>::iterator > validHits;

    vector<BED>::const_iterator blockItr  = bedBlocks.begin();
    vector<BED>::const_iterator blockEnd  = bedBlocks.end();
    for (; blockItr != blockEnd; ++blockItr) {

        BIN startBin, endBin;
        startBin = (blockItr->start >> _binFirstShift);
        endBin = ((blockItr->end-1) >> _binFirstShift);

        // loop through each bin "level" in the binning hierarchy
        for (BINLEVEL i = 0; i < _binLevels; ++i) {

            // loop through each bin at this level of the hierarchy
            BIN offset = _binOffsetsExtended[i];
            for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {

                // loop through each feature in this chrom/bin and see if it overlaps
                // with the feature that was passed in.  if so, add the feature to
                // the list of hits.
                vector<BEDCOV>::iterator bedItr = bedCovMap[blockItr->chrom][j].begin();
                vector<BEDCOV>::iterator bedEnd = bedCovMap[blockItr->chrom][j].end();
                for (; bedItr != bedEnd; ++bedItr) {

                    bool strands_are_same = (blockItr->strand == bedItr->strand);
                    // skip the hit if not on the same strand (and we care)
                    if ((sameStrand == true && strands_are_same == false) ||
                        (diffStrand == true && strands_are_same == true)
                       ) 
                    {
                        continue;
                    }
                    else if (overlaps(bedItr->start, bedItr->end, blockItr->start, blockItr->end) > 0) {
                        if (countsOnly == false) {
                            if (blockItr->zeroLength == false) {
                                bedItr->depthMap[blockItr->start+1].starts++;
                                bedItr->depthMap[blockItr->end].ends++;
                            }
                            else {
                                // correct for the fact that we artificially expanded the zeroLength feature
                                bedItr->depthMap[blockItr->start+2].starts++;
                                bedItr->depthMap[blockItr->end-1].ends++;
                            }
                        }

                        validHits.insert(bedItr);
                        if (blockItr->start < bedItr->minOverlapStart)
                            bedItr->minOverlapStart = blockItr->start;
                    }
                }
            }
            startBin >>= _binNextShift;
            endBin >>= _binNextShift;
        }
    }
    // incrment the count of overlap by one for each B feature that overlapped
    // the current passed hit.  This is necessary to prevent over-counting for
    // each "split"" of a single read.
    set< vector<BEDCOV>::iterator >::iterator validHitsItr = validHits.begin();
    set< vector<BEDCOV>::iterator >::iterator validHitsEnd = validHits.end();
    for (; validHitsItr != validHitsEnd; ++validHitsItr)
        // the validHitsItr points to another itr, hence the (*itr)-> dereferencing.
        // ugly, but that's C++.
        (*validHitsItr)->count++;
}


void BedFile::countListHits(const BED &a, int index, bool sameStrand, bool diffStrand) {

    BIN startBin, endBin;
    startBin = (a.start >> _binFirstShift);
    endBin = ((a.end-1) >> _binFirstShift);

    // loop through each bin "level" in the binning hierarchy
    for (BINLEVEL i = 0; i < _binLevels; ++i) {

        // loop through each bin at this level of the hierarchy
        BIN offset = _binOffsetsExtended[i];
        for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) {

            // loop through each feature in this chrom/bin and see if it overlaps
            // with the feature that was passed in.  if so, add the feature to
            // the list of hits.
            vector<BEDCOVLIST>::iterator bedItr = bedCovListMap[a.chrom][j].begin();
            vector<BEDCOVLIST>::iterator bedEnd = bedCovListMap[a.chrom][j].end();
            for (; bedItr != bedEnd; ++bedItr) {

                bool strands_are_same = (a.strand == bedItr->strand);
                // skip the hit if not on the same strand (and we care)
                if ((sameStrand == true && strands_are_same == false) ||
                    (diffStrand == true && strands_are_same == true)
                   ) 
                {
                    continue;
                }
                else if (overlaps(bedItr->start, bedItr->end, a.start, a.end) > 0) {
                    bedItr->counts[index]++;
                    if (a.zeroLength == false) {
                        bedItr->depthMapList[index][a.start+1].starts++;
                        bedItr->depthMapList[index][a.end].ends++;
                    }
                    else {
                        // correct for the fact that we artificially expanded the zeroLength feature
                        bedItr->depthMapList[index][a.start+2].starts++;
                        bedItr->depthMapList[index][a.end-1].ends++;                        
                    }

                    if (a.start < bedItr->minOverlapStarts[index]) {
                        bedItr->minOverlapStarts[index] = a.start;
                    }
                }
            }
        }
        startBin >>= _binNextShift;
        endBin >>= _binNextShift;
    }
}

void BedFile::setZeroBased(bool zeroBased) { this->isZeroBased = zeroBased; }

void BedFile::setGff (bool gff) { this->_isGff = gff; }


void BedFile::setVcf (bool vcf) { this->_isVcf = vcf; }


void BedFile::setFileType (FileType type) {
    _fileType    = type;
    _typeIsKnown = true;
}


void BedFile::setBedType (int colNums) {
    bedType = colNums;
}


void BedFile::loadBedFileIntoMap() {

    BED bedEntry, nullBed;
    int lineNum = 0;
    BedLineStatus bedStatus;

    Open();
    while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            BIN bin = getBin(bedEntry.start, bedEntry.end);
            bedMap[bedEntry.chrom][bin].push_back(bedEntry);
            bedEntry = nullBed;
        }
    }
    Close();
}


void BedFile::loadBedCovFileIntoMap() {

    BED bedEntry, nullBed;
    int lineNum = 0;
    BedLineStatus bedStatus;

    Open();
    while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            BIN bin = getBin(bedEntry.start, bedEntry.end);

            BEDCOV bedCov;
            bedCov.chrom        = bedEntry.chrom;
            bedCov.start        = bedEntry.start;
            bedCov.end          = bedEntry.end;
            bedCov.name         = bedEntry.name;
            bedCov.score        = bedEntry.score;
            bedCov.strand       = bedEntry.strand;
            bedCov.otherFields  = bedEntry.otherFields;
            bedCov.zeroLength   = bedEntry.zeroLength;
            bedCov.count = 0;
            bedCov.minOverlapStart = INT_MAX;

            bedCovMap[bedEntry.chrom][bin].push_back(bedCov);
            bedEntry = nullBed;
        }
    }
    Close();
}

void BedFile::loadBedCovListFileIntoMap() {

    BED bedEntry, nullBed;
    int lineNum = 0;
    BedLineStatus bedStatus;

    Open();
    while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            BIN bin = getBin(bedEntry.start, bedEntry.end);

            BEDCOVLIST bedCovList;
            bedCovList.chrom        = bedEntry.chrom;
            bedCovList.start        = bedEntry.start;
            bedCovList.end          = bedEntry.end;
            bedCovList.name         = bedEntry.name;
            bedCovList.score        = bedEntry.score;
            bedCovList.strand       = bedEntry.strand;
            bedCovList.otherFields  = bedEntry.otherFields;
            bedCovList.zeroLength   = bedEntry.zeroLength;

            bedCovListMap[bedEntry.chrom][bin].push_back(bedCovList);
            bedEntry = nullBed;
        }
    }
    Close();
}


void BedFile::loadBedFileIntoMapNoBin() {

    BED bedEntry, nullBed;
    int lineNum = 0;
    BedLineStatus bedStatus;

    Open();
    while ((bedStatus = this->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            bedMapNoBin[bedEntry.chrom].push_back(bedEntry);
            bedEntry = nullBed;
        }
    }
    Close();

    // sort the BED entries for each chromosome
    // in ascending order of start position
    for (masterBedMapNoBin::iterator m = this->bedMapNoBin.begin(); m != this->bedMapNoBin.end(); ++m) {
        sort(m->second.begin(), m->second.end(), sortByStart);
    }
}