Commit e65c98bc authored by arq5x's avatar arq5x
Browse files

Merge branch 'master' of https://github.com/arq5x/bedtools2

parents 237766a7 81f458fb
......@@ -66,7 +66,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/sortBed \
$(SRC_DIR)/spacingFile \
$(SRC_DIR)/split \
$(SRC_DIR)/subtractBed \
$(SRC_DIR)/subtractFile \
$(SRC_DIR)/tagBam \
$(SRC_DIR)/unionBedGraphs \
$(SRC_DIR)/windowBed \
......
bedtools - a swiss army knife for genome arithmetic
===================================================
**Current version**: 2.22.1
**Current version**: 2.23.0
Note
-------
......
......@@ -114,6 +114,11 @@ void closest_help(void) {
cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl;
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\tReports \"none\" for chrom and \"-1\" for all other fields when a feature" << endl;
cerr << "\tis not found in B on the same chromosome as the feature in A." << endl;
......
......@@ -134,6 +134,9 @@ void intersect_help(void) {
cerr << "\t-sortout\t" << "When using multiple databases, sort the output DB hits" << endl;
cerr << "\t\t\tfor each record." << endl << endl;
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
CommonHelp();
cerr << "Notes: " << endl;
......
......@@ -73,6 +73,11 @@ void jaccard_help(void) {
cerr << "\t\t" << "the forward or reverse strand, respectively." << endl;
cerr << "\t\t" << "- By default, merging is done without respect to strand." << endl << endl;
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Input files must be sorted by chrom, then start position."
<< endl << endl;
......
......@@ -84,6 +84,10 @@ void map_help(void) {
cerr << "\t-prec\t" << "Sets the decimal precision for output (Default: 5)" << endl << endl;
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Both input files must be sorted by chrom, then start." << endl << endl;
......
/*****************************************************************************
subtractBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "subtractBed.h"
#include <numeric>
/*
Constructor
*/
BedSubtract::BedSubtract(string &bedAFile, string &bedBFile,
float overlapFraction, bool sameStrand,
bool diffStrand, bool removeAll, bool removeAny)
: _bedAFile(bedAFile)
, _bedBFile(bedBFile)
, _overlapFraction(overlapFraction)
, _sameStrand(sameStrand)
, _diffStrand(diffStrand)
, _removeAll(removeAll)
, _removeAny(removeAny)
{
_bedA = new BedFile(bedAFile);
_bedB = new BedFile(bedBFile);
SubtractBed();
}
/*
Destructor
*/
BedSubtract::~BedSubtract(void) {
}
void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
// find all of the overlaps between a and B.
_bedB->allHits(a.chrom, a.start, a.end, a.strand,
hits, _sameStrand, _diffStrand, 0.0, false);
// is A completely spanned by an entry in B?
// if so, A should not be reported.
int numConsumedByB = 0;
int numOverlaps = 0;
vector<BED> bOverlaps; // list of hits in B. Special proc. if multiple.
vector<BED>::const_iterator h = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; h != hitsEnd; ++h) {
int s = max(a.start, h->start);
int e = min(a.end, h->end);
int overlapBases = (e - s);
int aLength = (a.end - a.start);
if (s < e) {
// is there enough overlap (default ~ 1bp)
float overlap = ((float) overlapBases / (float) aLength);
if (overlap >= 1.0) {
numOverlaps++;
numConsumedByB++;
}
else if ( overlap >= _overlapFraction || _removeAny) {
numOverlaps++;
bOverlaps.push_back(*h);
}
}
}
if (numOverlaps == 0) {
// no overlap found, so just report A as-is.
_bedA->reportBedNewLine(a);
}
else if ((numOverlaps == 1) && (!_removeAny)) {
// one overlap found. only need to look at the single
// entry in bOverlaps.
// if A was not "consumed" by any entry in B
if ((numConsumedByB == 0) && (_removeAll == false)) {
BED theHit = bOverlaps[0];
// A ++++++++++++
// B ----
// Res. ==== ====
if ( (theHit.start > a.start) && (theHit.end < a.end) ) {
_bedA->reportBedRangeNewLine(a,a.start,theHit.start);
_bedA->reportBedRangeNewLine(a,theHit.end,a.end);
}
// A ++++++++++++
// B ----------
// Res. ==
else if (theHit.start == a.start) {
_bedA->reportBedRangeNewLine(a,theHit.end,a.end);
}
// A ++++++++++++
// B ----------
// Res. ====
else if (theHit.start < a.start) {
_bedA->reportBedRangeNewLine(a,theHit.end,a.end);
}
// A ++++++++++++
// B ----------
// Res. =======
else if (theHit.start > a.start) {
_bedA->reportBedRangeNewLine(a,a.start,theHit.start);
}
}
}
else if ((numOverlaps > 1) || _removeAny) {
// multiple overlapz found. look at all the hits
// and figure out which bases in A survived. then
// report the contigous intervals that survived.
vector<bool> aKeep(a.end - a.start, true);
if (numConsumedByB == 0) {
if(_removeAll){ return; }
// if there's any overlap, then we don't report.
// track the number of hit starts and ends at each position in A
for (vector<BED>::iterator h = bOverlaps.begin();
h != bOverlaps.end();
++h)
{
int s = max(a.start, h->start);
int e = min(a.end, h->end);
for (int i = s+1; i <= e; ++i) {
aKeep[i-a.start-1] = false;
}
}
if (_removeAny){
int asum = std::accumulate(aKeep.rbegin(), aKeep.rend(), 0);
float afrac = 1.0 - (float)asum / aKeep.size();
if(afrac > _overlapFraction){ return ; }
for (unsigned int i = 0; i < aKeep.size(); ++i) {
aKeep[i] = true;
}
}
// report the remaining blocks.
for (unsigned int i = 0; i < aKeep.size(); ++i) {
if (aKeep[i] == true) {
CHRPOS blockStart = i + a.start;
while ((aKeep[i] == true) && (i < aKeep.size())) {
i++;
}
CHRPOS blockEnd = i + a.start;
blockEnd = min(a.end, blockEnd);
_bedA->reportBedRangeNewLine(a,blockStart,blockEnd);
}
}
}
}
}
void BedSubtract::SubtractBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bedB->loadBedFileIntoMap();
BED a;
vector<BED> hits;
// reserve some space
hits.reserve(100);
_bedA->Open();
while (_bedA->GetNextBed(a)) {
if (_bedA->_status == BED_VALID) {
FindAndSubtractOverlaps(a, hits);
hits.clear();
}
}
_bedA->Close();
}
// END Intersect
/*****************************************************************************
subtractBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef SUBTRACTBED_H
#define SUBTRACTBED_H
#include "bedFile.h"
#include <vector>
#include <iostream>
#include <fstream>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedSubtract {
public:
// constructor
BedSubtract(string &bedAFile, string &bedBFile,
float overlapFraction, bool sameStrand,
bool diffStrand, bool removeAll, bool removeAny);
// destructor
~BedSubtract(void);
private:
// processing variables
string _bedAFile;
string _bedBFile;
float _overlapFraction;
bool _sameStrand;
bool _diffStrand;
bool _removeAll;
bool _removeAny;
// instances of bed file class.
BedFile *_bedA, *_bedB;
// methods
void FindAndSubtractOverlaps(BED &a, vector<BED> &hits);
void SubtractBed();
};
#endif /* SUBTRACTBED_H */
......@@ -5,18 +5,31 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= subtractMain.cpp subtractBed.cpp subtractBed.h
OBJECTS= subtractMain.o subtractBed.o
SOURCES= subtractMain.cpp subtractFile.cpp subtractFile.h
OBJECTS= subtractMain.o subtractFile.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= subtractFile
all: $(BUILT_OBJECTS)
......@@ -24,10 +37,10 @@ all: $(BUILT_OBJECTS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/subtractMain.o $(OBJ_DIR)/subtractBed.o
@rm -f $(OBJ_DIR)/subtractMain.o $(OBJ_DIR)/subtractFile.o
.PHONY: clean
/*
* subtractFile.cpp
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
/*****************************************************************************
subtractFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "subtractFile.h"
#include "ContextSubtract.h"
#include "FileRecordMgr.h"
#include "NewChromsweep.h"
#include "BinTree.h"
#include "RecordOutputMgr.h"
#include <numeric> //for std::accumulate
SubtractFile::SubtractFile(ContextSubtract *context)
: _context(context),
_blockMgr(NULL),
_recordOutputMgr(NULL),
_tmpBlocksMgr(NULL)
{
_recordOutputMgr = new RecordOutputMgr();
_recordOutputMgr->init(_context);
if (_context->getObeySplits()) {
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
_recordOutputMgr->setSplitInfo(_blockMgr);
}
_tmpBlocksMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
}
SubtractFile::~SubtractFile(void) {
delete _blockMgr;
_blockMgr = NULL;
delete _recordOutputMgr;
delete _tmpBlocksMgr;
}
void SubtractFile::processHits(RecordKeyVector &hits) {
_recordOutputMgr->printRecord(hits);
}
bool SubtractFile::subtractFiles()
{
if (_context->getSortedInput()) {
return processSortedFiles();
}
return processUnsortedFiles();
}
bool SubtractFile::processSortedFiles()
{
// use the chromsweep algorithm to detect overlaps on the fly.
NewChromSweep sweep(_context);
if (!sweep.init()) {
return false;
}
RecordKeyVector hitSet;
while (sweep.next(hitSet)) {
if (_context->getObeySplits()) {
RecordKeyVector keySet(hitSet.getKey());
RecordKeyVector resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
subtractHits(resultSet);
} else {
subtractHits(hitSet);
}
}
if (!_context->hasGenomeFile()) {
sweep.closeOut(true);
}
return true;
}
bool SubtractFile::processUnsortedFiles()
{
BinTree *binTree = new BinTree( _context);
binTree->loadDB();
FileRecordMgr *queryFRM = _context->getFile(_context->getQueryFileIdx());
while (!queryFRM->eof()) {
Record *queryRecord = queryFRM->getNextRecord();
if (queryRecord == NULL) {
continue;
}
RecordKeyVector hitSet(queryRecord);
binTree->getHits(queryRecord, hitSet);
if (_context->getObeySplits()) {
RecordKeyVector keySet(hitSet.getKey());
RecordKeyVector resultSet;
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
subtractHits(resultSet);
} else {
subtractHits(hitSet);
}
queryFRM->deleteRecord(queryRecord);
}
//clean up.
delete binTree;
return true;
}
void SubtractFile::subtractHits(RecordKeyVector &hits) {
if (hits.empty()) {
// no intersection, nothing to subtract.
// just copy key to hits as if it were a
// self-intersection. This is just for reporting
// purposes.
hits.push_back(hits.getKey());
processHits(hits);
return;
}
if (_context->getRemoveAll() && _context->getSubtractFraction() == 0.0) {
// hits aren't empty, meaning there is intersection,
// so we want to not report the hit.
hits.clearAll();
return;
}
//loop through hits. Track which bases in query were covered
const Record *keyRec = hits.getKey();
int keyStart = keyRec->getStartPos();
int keyEnd = keyRec->getEndPos();
//this vector of bools will represent the bases of the query.
//for each base, true means uncovered, false means covered.
//they begin as all uncovered.
vector<bool> keyBases(keyEnd - keyStart, true);
//now loop through the hits, and cover corresponding query bases
//by setting them to false.
bool basesRemoved = false;
for (RecordKeyVector::const_iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
const Record *hitRec = *iter;
int hitStart = hitRec->getStartPos();
int hitEnd = hitRec->getEndPos();
int startIdx = max(keyStart, hitStart) - keyStart;
int endIdx = min(keyEnd, hitEnd) - keyStart;
int keyLen = keyEnd - keyStart;
int coveredLen = endIdx - startIdx;
float coveragePct = (float)coveredLen / (float)keyLen;
//for each base in the hit, set the base in the query to false.
//this effectively "erases" the covered bits. Only do
if (_context->getRemoveSum() || coveragePct >= _context->getSubtractFraction()) {
std::fill(keyBases.begin() + startIdx, keyBases.begin() + endIdx, false);
basesRemoved = true;
}
}
if (!basesRemoved) {
//treat as if there were no intersection
hits.clearVector();
hits.push_back(hits.getKey());
processHits(hits);
return;
} else if (_context->getRemoveAll()) {
hits.clearAll();
return;
}
// if the -N option is used ( removeSum), do not report if the percentage of
// uniquely covered bases exceeds the overlap fraction.
if (_context->getRemoveSum()) {
//determine how many bases are left uncovered.
int numBasesUncovered = std::accumulate(keyBases.begin(), keyBases.end(), 0);
//determine percentage that are covered.
float pctCovered = 1.0 - (float)numBasesUncovered / (float)(keyEnd - keyStart);
if (pctCovered > _context->getSubtractFraction()) {
hits.clearAll();
return;
} else {
hits.clearVector();
hits.push_back(hits.getKey());
}
processHits(hits);
return;
}
//now make "blocks" out of the query's remaining stretches of
//uncovered bases.
RecordKeyVector tempHits(keyRec);
for (int i = 0; i < (int)keyBases.size(); i++) {
if (keyBases[i] == true) {
int blockStart = keyStart + i;
while (keyBases[i] == true && i < (int)keyBases.size()) {
i++;
}
int blockEnd = min(keyStart + i, keyEnd);
tempHits.push_back(_tmpBlocksMgr->allocateAndAssignRecord(keyRec, blockStart, blockEnd));
}
}
processHits(tempHits);
_tmpBlocksMgr->deleteBlocks(tempHits);
}
/*
* subtractFile.h
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
#ifndef SUBTRACTFILE_H_
#define SUBTRACTFILE_H_
#include "RecordKeyVector.h"
#include "BlockMgr.h"
using namespace std;
class ContextSubtract;
class BlockMgr;
class RecordOutputMgr;
class BlockMgr;
class SubtractFile {