Commit e2be0789 authored by nkindlon's avatar nkindlon
Browse files

Added jaccard w/ -split and unit tests. Fixed Bam input as bam output bug,...

Added jaccard w/ -split and unit tests. Fixed Bam input as bam output bug, added unit test in sample.
parent bbda1f41
/*****************************************************************************
Jaccard.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "Jaccard.h"
#include "BlockMgr.h"
#include "NewChromsweep.h"
Jaccard::Jaccard(ContextJaccard *context)
: _context(context),
_intersectionVal(0),
_unionVal(0),
_numIntersections(0)
{
_blockMgr = new BlockMgr(_context->getOverlapFraction(), _context->getReciprocal());
}
Jaccard::~Jaccard(void) {
delete _blockMgr;
_blockMgr = NULL;
}
bool Jaccard::calculate() {
if (!getIntersectionAndUnion()) {
return false;
}
// header
cout << "intersection\tunion-intersection\tjaccard\tn_intersections" << endl;
unsigned long adjustedUnion = _unionVal - _intersectionVal;
cout << _intersectionVal << "\t" << adjustedUnion << "\t" <<
(float) _intersectionVal / (float)adjustedUnion << "\t" << _numIntersections << endl;
return true;
}
bool Jaccard::getIntersectionAndUnion() {
NewChromSweep sweep(_context);
if (!sweep.init()) {
return false;
}
RecordKeyList hitSet;
while (sweep.next(hitSet)) {
if (_context->getObeySplits()) {
RecordKeyList keySet(hitSet.getKey());
RecordKeyList resultSet(hitSet.getKey());
_blockMgr->findBlockedOverlaps(keySet, hitSet, resultSet);
_intersectionVal += getTotalIntersection(&resultSet);
} else {
_intersectionVal += getTotalIntersection(&hitSet);
}
}
sweep.closeOut();
unsigned long queryUnion = sweep.getQueryTotalRecordLength();
unsigned long dbUnion = sweep.getDatabaseTotalRecordLength();
_unionVal = queryUnion + dbUnion;
return true;
}
unsigned long Jaccard::getTotalIntersection(RecordKeyList *recList)
{
unsigned long intersection = 0;
const Record *key = recList->getKey();
int keyStart = key->getStartPos();
int keyEnd = key->getEndPos();
int hitIdx = 0;
for (RecordKeyList::const_iterator_type iter = recList->begin(); iter != recList->end(); iter = recList->next()) {
const Record *currRec = iter->value();
int maxStart = max(currRec->getStartPos(), keyStart);
int minEnd = min(currRec->getEndPos(), keyEnd);
if (_context->getObeySplits()) {
intersection += _blockMgr->getOverlapBases(hitIdx);
hitIdx++;
} else {
intersection += (unsigned long)(minEnd - maxStart);
}
}
_numIntersections += (int)recList->size();
return intersection;
}
......@@ -12,59 +12,28 @@
#ifndef JACCARD_H
#define JACCARD_H
#include "bedFile.h"
#include "chromsweep.h"
#include "api/BamReader.h"
#include "api/BamAux.h"
#include "BlockedIntervals.h"
#include "BamAncillary.h"
using namespace BamTools;
#include <vector>
#include <iostream>
#include <fstream>
#include <stdlib.h>
using namespace std;
#include "ContextJaccard.h"
class BlockMgr;
class Jaccard {
public:
// constructor
Jaccard(string bedAFile, string bedBFile,
float overlapFraction, bool reciprocal,
bool valueOnly);
Jaccard(ContextJaccard *context);
~Jaccard();
// destructor
~Jaccard(void);
bool calculate();
private:
//------------------------------------------------
// private attributes
//------------------------------------------------
string _bedAFile;
string _bedBFile;
bool _reciprocal;
bool _valueOnly;
float _overlapFraction;
// instance of a bed file class.
BedFile *_bedA, *_bedB;
//------------------------------------------------
// private methods
//------------------------------------------------
unsigned long GetIntersection(size_t &n_intersections);
unsigned long GetUnion();
void CalculateJaccard();
size_t GetTotalIntersection(const BED &a, const vector<BED> &hits);
ContextJaccard *_context;
BlockMgr *_blockMgr;
unsigned long _intersectionVal;
unsigned long _unionVal;
int _numIntersections;
bool getIntersectionAndUnion();
unsigned long getTotalIntersection(RecordKeyList *hits);
};
#endif /* JACCARD_H */
......@@ -5,24 +5,33 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/fileType/ \
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/chromsweep \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= jaccardMain.cpp jaccard.cpp jaccard.h
OBJECTS= jaccardMain.o jaccard.o
SOURCES= jaccardMain.cpp Jaccard.cpp Jaccard.h
OBJECTS= jaccardMain.o Jaccard.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= jaccard
PROGRAM= Jaccard
all: $(BUILT_OBJECTS)
......@@ -34,6 +43,6 @@ $(BUILT_OBJECTS): $(SOURCES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/jaccardMain.o $(OBJ_DIR)/jaccard.o
@rm -f $(OBJ_DIR)/jaccardMain.o $(OBJ_DIR)/Jaccard.o
.PHONY: clean
/*****************************************************************************
jaccard.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "jaccard.h"
/************************************
Helper functions
************************************/
size_t Jaccard::GetTotalIntersection(const BED &a, const vector<BED> &hits)
{
size_t intersection = 0;
vector<BED>::const_iterator h = hits.begin();
vector<BED>::const_iterator hitsEnd = hits.end();
for (; h != hitsEnd; ++h) {
CHRPOS s = max(a.start, h->start);
CHRPOS e = min(a.end, h->end);
intersection += (e - s);
}
return intersection;
}
/*
Constructor
*/
Jaccard::Jaccard(string bedAFile, string bedBFile,
float overlapFraction, bool reciprocal,
bool valueOnly)
{
_bedAFile = bedAFile;
_bedBFile = bedBFile;
_overlapFraction = overlapFraction;
_reciprocal = reciprocal;
_valueOnly = valueOnly;
CalculateJaccard();
}
/*
Destructor
*/
Jaccard::~Jaccard(void) {
}
unsigned long Jaccard::GetIntersection(size_t &n_intersections) {
_bedA = new BedFile(_bedAFile);
_bedB = new BedFile(_bedBFile);
unsigned long I = 0;
ChromSweep sweep = ChromSweep(_bedA, _bedB,
false, false,
_overlapFraction, _reciprocal,
true, false);
pair<BED, vector<BED> > hit_set;
hit_set.second.reserve(10000);
while (sweep.Next(hit_set)) {
I += GetTotalIntersection(hit_set.first, hit_set.second);
n_intersections += hit_set.second.size();
}
return I;
}
void Jaccard::CalculateJaccard() {
size_t n_intersections = 0;
unsigned long I = GetIntersection(n_intersections);
unsigned long U = _bedA->getTotalFlattenedLength() + \
_bedB->getTotalFlattenedLength();
float jaccard_stat = (float) I / ((float) U - (float) I);
if (!_valueOnly) {
// header
cout << "intersection\t"
<< "union-intersection\t"
<< "jaccard\t"
<< "n_intersections"
<< endl;
// result
cout << I << "\t"
<< U - I << "\t"
<< jaccard_stat << "\t"
<< n_intersections
<< endl;
}
else {
cout << jaccard_stat << endl;
}
}
......@@ -9,124 +9,39 @@
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "jaccard.h"
#include "Jaccard.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools jaccard"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void jaccard_help(void);
int jaccard_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
// input arguments
float overlapFraction = 1E-9;
bool haveBedA = false;
bool haveBedB = false;
bool haveFraction = false;
bool reciprocalFraction = false;
bool valueOnly = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
ContextJaccard *context = new ContextJaccard();
if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {
if (!context->getErrorMsg().empty()) {
cerr << context->getErrorMsg() << endl;
}
jaccard_help();
delete context;
return 0;
}
Jaccard *jaccard = new Jaccard(context);
if(showHelp) jaccard_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
if ((i+1) < argc) {
haveFraction = true;
overlapFraction = atof(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
reciprocalFraction = true;
}
else if(PARAMETER_CHECK("-valueOnly", 10, parameterLength)) {
valueOnly = true;
}
else {
cerr << endl
<< "*****ERROR: Unrecognized parameter: "
<< argv[i]
<< " *****"
<< endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (reciprocalFraction && !haveFraction) {
cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
Jaccard *j = new Jaccard(bedAFile, bedBFile,
overlapFraction, reciprocalFraction,
valueOnly);
delete j;
return 0;
}
else {
jaccard_help();
return 0;
}
bool retVal = jaccard->calculate();
delete jaccard;
delete context;
return retVal ? 0 : 1;
}
void jaccard_help(void) {
cerr << "\nTool: bedtools jaccard (aka jaccard)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Calculate Jaccard statistic b/w two feature files."
<< endl
<< "\t Jaccard is the length of the intersection over the union."
......@@ -146,12 +61,122 @@ void jaccard_help(void) {
cerr << "\t-r\t" << "Require that the fraction overlap be reciprocal for A and B." << endl;
cerr << "\t\t- In other words, if -f is 0.90 and -r is used, this requires" << endl;
cerr << "\t\t that B overlap 90% of A and A _also_ overlaps 90% of B." << endl << endl;
cerr << "\t-split\t" << "Treat \"split\" BAM or BED12 entries as distinct BED intervals." << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) Input files must be sorted by chrom, then start position."
cerr << "\t(1) Input files must be sorted by chrom, then start position."
<< endl << endl;
// end the program here
exit(1);
}
//// define our program name
//
//
//// define our parameter checking macro
//#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
//
//// function declarations
//void jaccard_help(void);
//
//int jaccard_main(int argc, char* argv[]) {
//
// // our configuration variables
// bool showHelp = false;
//
// // input files
// string bedAFile;
// string bedBFile;
//
// // input arguments
// float overlapFraction = 1E-9;
//
// bool haveBedA = false;
// bool haveBedB = false;
// bool haveFraction = false;
// bool reciprocalFraction = false;
// bool valueOnly = false;
//
// // check to see if we should print out some help
// if(argc <= 1) showHelp = true;
//
// for(int i = 1; i < argc; i++) {
// int parameterLength = (int)strlen(argv[i]);
//
// if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
// (PARAMETER_CHECK("--help", 5, parameterLength))) {
// showHelp = true;
// }
// }
//
// if(showHelp) jaccard_help();
//
// // do some parsing (all of these parameters require 2 strings)
// for(int i = 1; i < argc; i++) {
//
// int parameterLength = (int)strlen(argv[i]);
//
// if(PARAMETER_CHECK("-a", 2, parameterLength)) {
// if ((i+1) < argc) {
// haveBedA = true;
// bedAFile = argv[i + 1];
// i++;
// }
// }
// else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
// if ((i+1) < argc) {
// haveBedB = true;
// bedBFile = argv[i + 1];
// i++;
// }
// }
// else if(PARAMETER_CHECK("-f", 2, parameterLength)) {
// if ((i+1) < argc) {
// haveFraction = true;
// overlapFraction = atof(argv[i + 1]);
// i++;
// }
// }
// else if(PARAMETER_CHECK("-r", 2, parameterLength)) {
// reciprocalFraction = true;
// }
// else if(PARAMETER_CHECK("-valueOnly", 10, parameterLength)) {
// valueOnly = true;
// }
// else {
// cerr << endl
// << "*****ERROR: Unrecognized parameter: "
// << argv[i]
// << " *****"
// << endl << endl;
// showHelp = true;
// }
// }
//
// // make sure we have both input files
// if (!haveBedA || !haveBedB) {
// cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
// showHelp = true;
// }
//
// if (reciprocalFraction && !haveFraction) {
// cerr << endl << "*****" << endl << "*****ERROR: If using -r, you need to define -f." << endl << "*****" << endl;
// showHelp = true;
// }
//
// if (!showHelp) {
//
// Jaccard *j = new Jaccard(bedAFile, bedBFile,
// overlapFraction, reciprocalFraction,
// valueOnly);
// delete j;
// return 0;
// }
// else {
// jaccard_help();
// return 0;
// }
//}
/*
* ContextJaccard.cpp
*
* Created on: Apr 24, 2014
* Author: nek3d
*/
#include "ContextJaccard.h"
ContextJaccard::ContextJaccard() {
setSortedInput(true);
setUseMergedIntervals(true);
}
ContextJaccard::~ContextJaccard() {
}
/*
* ContextJaccard.h
*
* Created on: Apr 24, 2014
* Author: nek3d
*/
#ifndef CONTEXTJACCARD_H_
#define CONTEXTJACCARD_H_