Commit e052d77f authored by Neil Kindlon's avatar Neil Kindlon
Browse files

first check in for new coverage tool

parent 5d0f1362
......@@ -37,7 +37,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/closestFile \
$(SRC_DIR)/clusterBed \
$(SRC_DIR)/complementBed \
$(SRC_DIR)/coverageBed \
$(SRC_DIR)/coverageFile \
$(SRC_DIR)/expand \
$(SRC_DIR)/fastaFromBed \
$(SRC_DIR)/flankBed \
......
......@@ -47,7 +47,7 @@ int bedpetobam_main(int argc, char* argv[]);//
void closest_help();
int cluster_main(int argc, char* argv[]); //
int complement_main(int argc, char* argv[]);//
int coverage_main(int argc, char* argv[]); //
void coverage_help();
int regress_test_main(int argc, char **argv); //
int expand_main(int argc, char* argv[]);//
int fastafrombed_main(int argc, char* argv[]);//
......@@ -104,7 +104,6 @@ int main(int argc, char *argv[])
// genome arithmetic tools
else if (subCmd == "window") return window_main(argc-1, argv+1);
else if (subCmd == "coverage") return coverage_main(argc-1, argv+1);
else if (subCmd == "genomecov") return genomecoverage_main(argc-1, argv+1);
else if (subCmd == "cluster") return cluster_main(argc-1, argv+1);
else if (subCmd == "complement") return complement_main(argc-1, argv+1);
......@@ -306,5 +305,8 @@ void showHelp(const QuickString &subCmd) {
spacing_help();
} else if (subCmd == "fisher") {
fisher_help();
} else if (subCmd == "coverage") {
coverage_help();
}
}
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/genomeFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= coverageMain.cpp coverageBed.cpp coverageBed.h
OBJECTS= coverageMain.o coverageBed.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/coverageMain.o $(OBJ_DIR)/coverageBed.o
.PHONY: clean
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
TOOL_DIR = ../../src/
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/VectorOps \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/ \
-I$(UTILITIES_DIR)/ToolBase/ \
-I$(TOOL_DIR)/intersectFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= coverageHelp.cpp coverageFile.cpp coverageFile.h
OBJECTS= coverageHelp.o coverageFile.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/coverageHelp.o $(OBJ_DIR)/coverageFile.o
.PHONY: clean
/*
* coverageFile.cpp
*
* Created on: May 8, 2015
* Author: nek3d
*/
#include "coverageFile.h"
CoverageFile::CoverageFile(ContextCoverage *context)
: IntersectFile(context),
_depthArray(NULL),
_depthArrayCapacity(0),
_queryLen(0),
_totalQueryLen(0),
_queryOffset(0),
_floatValBuf(NULL)
{
//allocate and initialize depth array.
//Use C's malloc and free becauase we want
//to be able to realloc.
//Not using vector because arrays are faster.
size_t newSize = sizeof(size_t) * DEFAULT_DEPTH_CAPACITY;
_depthArray = (size_t *)malloc(newSize);
memset(_depthArray, 0, newSize);
_depthArrayCapacity = DEFAULT_DEPTH_CAPACITY;
_floatValBuf = new char[floatValBufLen];
}
CoverageFile::~CoverageFile() {
free(_depthArray);
delete _floatValBuf;
}
void CoverageFile::processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits) {
makeDepthCount(hits);
_finalOutput.clear();
switch(upCast(_context)->getCoverageType()) {
case ContextCoverage::COUNT:
doCounts(outputMgr, hits);
break;
case ContextCoverage::PER_BASE:
doPerBase(outputMgr, hits);
break;
case ContextCoverage::HIST:
doHist(outputMgr, hits);
break;
case ContextCoverage::DEFAULT:
default:
doDefault(outputMgr, hits);
break;
}
}
void CoverageFile::cleanupHits(RecordKeyVector &hits) {
IntersectFile::cleanupHits(hits);
memset(_depthArray, 0, sizeof(size_t) * _queryLen);
}
void CoverageFile::giveFinalReport(RecordOutputMgr *outputMgr) {
//only give report for histogram option
if (upCast(_context)->getCoverageType() != ContextCoverage::HIST) {
return;
}
for (depthMapType::iterator iter = _finalDepthMap.begin(); iter != _finalDepthMap.end(); iter++) {
size_t depth = iter->first;
size_t basesAtDepth = iter->second;
float depthPct = (float)basesAtDepth / (float)_totalQueryLen;
_finalOutput = "all\t";
_finalOutput.append(depth);
_finalOutput.append("\t");
_finalOutput.append(basesAtDepth);
_finalOutput.append("\t");
_finalOutput.append(_totalQueryLen);
_finalOutput.append("\t");
format(depthPct);
outputMgr->printRecord(NULL, _finalOutput);
}
}
void CoverageFile::makeDepthCount(RecordKeyVector &hits) {
const Record *key = hits.getKey();
_queryOffset = key->getStartPos();
_queryLen = (size_t)(key->getEndPos() - _queryOffset);
_totalQueryLen += _queryLen;
//resize depth array if needed
if (_depthArrayCapacity < _queryLen) {
_depthArray = (size_t*)realloc(_depthArray, sizeof(size_t) * _queryLen);
_depthArrayCapacity = _queryLen;
memset(_depthArray, 0, sizeof(size_t) * _depthArrayCapacity);
}
//loop through hits, which may not be in sorted order, due to
//potential multiple databases, and increment the depth array as needed.
for (RecordKeyVector::const_iterator_type iter = hits.begin(); iter != hits.end(); iter = hits.next()) {
const Record *dbRec = *iter;
int dbStart = dbRec->getStartPos();
int dbEnd = dbRec->getEndPos();
int maxStart = max(_queryOffset, dbStart);
int minEnd = min(dbEnd, key->getEndPos());
for (int i=maxStart; i < minEnd; i++) {
_depthArray[i - _queryOffset]++;
}
}
}
size_t CoverageFile::countBasesAtDepth(size_t depth) {
size_t retCount = 0;
for (size_t i=0; i < _queryLen; i++) {
if (_depthArray[i] == depth) {
retCount++;
}
}
return retCount;
}
void CoverageFile::doCounts(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
_finalOutput = hits.size();
outputMgr->printRecord(hits.getKey(), _finalOutput);
}
void CoverageFile::doPerBase(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
//loop through all bases in query, printing full record and metrcis for each
const Record * queryRec = hits.getKey();
for (size_t i= 0; i < _queryLen; i++) {
_finalOutput = i +1;
_finalOutput.append("\t");
_finalOutput.append(_depthArray[i]);
outputMgr->printRecord(queryRec, _finalOutput);
}
}
void CoverageFile::doHist(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
//make a map of depths to num bases with that depth
_currDepthMap.clear();
for (size_t i=0; i < _queryLen; i++) {
_currDepthMap[_depthArray[i]]++;
_finalDepthMap[_depthArray[i]]++;
}
for (depthMapType::iterator iter = _currDepthMap.begin(); iter != _currDepthMap.end(); iter++) {
size_t depth = iter->first;
size_t numBasesAtDepth = iter->second;
float coveredBases = (float)numBasesAtDepth / (float)_queryLen;
_finalOutput = depth;
_finalOutput.append("\t");
_finalOutput.append(numBasesAtDepth);
_finalOutput.append("\t");
_finalOutput.append(_queryLen);
_finalOutput.append("\t");
format(coveredBases);
outputMgr->printRecord(hits.getKey(), _finalOutput);
}
}
void CoverageFile::doDefault(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
size_t nonZeroBases = _queryLen - countBasesAtDepth(0);
float coveredBases = (float)nonZeroBases / (float)_queryLen;
_finalOutput = hits.size();
_finalOutput.append("\t");
_finalOutput.append(nonZeroBases);
_finalOutput.append("\t");
_finalOutput.append(_queryLen);
_finalOutput.append("\t");
format(coveredBases);
outputMgr->printRecord(hits.getKey(), _finalOutput);
}
void CoverageFile::format(float val)
{
memset(_floatValBuf, 0, floatValBufLen);
sprintf(_floatValBuf, "%0.7f", val);
_finalOutput.append(_floatValBuf);
}
/*
* coverageFile.h
*
* Created on: May 8, 2015
* Author: nek3d
*/
#ifndef COVERAGEFILE_H_
#define COVERAGEFILE_H_
#include "intersectFile.h"
#include "ContextCoverage.h"
class CoverageFile : public IntersectFile {
public:
CoverageFile(ContextCoverage *);
~CoverageFile();
virtual void processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
virtual void cleanupHits(RecordKeyVector &hits);
virtual void giveFinalReport(RecordOutputMgr *outputMgr);
protected:
QuickString _finalOutput;
size_t *_depthArray;
size_t _depthArrayCapacity;
size_t _queryLen;
size_t _totalQueryLen;
int _queryOffset;
static const int DEFAULT_DEPTH_CAPACITY = 1024;
char *_floatValBuf;
static const int floatValBufLen = 16;
typedef map<size_t, size_t> depthMapType;
depthMapType _currDepthMap;
depthMapType _finalDepthMap;
virtual ContextCoverage *upCast(ContextBase *context) { return static_cast<ContextCoverage*>(context); }
void makeDepthCount(RecordKeyVector &hits);
size_t countBasesAtDepth(size_t depth);
void doCounts(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doPerBase(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doHist(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void doDefault(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
void format(float val);
};
#endif /* COVERAGEFILE_H_ */
......@@ -9,126 +9,8 @@
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "coverageBed.h"
#include "version.h"
using namespace std;
// define the version
#define PROGRAM_NAME "bedtools coverage"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void coverage_help(void);
int coverage_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedAFile;
string bedBFile;
// parm flags
bool sameStrand = false;
bool diffStrand = false;
bool writeHistogram = false;
bool eachBase = false;
bool obeySplits = false;
bool bamInput = false;
bool haveBedA = false;
bool haveBedB = false;
bool countsOnly = false;
// check to see if we should print out some help
if(argc <= 1) showHelp = true;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) coverage_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-a", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-abam", 5, parameterLength)) {
if ((i+1) < argc) {
haveBedA = true;
bamInput = true;
bedAFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-b", 2, parameterLength)) {
if ((i+1) < argc) {
haveBedB = true;
bedBFile = argv[i + 1];
i++;
}
}
else if (PARAMETER_CHECK("-s", 2, parameterLength)) {
sameStrand = true;
}
else if (PARAMETER_CHECK("-S", 2, parameterLength)) {
diffStrand = true;
}
else if (PARAMETER_CHECK("-hist", 5, parameterLength)) {
writeHistogram = true;
}
else if(PARAMETER_CHECK("-d", 2, parameterLength)) {
eachBase = true;
}
else if (PARAMETER_CHECK("-split", 6, parameterLength)) {
obeySplits = true;
}
else if (PARAMETER_CHECK("-counts", 7, parameterLength)) {
countsOnly = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBedA || !haveBedB) {
cerr << endl << "*****" << endl << "*****ERROR: Need -a and -b files. " << endl << "*****" << endl;
showHelp = true;
}
if (sameStrand && diffStrand) {
cerr << endl << "*****" << endl << "*****ERROR: Request either -s OR -S, not both." << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedCoverage *bg = new BedCoverage(bedAFile, bedBFile, sameStrand, diffStrand,
writeHistogram, bamInput, obeySplits, eachBase, countsOnly);
delete bg;
}
else {
coverage_help();
}
return 0;
}
#include "CommonHelp.h"
void coverage_help(void) {
......@@ -137,7 +19,7 @@ void coverage_help(void) {
cerr << "Summary: Returns the depth and breadth of coverage of features from A" << endl;
cerr << "\t on the intervals in B." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl;
cerr << "Usage: " << "bedtools coverage" << " [OPTIONS] -a <bed/gff/vcf> -b <bed/gff/vcf>" << endl << endl;
cerr << "Options: " << endl;
......@@ -169,6 +51,36 @@ void coverage_help(void) {
cerr << "\t\tFor BED12 files, this uses the BlockCount, BlockStarts," << endl;
cerr << "\t\tand BlockEnds fields (i.e., columns 10,11,12)." << endl << endl;
cerr << "\t-sorted\t" << "Use the \"chromsweep\" algorithm for sorted (-k1,1 -k2,2n) input." << endl << endl;
cerr << "\t-g\t" << "Provide a genome file to enforce consistent chromosome sort order" << endl;
cerr <<"\t\tacross input files. Only applies when used with -sorted option." << endl << endl;
cerr << "\t-header\t" << "Print the header from the A file prior to results." << endl << endl;
cerr << "\t-nobuf\t" << "Disable buffered output. Using this option will cause each line"<< endl;
cerr <<"\t\tof output to be printed as it is generated, rather than saved" << endl;
cerr <<"\t\tin a buffer. This will make printing large output files " << endl;
cerr <<"\t\tnoticeably slower, but can be useful in conjunction with" << endl;
cerr <<"\t\tother software tools and scripts that need to process one" << endl;
cerr <<"\t\tline of bedtools output at a time." << endl << endl;
cerr << "\t-names\t" << "When using multiple databases, provide an alias for each that" << endl;
cerr <<"\t\twill appear instead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-filenames" << "\tWhen using multiple databases, show each complete filename" << endl;
cerr <<"\t\t\tinstead of a fileId when also printing the DB record." << endl << endl;
cerr << "\t-sortout\t" << "When using multiple databases, sort the output DB hits" << endl;
cerr << "\t\t\tfor each record." << endl << endl;
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
CommonHelp();
cerr << "Default Output: " << endl;
cerr << "\t" << " After each entry in B, reports: " << endl;
cerr << "\t 1) The number of features in A that overlapped the B interval." << endl;
......
......@@ -98,6 +98,7 @@ bool IntersectFile::nextUnsortedFind(RecordKeyVector &hits)
if (queryRecord == NULL) {
continue;
} else {
_context->testNameConventions(queryRecord);
hits.setKey(queryRecord);
_binTree->getHits(queryRecord, hits);
return true;
......
......@@ -3,20 +3,6 @@ OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
TOOL_DIR = ../../src/
# -------------------
# define our includes
# -------------------
# INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
# -I$(UTILITIES_DIR)/gzstream/ \
# -I$(UTILITIES_DIR)/genomeFile/ \
# -I$(UTILITIES_DIR)/lineFileUtilities/ \
# -I$(UTILITIES_DIR)/fileType/ \
# -I$(UTILITIES_DIR)/BamTools/include \
# -I$(UTILITIES_DIR)/BamTools-Ancillary \
# -I$(UTILITIES_DIR)/chromsweep \
# -I$(UTILITIES_DIR)/VectorOps \
# -I$(UTILITIES_DIR)/version/
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
......
......@@ -56,6 +56,7 @@ void map_help(void) {
cerr << "\t-nonamecheck\t" << "For sorted data, don't throw an error if the file has different naming conventions" << endl;
cerr << "\t\t\tfor the same chromosome. ex. \"chr1\" vs \"chr01\"." << endl << endl;
CommonHelp();
cerr << "Notes: " << endl;
cerr << "\t(1) Both input files must be sorted by chrom, then start." << endl << endl;
......
......@@ -650,7 +650,9 @@ bool ContextBase::parseIoBufSize(QuickString bufStr)
}
void ContextBase::testNameConventions(const Record *record) {
if (getNameCheckDisabled() || _nameConventionWarningTripped) return;
//Do nothing if using the -nonamecheck option,
//warning already given, or record is unmapped BAM record
if (getNameCheckDisabled() || _nameConventionWarningTripped || record->isUnmapped()) return;
int fileIdx = record->getFileIdx();
......