Commit b0bc5b7e authored by Aaron Quinlan's avatar Aaron Quinlan
Browse files

Merge pull request #261 from nkindlon/master

Converted complement to PFM with merge-based approach, sorted input e…
parents aa11ef9d f5927cf6
......@@ -36,7 +36,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/bed12ToBed6 \
$(SRC_DIR)/closestFile \
$(SRC_DIR)/clusterBed \
$(SRC_DIR)/complementBed \
$(SRC_DIR)/complementFile \
$(SRC_DIR)/coverageFile \
$(SRC_DIR)/expand \
$(SRC_DIR)/fastaFromBed \
......
......@@ -46,7 +46,7 @@ int bedtoigv_main(int argc, char* argv[]);//
int bedpetobam_main(int argc, char* argv[]);//
void closest_help();
int cluster_main(int argc, char* argv[]); //
int complement_main(int argc, char* argv[]);//
void complement_help();
void coverage_help();
int regress_test_main(int argc, char **argv); //
int expand_main(int argc, char* argv[]);//
......@@ -106,7 +106,6 @@ int main(int argc, char *argv[])
else if (subCmd == "window") return window_main(argc-1, argv+1);
else if (subCmd == "genomecov") return genomecoverage_main(argc-1, argv+1);
else if (subCmd == "cluster") return cluster_main(argc-1, argv+1);
else if (subCmd == "complement") return complement_main(argc-1, argv+1);
else if (subCmd == "slop") return slop_main(argc-1, argv+1);
else if (subCmd == "split") return split_main(argc-1, argv+1);
else if (subCmd == "flank") return flank_main(argc-1, argv+1);
......@@ -307,6 +306,8 @@ void showHelp(const QuickString &subCmd) {
fisher_help();
} else if (subCmd == "coverage") {
coverage_help();
} else if (subCmd == "complement") {
complement_help();
}
}
/*****************************************************************************
complementBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "complementBed.h"
BedComplement::BedComplement(string &bedFile, string &genomeFile) {
_bedFile = bedFile;
_genomeFile = genomeFile;
_bed = new BedFile(bedFile);
_genome = new GenomeFile(genomeFile);
}
BedComplement::~BedComplement(void) {
}
//
// Merge overlapping BED entries into a single entry
//
void BedComplement::ComplementBed() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// get a list of the chroms in the user's genome
vector<string> chromList = _genome->getChromList();
// process each chrom in the genome
for (size_t c = 0; c < chromList.size(); ++c) {
string currChrom = chromList[c];
// create a "bit vector" for the chrom
CHRPOS currChromSize = _genome->getChromSize(currChrom);
vector<bool> chromMasks(currChromSize, 0);
// mask the chrom for every feature in the BED file
bedVector::const_iterator bItr = _bed->bedMapNoBin[currChrom].begin();
bedVector::const_iterator bEnd = _bed->bedMapNoBin[currChrom].end();
for (; bItr != bEnd; ++bItr) {
if (bItr->end > currChromSize) {
cerr << "Warning: end of BED entry exceeds chromosome length. "
<< "Please correct." << endl;
_bed->reportBedNewLine(*bItr);
exit(1);
}
// mask all of the positions spanned by this BED entry.
for (CHRPOS b = bItr->start; b < bItr->end; b++)
chromMasks[b] = 1;
}
// report the unmasked, that is, complemented parts of the chrom
CHRPOS i = 0;
CHRPOS start;
while (i < chromMasks.size()) {
if (chromMasks[i] == 0) {
start = i;
while ((chromMasks[i] == 0) && (i < chromMasks.size()))
i++;
if (start > 0)
cout << currChrom << "\t" << start << "\t" << i << endl;
else
cout << currChrom << "\t" << 0 << "\t" << i << endl;
}
i++;
}
}
}
/*****************************************************************************
complementBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include "GenomeFile.h"
#include <vector>
#include <bitset>
#include <algorithm>
#include <iostream>
#include <fstream>
#include <limits.h>
#include <stdlib.h>
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedComplement {
public:
// constructor
BedComplement(string &bedFile, string &genomeFile);
// destructor
~BedComplement(void);
void ComplementBed();
private:
string _bedFile;
string _genomeFile;
BedFile *_bed;
GenomeFile *_genome;
};
/*****************************************************************************
complementBedMain.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "complementBed.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools complement"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void complement_help(void);
int complement_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile = "stdin";
string genomeFile;
bool haveBed = true;
bool haveGenome = false;
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(showHelp) complement_help();
// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
bedFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) < argc) {
haveGenome = true;
genomeFile = argv[i + 1];
i++;
}
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}
// make sure we have both input files
if (!haveBed || !haveGenome) {
cerr << endl << "*****" << endl << "*****ERROR: Need -i BED file and -g genome file. " << endl << "*****" << endl;
showHelp = true;
}
if (!showHelp) {
BedComplement *bc = new BedComplement(bedFile, genomeFile);
bc->ComplementBed();
}
else {
complement_help();
}
return 0;
}
void complement_help(void) {
cerr << "\nTool: bedtools complement (aka complementBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Returns the base pair complement of a feature file." << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
cerr << "\tFor example, Human (hg19):" << endl;
cerr << "\tchr1\t249250621" << endl;
cerr << "\tchr2\t243199373" << endl;
cerr << "\t..." << endl;
cerr << "\tchr18_gl000207_random\t4262" << endl << endl;
cerr << "Tips: " << endl;
cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;
exit(1);
}
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
TOOL_DIR = ../../src/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/version/
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/ \
-I$(UTILITIES_DIR)/ToolBase/ \
-I$(TOOL_DIR)/intersectFile/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= complementMain.cpp complementBed.cpp complementBed.h
OBJECTS= complementMain.o complementBed.o
SOURCES= complementHelp.cpp complementFile.cpp complementFile.h
OBJECTS= complementHelp.o complementFile.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
PROGRAM= complementBed
PROGRAM= complementFile
all: $(BUILT_OBJECTS)
......@@ -28,11 +41,10 @@ all: $(BUILT_OBJECTS)
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(DFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/complementMain.o $(OBJ_DIR)/complementBed.o
@rm -f $(OBJ_DIR)/complementHelp.o $(OBJ_DIR)/complementFile.o
.PHONY: clean
#include "complementFile.h"
#include "NewGenomeFile.h"
ComplementFile::ComplementFile(ContextComplement *context)
: ToolBase(context),
_genomeFile(context->getGenomeFile()),
_currStartPos(0),
_outputMgr(NULL),
_chromList(_genomeFile->getChromList()),
_currPosInGenomeList(-1)
{
}
ComplementFile::~ComplementFile() {
}
bool ComplementFile::init()
{
_frm = static_cast<FileRecordMergeMgr *>(upCast(_context)->getFile(0));
return true;
}
bool ComplementFile::findNext(RecordKeyVector &hits)
{
while (!_frm->eof()) {
_frm->getNextRecord(&hits);
if (hits.getKey() == NULL) continue;
return true;
}
return false;
}
void ComplementFile::processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits)
{
_outputMgr = outputMgr;
const Record *rec = hits.getKey();
//test for chrom change.
const QuickString &newChrom = rec->getChrName();
if (_currChrom != newChrom) {
outPutLastRecordInPrevChrom();
//if record's chrom doesn't exist in the genome file, do
//nothing
if (!fastForward(newChrom)) return;
//we've switched to a new chromosome that is in both the DB
//and genome file.
_currStartPos = 0;
_currChrom = newChrom;
_outRecord.setChrName(newChrom);
}
int endPos = rec->getStartPos();
printRecord(endPos);
_currStartPos = rec->getEndPos();
}
void ComplementFile::cleanupHits(RecordKeyVector &hits)
{
_frm->deleteMergedRecord(hits);
}
bool ComplementFile::finalizeCalculations() {
outPutLastRecordInPrevChrom();
fastForward("");
return true;
}
void ComplementFile::outPutLastRecordInPrevChrom()
{
const QuickString &chrom = _outRecord.getChrName();
//do nothing if triggered by first record in DB. At this point,
//there was no prev chrom, so nothing is stored in the output Record yet.
if (chrom.empty()) return;
int maxChromSize = _genomeFile->getChromSize(chrom);
if (_currStartPos >= maxChromSize) return; //chrom already covered and reported.
printRecord(maxChromSize);
}
bool ComplementFile::fastForward(const QuickString &newChrom) {
if (!newChrom.empty() && !_genomeFile->hasChrom(newChrom)) return false;
int i= _currPosInGenomeList +1;
while (i < (int)_chromList.size() && _chromList[i] != newChrom) {
_outRecord.setChrName(_chromList[i]);
_currStartPos = 0;
int endPos = _genomeFile->getChromSize(_chromList[i]);
printRecord(endPos);
i++;
}
if (newChrom.empty()) return true;
if (i== (int)_chromList.size()) {
//reached end but didn't find new chrom. Genome and DB are not sorted in same order.
cerr << "***** ERROR: genome file and input file are not sorted in same order. Exiting..." << endl;
exit(1);
//this is where we'd return false if we weren't exiting.
}
_currChrom = newChrom;
_currPosInGenomeList = i;
return true;
}
void ComplementFile::printRecord(int endPos)
{
_outRecord.setStartPos(_currStartPos);
QuickString startStr;
startStr.append(_currStartPos);
_outRecord.setStartPosStr(startStr);
_outRecord.setEndPos(endPos);
QuickString endStr;
endStr.append(endPos);
_outRecord.setEndPosStr(endStr);
_outputMgr->printRecord(&_outRecord);
_outputMgr->newline();
}
/*
* complementFile.h
*
* Created on: Jun 16, 2015
* Author: nek3d
*/
#ifndef COMPLEMENTFILE_H_
#define COMPLEMENTFILE_H_
#include "ToolBase.h"
#include "ContextComplement.h"
class BlockMgr;
class BinTree;
class NewGenomeFile;
class RecordOutputMgr;
class ComplementFile : public ToolBase {
public:
ComplementFile(ContextComplement *context);
virtual ~ComplementFile();
virtual bool init();
virtual bool findNext(RecordKeyVector &hits);
virtual void processHits(RecordOutputMgr *outputMgr, RecordKeyVector &hits);
virtual void cleanupHits(RecordKeyVector &hits);
virtual bool finalizeCalculations();
virtual void giveFinalReport(RecordOutputMgr *outputMgr) {}
protected:
FileRecordMergeMgr *_frm;
Bed3Interval _outRecord;
QuickString _currChrom;
const NewGenomeFile *_genomeFile;
int _currStartPos;
RecordOutputMgr *_outputMgr;
const vector<QuickString> &_chromList;
int _currPosInGenomeList;
virtual ContextComplement *upCast(ContextBase *context) { return static_cast<ContextComplement *>(context); }
void outPutLastRecordInPrevChrom();
bool fastForward(const QuickString &newChrom);
void printRecord(int endPos);
};
#endif /* COMPLEMENTFILE_H_ */
/*
* subtractMain.cpp
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
#include "CommonHelp.h"
void complement_help(void) {
cerr << "\nTool: bedtools complement (aka complementBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Returns the base pair complement of a feature file." << endl << endl;
cerr << "Usage: " << "bedtools complement" << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
cerr << "Notes: " << endl;
cerr << "\t(1) The genome file should tab delimited and structured as follows:" << endl;
cerr << "\t <chromName><TAB><chromSize>" << endl << endl;
cerr << "\tFor example, Human (hg19):" << endl;
cerr << "\tchr1\t249250621" << endl;
cerr << "\tchr2\t243199373" << endl;
cerr << "\t..." << endl;
cerr << "\tchr18_gl000207_random\t4262" << endl << endl;
cerr << "Tips: " << endl;
cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;
exit(1);
}
/*
* ContextComplement.cpp
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
#include "ContextComplement.h"
ContextComplement::ContextComplement()
{
setSortedInput(true);
setUseMergedIntervals(true);
setExplicitBedOutput(true);
}
ContextComplement::~ContextComplement()
{
}
bool ContextComplement::isValidState()
{
if (!hasGenomeFile()) {
_errorMsg = "\n***** ERROR: no -g genome file provided. *****";
return false;
}
return ContextBase::isValidState();
}
/*
* ContextComplement.h
*
* Created on: Feb 19, 2015
* Author: nek3d
*/
#ifndef CONTEXTCOMPLEMENT_H_
#define CONTEXTCOMPLEMENT_H_