Commit f36d6db4 authored by nkindlon's avatar nkindlon
Browse files

Merge converted to PFM, first check-in

parent ed71c8e0
......@@ -47,7 +47,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/linksBed \
$(SRC_DIR)/maskFastaFromBed \
$(SRC_DIR)/mapFile \
$(SRC_DIR)/mergeBed \
$(SRC_DIR)/mergeFile \
$(SRC_DIR)/multiBamCov \
$(SRC_DIR)/multiIntersectBed \
$(SRC_DIR)/nekSandbox1 \
......
......@@ -81,7 +81,7 @@ bool FileIntersect::processUnsortedFiles()
while (!queryFRM->eof()) {
Record *queryRecord = queryFRM->allocateAndGetNextRecord();
Record *queryRecord = queryFRM->getNextRecord();
if (queryRecord == NULL) {
continue;
}
......
/*****************************************************************************
mergeBed.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "mergeBed.h"
void BedMerge::ReportMergedNames(const vector<string> &names) {
if (names.size() > 0) {
printf("\t");
vector<string>::const_iterator nameItr = names.begin();
vector<string>::const_iterator nameEnd = names.end();
for (; nameItr != nameEnd; ++nameItr) {
if (nameItr < (nameEnd - 1))
cout << *nameItr << _delimiter;
else
cout << *nameItr;
}
}
else {
cerr << endl
<< "*****" << endl
<< "*****ERROR: "
<< "No names found to report for the -names option. Exiting."
<< endl
<< "*****" << endl;
exit(1);
}
}
void BedMerge::ReportMergedScores(const vector<string> &scores) {
// setup a VectorOps instances for the list of scores.
// VectorOps methods used for each possible operation.
VectorOps vo(scores);
std::stringstream buffer;
if (scores.size() > 0) {
if (_scoreOp == "sum")
buffer << setprecision (PRECISION) << vo.GetSum();
else if (_scoreOp == "min")
buffer << setprecision (PRECISION) << vo.GetMin();
else if (_scoreOp == "max")
buffer << setprecision (PRECISION) << vo.GetMax();
else if (_scoreOp == "mean")
buffer << setprecision (PRECISION) << vo.GetMean();
else if (_scoreOp == "median")
buffer << setprecision (PRECISION) << vo.GetMedian();
else if (_scoreOp == "mode")
buffer << setprecision (PRECISION) << vo.GetMode();
else if (_scoreOp == "antimode")
buffer << setprecision (PRECISION) << vo.GetAntiMode();
else if (_scoreOp == "collapse")
buffer << setprecision (PRECISION) << vo.GetCollapse(_delimiter);
cout << "\t" << buffer.str();
}
else {
cerr << endl
<< "*****" << endl
<< "*****ERROR: No scores found to report for the -scores option. Exiting." << endl
<< "*****" << endl;
exit(1);
}
}
// ===============
// = Constructor =
// ===============
BedMerge::BedMerge(string &bedFile,
bool numEntries,
int maxDistance,
bool forceStrand,
bool reportNames,
bool reportScores,
const string &scoreOp,
const string &delimiter) :
_bedFile(bedFile),
_numEntries(numEntries),
_forceStrand(forceStrand),
_reportNames(reportNames),
_reportScores(reportScores),
_scoreOp(scoreOp),
_maxDistance(maxDistance),
_delimiter(delimiter)
{
_bed = new BedFile(bedFile);
if (_forceStrand == false)
MergeBed();
else
MergeBedStranded();
}
// =================
// = Destructor =
// =================
BedMerge::~BedMerge(void) {
}
// ===============================================
// Convenience method for reporting merged blocks
// ================================================
void BedMerge::Report(string chrom, int start,
int end, const vector<string> &names,
const vector<string> &scores, int mergeCount)
{
// ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
//if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
// just the merged intervals
if (_numEntries == false && _reportNames == false &&
_reportScores == false) {
printf("\n");
}
// merged intervals and counts
else if (_numEntries == true && _reportNames == false &&
_reportScores == false) {
printf("\t%d\n", mergeCount);
}
// merged intervals, counts, and scores
else if (_numEntries == true && _reportNames == false &&
_reportScores == true) {
printf("\t%d", mergeCount);
ReportMergedScores(scores);
printf("\n");
}
// merged intervals, counts, and names
else if (_numEntries == true && _reportNames == true &&
_reportScores == false) {
ReportMergedNames(names);
printf("\t%d\n", mergeCount);
}
// merged intervals, counts, names, and scores
else if (_numEntries == true && _reportNames == true &&
_reportScores == true) {
ReportMergedNames(names);
ReportMergedScores(scores);
printf("\t%d\n", mergeCount);
}
// merged intervals and names
else if (_numEntries == false && _reportNames == true &&
_reportScores == false) {
ReportMergedNames(names);
printf("\n");
}
// merged intervals and scores
else if (_numEntries == false && _reportNames == false &&
_reportScores == true) {
ReportMergedScores(scores);
printf("\n");
}
// merged intervals, names, and scores
else if (_numEntries == false && _reportNames == true &&
_reportScores == true) {
ReportMergedNames(names);
ReportMergedScores(scores);
printf("\n");
}
}
// =========================================================
// Convenience method for reporting merged blocks by strand
// =========================================================
void BedMerge::ReportStranded(string chrom, int start,
int end, const vector<string> &names,
const vector<string> &scores, int mergeCount,
string strand)
{
// ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
//if (_bed->isZeroBased == false) {start++;}
printf("%s\t%d\t%d", chrom.c_str(), start, end);
// just the merged intervals
if (_numEntries == false && _reportNames == false &&
_reportScores == false) {
printf("\t\t\t%s\n", strand.c_str());
}
// merged intervals and counts
else if (_numEntries == true && _reportNames == false &&
_reportScores == false) {
printf("\t\t%d\t%s\n", mergeCount, strand.c_str());
}
// merged intervals, counts, and scores
else if (_numEntries == true && _reportNames == false &&
_reportScores == true) {
printf("\t%d", mergeCount);
ReportMergedScores(scores);
printf("\t%s\n", strand.c_str());
}
// merged intervals, counts, and names
else if (_numEntries == true && _reportNames == true &&
_reportScores == false) {
ReportMergedNames(names);
printf("\t%d\t%s", mergeCount, strand.c_str());
printf("\n");
}
// merged intervals, counts, names, and scores
else if (_numEntries == true && _reportNames == true &&
_reportScores == true) {
ReportMergedNames(names);
ReportMergedScores(scores);
printf("\t%s\t%d", strand.c_str(), mergeCount);
printf("\n");
}
// merged intervals and names
else if (_numEntries == false && _reportNames == true &&
_reportScores == false) {
ReportMergedNames(names);
printf("\t.\t%s\n", strand.c_str());
}
// merged intervals and scores
else if (_numEntries == false && _reportNames == false &&
_reportScores == true) {
printf("\t");
ReportMergedScores(scores);
printf("\t%s\n", strand.c_str());
}
// merged intervals, names, and scores
else if (_numEntries == false && _reportNames == true &&
_reportScores == true) {
ReportMergedNames(names);
ReportMergedScores(scores);
printf("\t%s\n", strand.c_str());
}
}
// =====================================================
// = Merge overlapping BED entries into a single entry =
// =====================================================
void BedMerge::MergeBed() {
int mergeCount = 1;
vector<string> names;
vector<string> scores;
int start = -1;
int end = -1;
BED prev, curr;
_bed->Open();
while (_bed->GetNextBed(curr, true)) { // true = force sorted intervals
if (_bed->_status != BED_VALID)
continue;
// new block, no overlap
if ( (((int) curr.start - end) > _maxDistance) ||
(curr.chrom != prev.chrom))
{
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = curr.start;
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
}
// same block, overlaps
else {
if ((int) curr.end > end)
end = curr.end;
if (!curr.name.empty())
names.push_back(curr.name);
if (!curr.score.empty())
scores.push_back(curr.score);
mergeCount++;
}
prev = curr;
}
if (start >= 0) {
Report(prev.chrom, start, end, names, scores, mergeCount);
}
}
// ===============================================================================
// = Merge overlapping BED entries into a single entry, accounting for strandedness =
// ================================================================================
void BedMerge::MergeBedStranded() {
// load the "B" bed file into a map so
// that we can easily compare "A" to it for overlaps
_bed->loadBedFileIntoMapNoBin();
// loop through each chromosome and merge their BED entries
masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
for (; m != mEnd; ++m) {
// bedList is already sorted by start position.
string chrom = m->first;
// make a list of the two strands to merge separately.
vector<string> strands(2);
strands[0] = "+";
strands[1] = "-";
// do two passes, one for each strand.
for (unsigned int s = 0; s < strands.size(); s++) {
int mergeCount = 1;
int numOnStrand = 0;
vector<string> names;
vector<string> scores;
// merge overlapping features for this chromosome.
int start = -1;
int end = -1;
vector<BED>::const_iterator bedItr = m->second.begin();
vector<BED>::const_iterator bedEnd = m->second.end();
for (; bedItr != bedEnd; ++bedItr) {
// if forcing strandedness, move on if the hit
// is not on the current strand.
if (bedItr->strand != strands[s]) { continue; }
else { numOnStrand++; }
if ( (((int) bedItr->start - end) > _maxDistance) ||
(end < 0))
{
if (start >= 0) {
ReportStranded(chrom, start, end, names,
scores, mergeCount, strands[s]);
// reset
mergeCount = 1;
names.clear();
scores.clear();
}
start = bedItr->start;
end = bedItr->end;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
}
else {
if ((int) bedItr-> end > end) end = bedItr->end;
mergeCount++;
if (!bedItr->name.empty()) names.push_back(bedItr->name);
if (!bedItr->score.empty()) scores.push_back(bedItr->score);
}
}
if (start >= 0) {
ReportStranded(chrom, start, end, names,
scores, mergeCount, strands[s]);
}
}
}
}
/*****************************************************************************
mergeBed.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include <vector>
#include <algorithm>
#include <numeric>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <limits.h>
#include <stdlib.h>
#include "VectorOps.h"
using namespace std;
const int PRECISION = 21;
//************************************************
// Class methods and elements
//************************************************
class BedMerge {
public:
// constructor
BedMerge(string &bedFile, bool numEntries,
int maxDistance, bool forceStrand,
bool reportNames, bool reportScores,
const string &scoreOp, const string &delimiter);
// destructor
~BedMerge(void);
void MergeBed();
void MergeBedStranded();
private:
string _bedFile;
bool _numEntries;
bool _forceStrand;
bool _reportNames;
bool _reportScores;
string _scoreOp;
int _maxDistance;
string _delimiter;
// instance of a bed file class.
BedFile *_bed;
void Report(string chrom, int start, int end,
const vector<string> &names,
const vector<string> &scores,
int mergeCount);
void ReportStranded(string chrom, int start, int end,
const vector<string> &names,
const vector<string> &scores,
int mergeCount,
string strand);
void ReportMergedNames(const vector<string> &names);
void ReportMergedScores(const vector<string> &scores);
};
......@@ -5,18 +5,29 @@ BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/VectorOps/ \
INCLUDES = -I$(UTILITIES_DIR)/Contexts/ \
-I$(UTILITIES_DIR)/general/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/GenomeFile/ \
-I$(UTILITIES_DIR)/BamTools/include \
-I$(UTILITIES_DIR)/BamTools/src \
-I$(UTILITIES_DIR)/BlockedIntervals \
-I$(UTILITIES_DIR)/BamTools-Ancillary \
-I$(UTILITIES_DIR)/FileRecordTools/ \
-I$(UTILITIES_DIR)/FileRecordTools/FileReaders/ \
-I$(UTILITIES_DIR)/FileRecordTools/Records/ \
-I$(UTILITIES_DIR)/KeyListOps/ \
-I$(UTILITIES_DIR)/RecordOutputMgr/ \
-I$(UTILITIES_DIR)/NewChromsweep \
-I$(UTILITIES_DIR)/BinTree \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= mergeMain.cpp mergeBed.cpp mergeBed.h
OBJECTS= mergeMain.o mergeBed.o
SOURCES= mergeMain.cpp mergeFile.cpp mergeFile.h
OBJECTS= mergeMain.o mergeFile.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
......@@ -30,6 +41,6 @@ $(BUILT_OBJECTS): $(SOURCES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeBed.o
@rm -f $(OBJ_DIR)/mergeMain.o $(OBJ_DIR)/mergeFile.o
.PHONY: clean
\ No newline at end of file
/*****************************************************************************
mergeFile.cpp
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "mergeFile.h"
MergeFile::MergeFile(ContextMerge *context)
: _context(context),
_recordOutputMgr(NULL)
{
_recordOutputMgr = new RecordOutputMgr();
_recordOutputMgr->init(_context);
}
MergeFile::~MergeFile()
{
delete _recordOutputMgr;
_recordOutputMgr = NULL;
}
bool MergeFile::merge()
{
RecordKeyList hitSet;
FileRecordMgr *frm = _context->getFile(0);
while (!frm->eof()) {
Record *key = frm->getNextRecord(&hitSet);
if (key == NULL) continue;
_recordOutputMgr->printRecord(hitSet.getKey(), _context->getColumnOpsVal(hitSet));
}
return true;
}
/*****************************************************************************
mergeFile.h
(c) 2009 - Aaron Quinlan
Hall Laboratory
Department of Biochemistry and Molecular Genetics
University of Virginia
aaronquinlan@gmail.com
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#ifndef MERGE_FILE_H_
#define MERGE_FILE_H_
//************************************************
// Class methods and elements
//************************************************
#include "ContextMerge.h"
#include "RecordOutputMgr.h"
class MergeFile {
public:
MergeFile(ContextMerge *context);
~MergeFile();
bool merge();
private:
ContextMerge *_context;
RecordOutputMgr *_recordOutputMgr;
};
#endif
......@@ -9,7 +9,7 @@
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "mergeBed.h"
#include "mergeFile.h"
#include "version.h"
using namespace std;
......@@ -26,113 +26,21 @@ void merge_help(void);
int merge_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
// input files
string bedFile = "stdin";
int maxDistance = 0;
string scoreOp = "";
// input arguments
bool haveBed = true;
bool numEntries = false;
bool haveMaxDistance = false;
bool forceStrand = false;
bool reportNames = false;
bool reportScores = false;
string delimiter = ",";
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
ContextMerge *context = new ContextMerge();
if (!context->parseCmdArgs(argc, argv, 1) || context->getShowHelp() || !context->isValidState()) {