Commit 73e435b9 authored by lindenb's avatar lindenb
Browse files

betools pool

parent 3ca83fb3
......@@ -10,6 +10,7 @@ VERSION_FILE=./src/utils/version/version_git.h
RELEASED_VERSION_FILE=./src/utils/version/version_release.txt
# define our object and binary directories
export OBJ_DIR = obj
export BIN_DIR = bin
......@@ -56,6 +57,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/nucBed \
$(SRC_DIR)/pairToBed \
$(SRC_DIR)/pairToPair \
$(SRC_DIR)/pool \
$(SRC_DIR)/randomBed \
$(SRC_DIR)/regressTest \
$(SRC_DIR)/reldist \
......
......@@ -65,6 +65,7 @@ int nek_sandbox1_main(int argc, char* argv[]);//
int nuc_main(int argc, char* argv[]);//
int pairtobed_main(int argc, char* argv[]);//
int pairtopair_main(int argc, char* argv[]);//
int pool_main(int argc, char* argv[]); //
int random_main(int argc, char* argv[]); //
int reldist_main(int argc, char* argv[]); //
int sample_main(int argc, char* argv[]); //
......@@ -99,6 +100,7 @@ int main(int argc, char *argv[])
else if (sub_cmd == "complement") return complement_main(argc-1, argv+1);
else if (sub_cmd == "subtract") return subtract_main(argc-1, argv+1);
else if (sub_cmd == "slop") return slop_main(argc-1, argv+1);
else if (sub_cmd == "pool") return pool_main(argc-1, argv+1);
else if (sub_cmd == "flank") return flank_main(argc-1, argv+1);
else if (sub_cmd == "sort") return sort_main(argc-1, argv+1);
else if (sub_cmd == "random") return random_main(argc-1, argv+1);
......
UTILITIES_DIR = ../utils/
OBJ_DIR = ../../obj/
BIN_DIR = ../../bin/
# -------------------
# define our includes
# -------------------
INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \
-I$(UTILITIES_DIR)/lineFileUtilities/ \
-I$(UTILITIES_DIR)/gzstream/ \
-I$(UTILITIES_DIR)/fileType/ \
-I$(UTILITIES_DIR)/version/
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= poolBedMain.cpp poolBed.cpp poolBed.h
OBJECTS= poolBed.o poolBedMain.o
BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS))
all: $(BUILT_OBJECTS)
.PHONY: all
$(BUILT_OBJECTS): $(SOURCES)
@echo " * compiling" $(*F).cpp
@$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES)
clean:
@echo "Cleaning up."
@rm -f $(OBJ_DIR)/poolBedMain.o $(OBJ_DIR)/poolBed.o
.PHONY: clean
/*****************************************************************************
poolBed.h
(c) 2015 - Pierre Lindenbaum PhD
@yokofakun http://plindenbaum.blogspot.com
Univ. Nantes, France
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include <math.h>
#include "lineFileUtilities.h"
#include "poolBed.h"
class BedPoolItems
{
public:
vector<BED> items;
double _nbases;
BedPoolItems():_nbases(-1)
{
}
~BedPoolItems()
{
}
double length(const BED* bedEntry)
{
if(bedEntry==NULL && this->_nbases>=0) return _nbases;
double nBases=0.0;
for(size_t i=0;i< items.size();++i)
{
nBases += items[i].size();
}
if( bedEntry!=NULL)
{
nBases += bedEntry->size();
}
else
{
this->_nbases = nBases;
}
return nBases;
}
};
BedPool::BedPool(string &bedFile, string& outfileprefix,size_t num_split):
_bedFile(bedFile),
_outfileprefix(outfileprefix),
_num_split(num_split),
_bed(0)
{
_bed = new BedFile(bedFile);
doWork();
}
BedPool::~BedPool(void) {
if(_bed!=0) delete _bed;
}
void BedPool::doWork() {
size_t i,nLine(0);
vector<BedPoolItems*> pools;
BED bedEntry;
_bed->Open();
while (_bed->GetNextBed(bedEntry)) {
if (_bed->_status == BED_VALID) {
//pools can be created
if( pools.size() < this->_num_split )
{
BedPoolItems* newpool=new BedPoolItems;
pools.push_back(newpool);
//pools.back().bf->bedType = _bed->bedType ;
//pools.back().bf->addBEDIntoMap(bedEntry);
newpool->items.push_back(bedEntry);
}
//only one pool ?
else if( this->_num_split == 1 )
{
pools.front()->items.push_back(bedEntry);
}
else
{
size_t index_insert = 0UL;
vector<double> stdevs;
double lowest_stddev=-1;
/** try to insert the bedEntry in any of the pool */
for(size_t try_index_insert = 0;
try_index_insert < pools.size();
++try_index_insert
)
{
vector<double> pool_sizes;
double mean = 0;
/* get the mean size of the pool */
for(i=0;i< pools.size() ; ++i)
{
double num_bases = pools[i]->length(i==try_index_insert?&bedEntry:NULL);
pool_sizes.push_back( num_bases );
mean += num_bases;
}
mean = mean / pools.size() ;
/* get the stddev of the pool */
double stddev=0.0;
for(i=0;i< pools.size() ; ++i)
{
stddev += fabs( pool_sizes[i] - mean);
}
/* this position in BED index is better = stddev of size lowest */
if( try_index_insert == 0 || lowest_stddev> stddev )
{
lowest_stddev = stddev ;
index_insert = try_index_insert;
}
}
pools[index_insert]->items.push_back(bedEntry);
pools[index_insert]->_nbases=-1;/* reset size cache */
}
}
}
for(i=0;i< pools.size() ; ++i)
{
string filename(this->_outfileprefix);
char tmp[10];
sprintf(tmp,"%05d",(i+1));
filename.append(".").append(tmp).append(".bed");
FILE* out = fopen(filename.c_str(),"w");
if(out==NULL)
{
fprintf(stderr,"Cannot open \"%s\". %s\n",
filename.c_str(),
strerror(errno)
);
exit(EXIT_FAILURE);
}
BedFile bf;
bf.bedType = this->_bed->bedType;
for(size_t j=0;j< pools[i]->items.size();++j)
{
bf.reportToFileBedNewLine(out,pools[i]->items[j]);
}
fclose(out);
}
_bed->Close();
}
/*****************************************************************************
poolBed.h
(c) 2015 - Pierre Lindenbaum PhD
@yokofakun http://plindenbaum.blogspot.com
Univ. Nantes, France
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "bedFile.h"
#include <vector>
#include <iostream>
#include <fstream>
#include <map>
#include <cstdlib>
#include <ctime>
#include <sys/time.h>
#include <unistd.h>
#include <sys/types.h>
#include <algorithm> // for binary search
using namespace std;
//************************************************
// Class methods and elements
//************************************************
class BedPool {
public:
// constructor
BedPool(string &bedFile, string& outfileprefix,size_t num_split);
// destructor
~BedPool(void);
private:
string _bedFile;
string _outfileprefix;
size_t _num_split;
// The BED file from which to compute coverage.
BedFile *_bed;
void doWork();
};
/*****************************************************************************
dealBed.h
(c) 2015 - Pierre Lindenbaum PhD
@yokofakun http://plindenbaum.blogspot.com
Univ. Nantes, France
Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "poolBed.h"
#include "version.h"
using namespace std;
// define our program name
#define PROGRAM_NAME "bedtools pool"
// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
// function declarations
void pool_help(void);
int pool_main(int argc, char* argv[]) {
// our configuration variables
bool showHelp = false;
bool haveBed = false;
// input files
string bedFile("stdin");
string outfileprefix("_pool");
size_t num_split(10UL);
for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);
if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
bedFile = argv[i + 1];
haveBed=true;
i++;
}
}
else if(PARAMETER_CHECK("-p", 2, parameterLength)) {
if ((i+1) < argc) {
outfileprefix.assign(argv[i + 1]);
i++;
}
}
else if(PARAMETER_CHECK("-n", 2, parameterLength)) {
if ((i+1) < argc) {
num_split = atof(argv[i + 1]);
i++;
}
}
else if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}
if(outfileprefix.empty())
{
fprintf(stderr,"output prefix is empty\n");
exit(EXIT_FAILURE);
}
if(showHelp) pool_help();
if( num_split < 0UL) num_split = 1UL;
if (!showHelp) {
BedPool *bc = new BedPool(bedFile,
outfileprefix,num_split);
delete bc;
return 0;
}
else {
pool_help();
}
return 0;
}
void pool_help(void) {
cerr << "\nTool: bedtools pool" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Groups items of a Bed file in " << endl << endl;
cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed> -n number-of-files" << endl << endl;
cerr << "Options: " << endl;
cerr << "\t-n\t" << "Number of pools to create." << endl;
cerr << "\t-p\t" << "output file prefix" << endl;
// end the program here
exit(1);
}
......@@ -163,7 +163,7 @@ public:
zeroLength(false)
{}
int size() {
int size() const {
return end-start;
}
......@@ -998,14 +998,14 @@ public:
/*
reportBedNewLine
reportToFileBedNewLine
Writes the _original_ BED entry with a NEWLINE
at the end of the line.
Works for BED3 - BED6.
*/
template <typename T>
inline void reportBedNewLine(const T &bed) {
inline void reportToFileBedNewLine(FILE* out,const T &bed) {
// if it is azeroLength feature, we need to
// correct the start and end coords to what they were
......@@ -1020,24 +1020,24 @@ public:
//BED
if (_isGff == false && _isVcf == false) {
if (this->bedType == 3) {
printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
fprintf(out,"%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
}
else if (this->bedType == 4) {
printf ("%s\t%d\t%d\t%s\n",
fprintf(out,"%s\t%d\t%d\t%s\n",
bed.chrom.c_str(), start, end, bed.name.c_str());
}
else if (this->bedType == 5) {
printf ("%s\t%d\t%d\t%s\t%s\n",
fprintf(out,"%s\t%d\t%d\t%s\t%s\n",
bed.chrom.c_str(), start, end,
bed.name.c_str(), bed.score.c_str());
}
else if (this->bedType == 6) {
printf ("%s\t%d\t%d\t%s\t%s\t%s\n",
fprintf(out,"%s\t%d\t%d\t%s\t%s\t%s\n",
bed.chrom.c_str(), start, end, bed.name.c_str(),
bed.score.c_str(), bed.strand.c_str());
}
else if (this->bedType > 6) {
printf ("%s\t%d\t%d\t%s\t%s\t%s",
fprintf(out,"%s\t%d\t%d\t%s\t%s\t%s",
bed.chrom.c_str(), start, end, bed.name.c_str(),
bed.score.c_str(), bed.strand.c_str());
......@@ -1046,27 +1046,27 @@ public:
vector<uint16_t>::const_iterator
othEnd = bed.other_idxs.end();
for ( ; othIt != othEnd; ++othIt) {
printf("\t%s", bed.fields[*othIt].c_str());
fprintf(out,"\t%s", bed.fields[*othIt].c_str());
}
printf("\n");
fprintf(out,"\n");
}
}
// VCF
else if (_isGff == false && _isVcf == true) {
printf ("%s\t%d", bed.chrom.c_str(), start+1);
fprintf(out,"%s\t%d", bed.chrom.c_str(), start+1);
vector<uint16_t>::const_iterator othIt = bed.other_idxs.begin();
vector<uint16_t>::const_iterator othEnd = bed.other_idxs.end();
for ( ; othIt != othEnd; ++othIt) {
printf("\t%s", bed.fields[*othIt].c_str());
fprintf(out,"\t%s", bed.fields[*othIt].c_str());
}
printf("\n");
fprintf(out,"\n");
}
// GFF
else if (_isGff == true) {
// "GFF-8"
if (this->bedType == 8) {
printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n",
fprintf (out,"%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n",
bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
bed.name.c_str(), start+1, end,
bed.score.c_str(), bed.strand.c_str(),
......@@ -1074,7 +1074,7 @@ public:
}
// "GFF-9"
else if (this->bedType == 9) {
printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
fprintf (out,"%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n",
bed.chrom.c_str(), bed.fields[bed.other_idxs[0]].c_str(),
bed.name.c_str(), start+1, end,
bed.score.c_str(), bed.strand.c_str(),
......@@ -1084,6 +1084,12 @@ public:
}
}
/* specialized version of reportBedNewLine printing to stdout */
template <typename T>
inline void reportBedNewLine(const T &bed) {
reportToFileBedNewLine(stdout,bed);
}
/*
reportBedRangeNewLine
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment