From d4448d675902c3faa5ecb785992ce760b46b4efe Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Fri, 17 Jun 2011 16:27:28 -0400 Subject: [PATCH] First cut at new multiBamCov tool. I like. --- Makefile | 1 + src/multiBamCov/Makefile | 48 ++++++++++++ src/multiBamCov/multiBamCov.cpp | 114 ++++++++++++++++++++++++++++ src/multiBamCov/multiBamCov.h | 55 ++++++++++++++ src/multiBamCov/multiBamCovMain.cpp | 114 ++++++++++++++++++++++++++++ 5 files changed, 332 insertions(+) create mode 100644 src/multiBamCov/Makefile create mode 100644 src/multiBamCov/multiBamCov.cpp create mode 100644 src/multiBamCov/multiBamCov.h create mode 100644 src/multiBamCov/multiBamCovMain.cpp diff --git a/Makefile b/Makefile index 0a1fa06e..4abfdd2a 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/linksBed \ $(SRC_DIR)/maskFastaFromBed \ $(SRC_DIR)/mergeBed \ + $(SRC_DIR)/multiBamCov \ $(SRC_DIR)/nucBed \ $(SRC_DIR)/overlap \ $(SRC_DIR)/pairToBed \ diff --git a/src/multiBamCov/Makefile b/src/multiBamCov/Makefile new file mode 100644 index 00000000..79412902 --- /dev/null +++ b/src/multiBamCov/Makefile @@ -0,0 +1,48 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= multiBamCovMain.cpp multiBamCov.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= multiBamCov + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) -L$(UTILITIES_DIR)/BamTools/lib/ -lbamtools + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/multiBamCov/multiBamCov.cpp b/src/multiBamCov/multiBamCov.cpp new file mode 100644 index 00000000..3d53c8ef --- /dev/null +++ b/src/multiBamCov/multiBamCov.cpp @@ -0,0 +1,114 @@ +/***************************************************************************** + multiBamCov.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "multiBamCov.h" +#include "api/BamMultiReader.h" + + +/* + Constructor +*/ +MultiCovBam::MultiCovBam(const vector<string> &bam_files, const string bed_file) +: +_bam_files(bam_files), +_bed_file(bed_file) +{ + _bed = new BedFile(_bed_file); + LoadBamFileMap(); +} + + +/* + Destructor +*/ +MultiCovBam::~MultiCovBam(void) +{} + + + +void MultiCovBam::CollectCoverage() +{ + BamMultiReader reader; + reader.SetIndexCacheMode(BamIndex::NoIndexCaching); + + if ( !reader.Open(_bam_files) ) + { + cerr << "Could not open input BAM files." << endl; return; + } + else + { + // attempt to find index files + reader.LocateIndexes(); + + // if index data available for all BAM files, we can use SetRegion + if ( reader.HasIndexes() ) { + BED bed, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + + _bed->Open(); + // loop through each BED entry, jump to it, + // and collect coverage from each BAM + while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) + { + if (bedStatus == BED_VALID) + { + reader.SetRegion(reader.GetReferenceID(bed.chrom), + (int) bed.start, + reader.GetReferenceID(bed.chrom), + (int) bed.end); + + // everything checks out, just iterate through specified region, counting alignments + vector<int> counts(_bam_files.size()); + BamAlignment al; + while ( reader.GetNextAlignmentCore(al) ) + { + // lookup the offset of the file name and tabulate coverage + // for the appropriate file + counts[bamFileMap[al.Filename]]++; + } + // report the cov at this interval for each file and reset + _bed->reportBedTab(bed); + ReportCounts(counts); + bed = nullBed; + } + } + _bed->Close(); + } + else { + cerr << "Could not find indexes." << endl; + reader.Close(); + exit(1); + } + } +} + + +void MultiCovBam::LoadBamFileMap(void) +{ + for (size_t i = 0; i < _bam_files.size(); ++i) + { + bamFileMap[_bam_files[i]] = i; + } +} + +void MultiCovBam::ReportCounts(const vector<int> &counts) +{ + for (size_t i = 0; i < counts.size(); ++i) + { + if (i < counts.size() - 1) + cout << counts[i] << "\t"; + else + cout << counts[i]; + } + cout << endl; +} diff --git a/src/multiBamCov/multiBamCov.h b/src/multiBamCov/multiBamCov.h new file mode 100644 index 00000000..2e8f0fb6 --- /dev/null +++ b/src/multiBamCov/multiBamCov.h @@ -0,0 +1,55 @@ +/***************************************************************************** + multiBamCov.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef MULTICOVBAM_H +#define MULTICOVBAM_H + +#include "bedFile.h" +#include "api/BamMultiReader.h" +using namespace BamTools; + + +#include <vector> +#include <iostream> +#include <fstream> +#include <stdlib.h> +using namespace std; + + + +class MultiCovBam { + +public: + + // constructor + MultiCovBam(const vector<string> &bam_files, const string bed_file); + + // destructor + ~MultiCovBam(void); + + void CollectCoverage(); + +private: + + //------------------------------------------------ + // private attributes + //------------------------------------------------ + vector<string> _bam_files; + string _bed_file; + BedFile *_bed; + + map<string, int> bamFileMap; + + void LoadBamFileMap(void); + void ReportCounts(const vector<int> &counts); +}; + +#endif /* MULTIBAMCOV_H */ diff --git a/src/multiBamCov/multiBamCovMain.cpp b/src/multiBamCov/multiBamCovMain.cpp new file mode 100644 index 00000000..66552e69 --- /dev/null +++ b/src/multiBamCov/multiBamCovMain.cpp @@ -0,0 +1,114 @@ +/***************************************************************************** + multiBamCovMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "multiBamCov.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "multiBamCov" + + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile; + vector<string> bamFiles; + + // input arguments + bool haveBed = false; + bool haveBams = false; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-bed", 4, parameterLength)) { + if ((i+1) < argc) { + haveBed = true; + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bams", 5, parameterLength)) { + if ((i+1) < argc) { + haveBams = true; + i = i+1; + string file = argv[i]; + while (file[0] != '-' && i < argc) { + bamFiles.push_back(file); + i++; + if (i < argc) + file = argv[i]; + } + i--; + } + } + } + + if (!showHelp) { + MultiCovBam *mc = new MultiCovBam(bamFiles, bedFile); + mc->CollectCoverage(); + delete mc; + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Counts sequence coverage for multiple bams at specific loci." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -bams aln.1.bam aln.2.bam ... aln.n.bam -bed <bed/gff/vcf>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-bams\t" << "The bam files." << endl << endl; + + cerr << "\t-bed\t" << "The bed file." << endl << endl; + + + + // end the program here + exit(1); + +} -- GitLab