From 0b3f2511e7b895c261997384f7747e941da88492 Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Thu, 23 Jun 2011 15:06:17 -0400 Subject: [PATCH] Pushed tagBam to GitHub --- Makefile | 1 + src/tagBam/Makefile | 51 ++++++++++++++ src/tagBam/tagBam.cpp | 108 +++++++++++++++++++++++++++++ src/tagBam/tagBam.h | 74 ++++++++++++++++++++ src/tagBam/tagBamMain.cpp | 141 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 375 insertions(+) create mode 100644 src/tagBam/Makefile create mode 100644 src/tagBam/tagBam.cpp create mode 100644 src/tagBam/tagBam.h create mode 100644 src/tagBam/tagBamMain.cpp diff --git a/Makefile b/Makefile index 8e1fa0b4..4ee8b3e1 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/slopBed \ $(SRC_DIR)/sortBed \ $(SRC_DIR)/subtractBed \ + $(SRC_DIR)/tagBam \ $(SRC_DIR)/unionBedGraphs \ $(SRC_DIR)/windowBed diff --git a/src/tagBam/Makefile b/src/tagBam/Makefile new file mode 100644 index 00000000..b2dbcc4c --- /dev/null +++ b/src/tagBam/Makefile @@ -0,0 +1,51 @@ +UTILITIES_DIR = ../utils/ +OBJ_DIR = ../../obj/ +BIN_DIR = ../../bin/ + +# ------------------- +# define our includes +# ------------------- +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ \ + -I$(UTILITIES_DIR)/version/ \ + -I$(UTILITIES_DIR)/gzstream/ \ + -I$(UTILITIES_DIR)/genomeFile/ \ + -I$(UTILITIES_DIR)/lineFileUtilities/ \ + -I$(UTILITIES_DIR)/fileType/ \ + -I$(UTILITIES_DIR)/BamTools/include \ + -I$(UTILITIES_DIR)/BamTools-Ancillary +# ---------------------------------- +# define our source and object files +# ---------------------------------- +SOURCES= tagBamMain.cpp tagBam.cpp +OBJECTS= $(SOURCES:.cpp=.o) +_EXT_OBJECTS=bedFile.o lineFileUtilities.o gzstream.o fileType.o +EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) +BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) +PROGRAM= tagBam + + +all: $(PROGRAM) + +.PHONY: all + +$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) + @echo " * linking $(PROGRAM)" + @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) -L$(UTILITIES_DIR)/BamTools/lib/ -lbamtools + +$(BUILT_OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) + +$(EXT_OBJECTS): + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/bedFile/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/lineFileUtilities/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/gzstream/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools-Ancillary/ + @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/fileType/ + +clean: + @echo "Cleaning up." + @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* + +.PHONY: clean diff --git a/src/tagBam/tagBam.cpp b/src/tagBam/tagBam.cpp new file mode 100644 index 00000000..3a65db8d --- /dev/null +++ b/src/tagBam/tagBam.cpp @@ -0,0 +1,108 @@ +/***************************************************************************** + tagBam.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "tagBam.h" + +// build +TagBam::TagBam(const string &bamFile, const vector<string> &annoFileNames, + const vector<string> &annoLables, bool forceStrand) : + + _bamFile(bamFile), + _annoFileNames(annoFileNames), + _annoLabels(annoLables), + _forceStrand(forceStrand) +{} + + +// destroy and delete the open file pointers +TagBam::~TagBam(void) { + delete _bed; + CloseAnnoFiles(); +} + + +void TagBam::OpenAnnoFiles() { + for (size_t i=0; i < _annoFileNames.size(); ++i) { + BedFile *file = new BedFile(_annoFileNames[i]); + file->loadBedFileIntoMap(); + _annoFiles.push_back(file); + } +} + + +void TagBam::CloseAnnoFiles() { + for (size_t i=0; i < _annoFiles.size(); ++i) { + BedFile *file = _annoFiles[i]; + delete file; + _annoFiles[i] = NULL; + } +} + +bool TagBam::FindOneOrMoreOverlap(const BED &a, BedFile *bedFile) { + return bedFile->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, + _forceStrand, 0); +} + +void TagBam::Tag() { + + // open the annotations files for processing; + OpenAnnoFiles(); + + // open the BAM file + BamReader reader; + BamWriter writer; + reader.Open(_bamFile); + // get header & reference information + string bamHeader = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // set compression mode + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; +// if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; + writer.SetCompressionMode(compressionMode); + // open our BAM writer + writer.Open("stdout", bamHeader, refs); + + // rip through the BAM file and test for overlaps with each annotation file. + BamAlignment al; + while (reader.GetNextAlignment(al)) { + if (al.IsMapped() == true) { + BED a; + a.chrom = refs.at(al.RefID).RefName; + a.start = al.Position; + a.end = al.GetEndPosition(false, false); + if (al.IsReverseStrand()) a.strand = "-"; + a.strand = "+"; + + ostringstream annotations; + // annotate the BAM file based on overlaps with the annotation files. + for (size_t i = 0; i < _annoFiles.size(); ++i) + { + // grab the current annotation file. + BedFile *anno = _annoFiles[i]; + // add the label for this annotation file to tag if there is overlap + if (FindOneOrMoreOverlap(a, anno)) { + annotations << _annoLabels[i] << ";"; + } + } + // were there any overlaps with which to make a tag? + if (annotations.str().size() > 0) { + al.AddTag("YB", "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" + } + writer.SaveAlignment(al); + } + } + reader.Close(); + + // close the annotations files; + CloseAnnoFiles(); +} diff --git a/src/tagBam/tagBam.h b/src/tagBam/tagBam.h new file mode 100644 index 00000000..85ada900 --- /dev/null +++ b/src/tagBam/tagBam.h @@ -0,0 +1,74 @@ +/***************************************************************************** + tagBam.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef TAGBAM_H +#define TAGBAM_H + +#include "bedFile.h" + +#include "version.h" +#include "api/BamReader.h" +#include "api/BamWriter.h" +#include "api/BamAux.h" +#include "BamAncillary.h" +using namespace BamTools; + +#include "bedFile.h" +#include <vector> +#include <algorithm> +#include <iostream> +#include <iomanip> +#include <fstream> +#include <stdlib.h> + +using namespace std; + +//************************************************ +// Class methods and elements +//************************************************ +class TagBam { + +public: + + // constructor + TagBam(const string &bamFile, const vector<string> &annoFileNames, + const vector<string> &annoLabels, bool forceStrand); + + // destructor + ~TagBam(void); + + // annotate the BAM file with all of the annotation files. + void Tag(); + +private: + + // input files. + string _bamFile; + vector<string> _annoFileNames; + vector<string> _annoLabels; + + // instance of a bed file class. + BedFile *_bed; + vector<BedFile*> _annoFiles; + + // do we care about strandedness when tagging? + bool _forceStrand; + + // private function for reporting coverage information + void ReportAnnotations(); + + void OpenAnnoFiles(); + + void CloseAnnoFiles(); + + bool FindOneOrMoreOverlap(const BED &a, BedFile *bedFile); +}; +#endif /* TAGBAM_H */ diff --git a/src/tagBam/tagBamMain.cpp b/src/tagBam/tagBamMain.cpp new file mode 100644 index 00000000..f66ae83b --- /dev/null +++ b/src/tagBam/tagBamMain.cpp @@ -0,0 +1,141 @@ +/***************************************************************************** + annotateMain.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "tagBam.h" +#include "version.h" + +using namespace std; + +// define the version +#define PROGRAM_NAME "tagBam" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void ShowHelp(void); + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input file + string bamFile; + + // parm flags + bool forceStrand = false; + bool haveBam = false; + bool haveFiles = false; + bool haveLabels = false; + + // list of annotation files / names + vector<string> inputFiles; + vector<string> inputLabels; + + // check to see if we should print out some help + if(argc <= 1) showHelp = true; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + haveBam = true; + bamFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-files", 6, parameterLength)) { + if ((i+1) < argc) { + haveFiles = true; + i = i+1; + string file = argv[i]; + while (file[0] != '-' && i < argc) { + inputFiles.push_back(file); + i++; + if (i < argc) + file = argv[i]; + } + i--; + } + } + else if(PARAMETER_CHECK("-tags", 5, parameterLength)) { + if ((i+1) < argc) { + haveLabels = true; + i = i+1; + string label = argv[i]; + while (label[0] != '-' && i < argc) { + inputLabels.push_back(label); + i++; + if (i < argc) + label = argv[i]; + } + i--; + } + } + else if (PARAMETER_CHECK("-s", 2, parameterLength)) { + forceStrand = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBam || !haveFiles || !haveLabels) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i, -files, and -labels. " << endl << "*****" << endl; + showHelp = true; + } + + if (!showHelp) { + TagBam *ba = new TagBam(bamFile, inputFiles, inputLabels, forceStrand); + ba->Tag(); + delete ba; + return 0; + } + else { + ShowHelp(); + } +} + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Annotates a BAM file based on overlaps with multiple BED/GFF/VCF files" << endl; + cerr << "\t on the intervals in -i." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <BAM> -files FILE1 FILE2 .. FILEn -labels LAB1 LAB2 ,,, LABn" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-s\t" << "Force strandedness. That is, only tag alignments that have the same" << endl; + cerr << "\t\tstrand as a feature in the annotation file(s)." << endl << endl; + + exit(1); +} -- GitLab