diff --git a/src/utils/BamTools/BGZF.cpp b/src/utils/BamTools/BGZF.cpp deleted file mode 100644 index 853d7b5194ee06e9e6a18b77369433b71337cc27..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BGZF.cpp +++ /dev/null @@ -1,398 +0,0 @@ -// *************************************************************************** -// BGZF.cpp (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 16 August 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#include <algorithm> -#include "BGZF.h" -using namespace BamTools; -using std::string; -using std::min; - -BgzfData::BgzfData(void) - : UncompressedBlockSize(DEFAULT_BLOCK_SIZE) - , CompressedBlockSize(MAX_BLOCK_SIZE) - , BlockLength(0) - , BlockOffset(0) - , BlockAddress(0) - , IsOpen(false) - , IsWriteOnly(false) - , IsWriteUncompressed(false) - , Stream(NULL) - , UncompressedBlock(NULL) - , CompressedBlock(NULL) -{ - try { - CompressedBlock = new char[CompressedBlockSize]; - UncompressedBlock = new char[UncompressedBlockSize]; - } catch( std::bad_alloc& ba ) { - printf("BGZF ERROR: unable to allocate memory for our BGZF object.\n"); - exit(1); - } -} - -// destructor -BgzfData::~BgzfData(void) { - if( CompressedBlock ) delete[] CompressedBlock; - if( UncompressedBlock ) delete[] UncompressedBlock; -} - -// closes BGZF file -void BgzfData::Close(void) { - - // skip if file not open, otherwise set flag - if ( !IsOpen ) return; - - // if writing to file, flush the current BGZF block, - // then 
write an empty block (as EOF marker) - if ( IsWriteOnly ) { - FlushBlock(); - int blockLength = DeflateBlock(); - fwrite(CompressedBlock, 1, blockLength, Stream); - } - - // flush and close - fflush(Stream); - fclose(Stream); - IsWriteUncompressed = false; - IsOpen = false; -} - -// compresses the current block -int BgzfData::DeflateBlock(void) { - - // initialize the gzip header - char* buffer = CompressedBlock; - memset(buffer, 0, 18); - buffer[0] = GZIP_ID1; - buffer[1] = (char)GZIP_ID2; - buffer[2] = CM_DEFLATE; - buffer[3] = FLG_FEXTRA; - buffer[9] = (char)OS_UNKNOWN; - buffer[10] = BGZF_XLEN; - buffer[12] = BGZF_ID1; - buffer[13] = BGZF_ID2; - buffer[14] = BGZF_LEN; - - // set compression level - const int compressionLevel = ( IsWriteUncompressed ? 0 : Z_DEFAULT_COMPRESSION ); - - // loop to retry for blocks that do not compress enough - int inputLength = BlockOffset; - int compressedLength = 0; - unsigned int bufferSize = CompressedBlockSize; - - while ( true ) { - - // initialize zstream values - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)UncompressedBlock; - zs.avail_in = inputLength; - zs.next_out = (Bytef*)&buffer[BLOCK_HEADER_LENGTH]; - zs.avail_out = bufferSize - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH; - - // initialize the zlib compression algorithm - if ( deflateInit2(&zs, compressionLevel, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY) != Z_OK ) { - printf("BGZF ERROR: zlib deflate initialization failed.\n"); - exit(1); - } - - // compress the data - int status = deflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - - deflateEnd(&zs); - - // reduce the input length and try again - if ( status == Z_OK ) { - inputLength -= 1024; - if( inputLength < 0 ) { - printf("BGZF ERROR: input reduction failed.\n"); - exit(1); - } - continue; - } - - printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - // finalize the compression routine - if ( deflateEnd(&zs) != Z_OK ) { - 
printf("BGZF ERROR: zlib::deflateEnd() failed.\n"); - exit(1); - } - - compressedLength = zs.total_out; - compressedLength += BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH; - if ( compressedLength > MAX_BLOCK_SIZE ) { - printf("BGZF ERROR: deflate overflow.\n"); - exit(1); - } - - break; - } - - // store the compressed length - BgzfData::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); - - // store the CRC32 checksum - unsigned int crc = crc32(0, NULL, 0); - crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 8], crc); - BgzfData::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); - - // ensure that we have less than a block of data left - int remaining = BlockOffset - inputLength; - if ( remaining > 0 ) { - if ( remaining > inputLength ) { - printf("BGZF ERROR: after deflate, remainder too large.\n"); - exit(1); - } - memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); - } - - BlockOffset = remaining; - return compressedLength; -} - -// flushes the data in the BGZF block -void BgzfData::FlushBlock(void) { - - // flush all of the remaining blocks - while ( BlockOffset > 0 ) { - - // compress the data block - int blockLength = DeflateBlock(); - - // flush the data to our output stream - int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); - - if ( numBytesWritten != blockLength ) { - printf("BGZF ERROR: expected to write %u bytes during flushing, but wrote %u bytes.\n", blockLength, numBytesWritten); - exit(1); - } - - BlockAddress += blockLength; - } -} - -// de-compresses the current block -int BgzfData::InflateBlock(const int& blockLength) { - - // Inflate the block in m_BGZF.CompressedBlock into m_BGZF.UncompressedBlock - z_stream zs; - zs.zalloc = NULL; - zs.zfree = NULL; - zs.next_in = (Bytef*)CompressedBlock + 18; - zs.avail_in = blockLength - 16; - zs.next_out = (Bytef*)UncompressedBlock; - zs.avail_out = UncompressedBlockSize; - - 
int status = inflateInit2(&zs, GZIP_WINDOW_BITS); - if ( status != Z_OK ) { - printf("BGZF ERROR: could not decompress block - zlib::inflateInit() failed\n"); - return -1; - } - - status = inflate(&zs, Z_FINISH); - if ( status != Z_STREAM_END ) { - inflateEnd(&zs); - printf("BGZF ERROR: could not decompress block - zlib::inflate() failed\n"); - return -1; - } - - status = inflateEnd(&zs); - if ( status != Z_OK ) { - printf("BGZF ERROR: could not decompress block - zlib::inflateEnd() failed\n"); - return -1; - } - - return zs.total_out; -} - -// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) -bool BgzfData::Open(const string& filename, const char* mode, bool isWriteUncompressed ) { - - // determine open mode - if ( strcmp(mode, "rb") == 0 ) - IsWriteOnly = false; - else if ( strcmp(mode, "wb") == 0) - IsWriteOnly = true; - else { - printf("BGZF ERROR: unknown file mode: %s\n", mode); - return false; - } - - // ---------------------------------------------------------------- - // open Stream to read to/write from file, stdin, or stdout - // stdin/stdout option contributed by Aaron Quinlan (2010-Jan-03) - - // read/write BGZF data to/from a file - if ( (filename != "stdin") && (filename != "stdout") ) - Stream = fopen(filename.c_str(), mode); - - // read BGZF data from stdin - else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) - Stream = freopen(NULL, mode, stdin); - - // write BGZF data to stdout - else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) - Stream = freopen(NULL, mode, stdout); - - if ( !Stream ) { - printf("BGZF ERROR: unable to open file %s\n", filename.c_str() ); - return false; - } - - // set flags, return success - IsOpen = true; - IsWriteUncompressed = isWriteUncompressed; - return true; -} - -// reads BGZF data into a byte buffer -int BgzfData::Read(char* data, const unsigned int dataLength) { - - if ( !IsOpen || IsWriteOnly || dataLength == 0 ) return 0; - - char* output = data; - 
unsigned int numBytesRead = 0; - while ( numBytesRead < dataLength ) { - - int bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) { - if ( !ReadBlock() ) return -1; - bytesAvailable = BlockLength - BlockOffset; - if ( bytesAvailable <= 0 ) break; - } - - char* buffer = UncompressedBlock; - int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); - memcpy(output, buffer + BlockOffset, copyLength); - - BlockOffset += copyLength; - output += copyLength; - numBytesRead += copyLength; - } - - if ( BlockOffset == BlockLength ) { - BlockAddress = ftell64(Stream); - BlockOffset = 0; - BlockLength = 0; - } - - return numBytesRead; -} - -// reads a BGZF block -bool BgzfData::ReadBlock(void) { - - char header[BLOCK_HEADER_LENGTH]; - int64_t blockAddress = ftell64(Stream); - - int count = fread(header, 1, sizeof(header), Stream); - if ( count == 0 ) { - BlockLength = 0; - return true; - } - - if ( count != sizeof(header) ) { - printf("BGZF ERROR: read block failed - could not read block header\n"); - return false; - } - - if ( !BgzfData::CheckBlockHeader(header) ) { - printf("BGZF ERROR: read block failed - invalid block header\n"); - return false; - } - - int blockLength = BgzfData::UnpackUnsignedShort(&header[16]) + 1; - char* compressedBlock = CompressedBlock; - memcpy(compressedBlock, header, BLOCK_HEADER_LENGTH); - int remaining = blockLength - BLOCK_HEADER_LENGTH; - - count = fread(&compressedBlock[BLOCK_HEADER_LENGTH], 1, remaining, Stream); - if ( count != remaining ) { - printf("BGZF ERROR: read block failed - could not read data from block\n"); - return false; - } - - count = InflateBlock(blockLength); - if ( count < 0 ) { - printf("BGZF ERROR: read block failed - could not decompress block data\n"); - return false; - } - - if ( BlockLength != 0 ) - BlockOffset = 0; - - BlockAddress = blockAddress; - BlockLength = count; - return true; -} - -// seek to position in BGZF file -bool BgzfData::Seek(int64_t position) { - - if ( !IsOpen 
) return false; - - int blockOffset = (position & 0xFFFF); - int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; - - if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { - printf("BGZF ERROR: unable to seek in file\n"); - return false; - } - - BlockLength = 0; - BlockAddress = blockAddress; - BlockOffset = blockOffset; - return true; -} - -// get file position in BGZF file -int64_t BgzfData::Tell(void) { - if ( !IsOpen ) - return false; - else - return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); -} - -// writes the supplied data into the BGZF buffer -unsigned int BgzfData::Write(const char* data, const unsigned int dataLen) { - - if ( !IsOpen || !IsWriteOnly ) return false; - - // initialize - unsigned int numBytesWritten = 0; - const char* input = data; - unsigned int blockLength = UncompressedBlockSize; - - // copy the data to the buffer - while ( numBytesWritten < dataLen ) { - - unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); - char* buffer = UncompressedBlock; - memcpy(buffer + BlockOffset, input, copyLength); - - BlockOffset += copyLength; - input += copyLength; - numBytesWritten += copyLength; - - if ( BlockOffset == blockLength ) - FlushBlock(); - } - - return numBytesWritten; -} diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h deleted file mode 100644 index 8a709f4d3a2098b4f4550a4dbdcbad0ef8444f85..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BGZF.h +++ /dev/null @@ -1,325 +0,0 @@ -// *************************************************************************** -// BGZF.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. 
-// --------------------------------------------------------------------------- -// Last modified: 16 August 2010 (DB) -// --------------------------------------------------------------------------- -// BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading & writing BGZF files -// *************************************************************************** - -#ifndef BGZF_H -#define BGZF_H - -// 'C' includes -#include <cstdio> -#include <cstdlib> -#include <cstring> - -// C++ includes -#include <string> - -// zlib includes -#include "zlib.h" - -// Platform-specific large-file support -#ifndef BAMTOOLS_LFS -#define BAMTOOLS_LFS - #ifdef WIN32 - #define ftell64(a) _ftelli64(a) - #define fseek64(a,b,c) _fseeki64(a,b,c) - #else - #define ftell64(a) ftello(a) - #define fseek64(a,b,c) fseeko(a,b,c) - #endif -#endif // BAMTOOLS_LFS - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include <stdint.h> - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// zlib constants -const int GZIP_ID1 = 31; -const int GZIP_ID2 = 139; -const int CM_DEFLATE = 8; -const int FLG_FEXTRA = 4; -const int OS_UNKNOWN = 255; -const int BGZF_XLEN = 6; -const int BGZF_ID1 = 66; -const int BGZF_ID2 = 67; -const int BGZF_LEN = 2; -const int GZIP_WINDOW_BITS = -15; -const int Z_DEFAULT_MEM_LEVEL = 8; - -// BZGF constants -const int BLOCK_HEADER_LENGTH = 18; -const int BLOCK_FOOTER_LENGTH = 8; -const int MAX_BLOCK_SIZE = 65536; -const int DEFAULT_BLOCK_SIZE = 65536; - -struct BgzfData { - - // data members - public: - unsigned int 
UncompressedBlockSize; - unsigned int CompressedBlockSize; - unsigned int BlockLength; - unsigned int BlockOffset; - uint64_t BlockAddress; - bool IsOpen; - bool IsWriteOnly; - bool IsWriteUncompressed; - FILE* Stream; - char* UncompressedBlock; - char* CompressedBlock; - - // constructor & destructor - public: - BgzfData(void); - ~BgzfData(void); - - // main interface methods - public: - // closes BGZF file - void Close(void); - // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) - bool Open(const std::string& filename, const char* mode, bool isWriteUncompressed = false); - // reads BGZF data into a byte buffer - int Read(char* data, const unsigned int dataLength); - // seek to position in BGZF file - bool Seek(int64_t position); - // get file position in BGZF file - int64_t Tell(void); - // writes the supplied data into the BGZF buffer - unsigned int Write(const char* data, const unsigned int dataLen); - - // internal methods - private: - // compresses the current block - int DeflateBlock(void); - // flushes the data in the BGZF block - void FlushBlock(void); - // de-compresses the current block - int InflateBlock(const int& blockLength); - // reads a BGZF block - bool ReadBlock(void); - - // static 'utility' methods - public: - // checks BGZF block header - static inline bool CheckBlockHeader(char* header); - // packs an unsigned integer into the specified buffer - static inline void PackUnsignedInt(char* buffer, unsigned int value); - // packs an unsigned short into the specified buffer - static inline void PackUnsignedShort(char* buffer, unsigned short value); - // unpacks a buffer into a double - static inline double UnpackDouble(char* buffer); - static inline double UnpackDouble(const char* buffer); - // unpacks a buffer into a float - static inline float UnpackFloat(char* buffer); - static inline float UnpackFloat(const char* buffer); - // unpacks a buffer into a signed int - static inline signed int UnpackSignedInt(char* buffer); 
- static inline signed int UnpackSignedInt(const char* buffer); - // unpacks a buffer into a signed short - static inline signed short UnpackSignedShort(char* buffer); - static inline signed short UnpackSignedShort(const char* buffer); - // unpacks a buffer into an unsigned int - static inline unsigned int UnpackUnsignedInt(char* buffer); - static inline unsigned int UnpackUnsignedInt(const char* buffer); - // unpacks a buffer into an unsigned short - static inline unsigned short UnpackUnsignedShort(char* buffer); - static inline unsigned short UnpackUnsignedShort(const char* buffer); -}; - -// ------------------------------------------------------------- -// static 'utility' method implementations - -// checks BGZF block header -inline -bool BgzfData::CheckBlockHeader(char* header) { - return (header[0] == GZIP_ID1 && - header[1] == (char)GZIP_ID2 && - header[2] == Z_DEFLATED && - (header[3] & FLG_FEXTRA) != 0 && - BgzfData::UnpackUnsignedShort(&header[10]) == BGZF_XLEN && - header[12] == BGZF_ID1 && - header[13] == BGZF_ID2 && - BgzfData::UnpackUnsignedShort(&header[14]) == BGZF_LEN ); -} - -// 'packs' an unsigned integer into the specified buffer -inline -void BgzfData::PackUnsignedInt(char* buffer, unsigned int value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); - buffer[2] = (char)(value >> 16); - buffer[3] = (char)(value >> 24); -} - -// 'packs' an unsigned short into the specified buffer -inline -void BgzfData::PackUnsignedShort(char* buffer, unsigned short value) { - buffer[0] = (char)value; - buffer[1] = (char)(value >> 8); -} - -// 'unpacks' a buffer into a double (includes both non-const & const char* flavors) -inline -double BgzfData::UnpackDouble(char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = 
buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -inline -double BgzfData::UnpackDouble(const char* buffer) { - union { double value; unsigned char valueBuffer[sizeof(double)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - un.valueBuffer[4] = buffer[4]; - un.valueBuffer[5] = buffer[5]; - un.valueBuffer[6] = buffer[6]; - un.valueBuffer[7] = buffer[7]; - return un.value; -} - -// 'unpacks' a buffer into a float (includes both non-const & const char* flavors) -inline -float BgzfData::UnpackFloat(char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -float BgzfData::UnpackFloat(const char* buffer) { - union { float value; unsigned char valueBuffer[sizeof(float)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed int (includes both non-const & const char* flavors) -inline -signed int BgzfData::UnpackSignedInt(char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -signed int BgzfData::UnpackSignedInt(const char* buffer) { - union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into a signed short (includes 
both non-const & const char* flavors) -inline -signed short BgzfData::UnpackSignedShort(char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -signed short BgzfData::UnpackSignedShort(const char* buffer) { - union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned int (includes both non-const & const char* flavors) -inline -unsigned int BgzfData::UnpackUnsignedInt(char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -inline -unsigned int BgzfData::UnpackUnsignedInt(const char* buffer) { - union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - un.valueBuffer[2] = buffer[2]; - un.valueBuffer[3] = buffer[3]; - return un.value; -} - -// 'unpacks' a buffer into an unsigned short (includes both non-const & const char* flavors) -inline -unsigned short BgzfData::UnpackUnsignedShort(char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -inline -unsigned short BgzfData::UnpackUnsignedShort(const char* buffer) { - union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; - un.value = 0; - un.valueBuffer[0] = buffer[0]; - un.valueBuffer[1] = buffer[1]; - return un.value; -} - -} // namespace BamTools - -#endif // BGZF_H diff --git 
a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h deleted file mode 100644 index f92875e611994c3c4e8892c1958c599f2a942c1b..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamAux.h +++ /dev/null @@ -1,991 +0,0 @@ -// *************************************************************************** -// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 27 July 2010 (DB) -// --------------------------------------------------------------------------- -// Provides the basic constants, data structures, etc. for using BAM files -// *************************************************************************** - -#ifndef BAMAUX_H -#define BAMAUX_H - -// C inclues -#include <cctype> -#include <cstdio> -#include <cstdlib> -#include <cstring> - -// C++ includes -#include <exception> -#include <map> -#include <string> -#include <utility> -#include <vector> - -// Platform-specific type definitions -#ifndef BAMTOOLS_TYPES -#define BAMTOOLS_TYPES - #ifdef _MSC_VER - typedef char int8_t; - typedef unsigned char uint8_t; - typedef short int16_t; - typedef unsigned short uint16_t; - typedef int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #else - #include <stdint.h> - #endif -#endif // BAMTOOLS_TYPES - -namespace BamTools { - -// BAM constants -const int BAM_CORE_SIZE = 32; -const int BAM_CMATCH = 0; -const int BAM_CINS = 1; -const int BAM_CDEL = 2; -const int BAM_CREF_SKIP = 3; -const int BAM_CSOFT_CLIP = 4; -const int BAM_CHARD_CLIP = 5; -const int BAM_CPAD = 6; -const int BAM_CIGAR_SHIFT = 4; -const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); - -// BAM index constants -const int MAX_BIN = 37450; // =(8^6-1)/7+1 -const int BAM_MIN_CHUNK_GAP = 32768; -const int BAM_LIDX_SHIFT = 14; - -// Explicit variable sizes 
-const int BT_SIZEOF_INT = 4; - -struct CigarOp; - -struct BamAlignment { - - // constructors & destructor - public: - BamAlignment(void); - BamAlignment(const BamAlignment& other); - ~BamAlignment(void); - - // Queries against alignment flags - public: - bool IsDuplicate(void) const; // Returns true if this read is a PCR duplicate - bool IsFailedQC(void) const; // Returns true if this read failed quality control - bool IsFirstMate(void) const; // Returns true if alignment is first mate on read - bool IsMapped(void) const; // Returns true if alignment is mapped - bool IsMateMapped(void) const; // Returns true if alignment's mate is mapped - bool IsMateReverseStrand(void) const; // Returns true if alignment's mate mapped to reverse strand - bool IsPaired(void) const; // Returns true if alignment part of paired-end read - bool IsPrimaryAlignment(void) const; // Returns true if reported position is primary alignment - bool IsProperPair(void) const; // Returns true if alignment is part of read that satisfied paired-end resolution - bool IsReverseStrand(void) const; // Returns true if alignment mapped to reverse strand - bool IsSecondMate(void) const; // Returns true if alignment is second mate on read - - // Manipulate alignment flags - public: - void SetIsDuplicate(bool ok); // Sets "PCR duplicate" flag - void SetIsFailedQC(bool ok); // Sets "failed quality control" flag - void SetIsFirstMate(bool ok); // Sets "alignment is first mate" flag - void SetIsMateUnmapped(bool ok); // Sets "alignment's mate is mapped" flag - void SetIsMateReverseStrand(bool ok); // Sets "alignment's mate mapped to reverse strand" flag - void SetIsPaired(bool ok); // Sets "alignment part of paired-end read" flag - void SetIsProperPair(bool ok); // Sets "alignment is part of read that satisfied paired-end resolution" flag - void SetIsReverseStrand(bool ok); // Sets "alignment mapped to reverse strand" flag - void SetIsSecondaryAlignment(bool ok); // Sets "position is primary alignment" flag - 
void SetIsSecondMate(bool ok); // Sets "alignment is second mate on read" flag - void SetIsUnmapped(bool ok); // Sets "alignment is mapped" flag - - // Tag data access methods - public: - // ------------------------------------------------------------------------------------- - // N.B. - The following tag-modifying methods may not be used on BamAlignments fetched - // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in - // error message (to keep output clean) but will ALWAYS return false. Only user- - // generated BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid. - - // add tag data (create new TAG entry with TYPE and VALUE) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if new data added, false if error or TAG already exists - // N.B. - will NOT modify existing tag. Use EditTag() instead - bool AddTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool AddTag(const std::string& tag, const std::string& type, const float& value); // type must be A, i, or f - - // edit tag data (sets existing TAG with TYPE to VALUE or adds new TAG if not already present) - // TYPE is one of {A, i, f, Z, H} depending on VALUE - see SAM/BAM spec for details - // returns true if edit was successfaul, false if error - bool EditTag(const std::string& tag, const std::string& type, const std::string& value); // type must be Z or H - bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); // type must be A or i - bool EditTag(const std::string& tag, const std::string& 
type, const float& value); // type must be A, i, or f - - // specific tag data access methods - these only remain for legacy support - bool GetEditDistance(uint32_t& editDistance) const; // get "NM" tag data (implemented as GetTag("NM", editDistance)) - bool GetReadGroup(std::string& readGroup) const; // get "RG" tag data (implemented as GetTag("RG", readGroup)) - - // generic tag data access methods - bool GetTag(const std::string& tag, std::string& destination) const; // access variable-length char or hex strings - bool GetTag(const std::string& tag, uint32_t& destination) const; // access unsigned integer data - bool GetTag(const std::string& tag, int32_t& destination) const; // access signed integer data - bool GetTag(const std::string& tag, float& destination) const; // access floating point data - - // remove tag data - // returns true if removal was successful, false if error - // N.B. - returns false if TAG does not exist (no removal can occur) - bool RemoveTag(const std::string& tag); - - // Additional data access methods - public: - int GetEndPosition(bool usePadded = false) const; // calculates alignment end position, based on starting position and CIGAR operations - - // 'internal' utility methods - private: - static bool FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed); - static bool SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed); - - // Data members - public: - std::string Name; // Read name - int32_t Length; // Query length - std::string QueryBases; // 'Original' sequence (as reported from sequencing machine) - std::string AlignedBases; // 'Aligned' sequence (includes any indels, padding, clipping) - std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) - std::string TagData; // Tag data (accessor methods will pull the requested information out) - int32_t RefID; // ID number for reference sequence - int32_t Position; // 
Position (0-based) where alignment starts - uint16_t Bin; // Bin in BAM file where this alignment resides - uint16_t MapQuality; // Mapping quality score - uint32_t AlignmentFlag; // Alignment bit-flag - see Is<something>() methods to query this value, SetIs<something>() methods to manipulate - std::vector<CigarOp> CigarData; // CIGAR operations for this alignment - int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned - int32_t MatePosition; // Position (0-based) where alignment's mate starts - int32_t InsertSize; // Mate-pair insert size - - // internal data - public: - struct BamAlignmentSupportData { - - // data members - std::string AllCharData; - uint32_t BlockLength; - uint32_t NumCigarOperations; - uint32_t QueryNameLength; - uint32_t QuerySequenceLength; - bool HasCoreOnly; - - // constructor - BamAlignmentSupportData(void) - : BlockLength(0) - , NumCigarOperations(0) - , QueryNameLength(0) - , QuerySequenceLength(0) - , HasCoreOnly(false) - { } - }; - - // contains raw character data & lengths - BamAlignmentSupportData SupportData; - - // allow these classes access to BamAlignment private members (SupportData) - // but client code should not need to touch this data - friend class BamReader; - friend class BamWriter; - - // Alignment flag query constants - // Use the get/set methods above instead - private: - enum { PAIRED = 1 - , PROPER_PAIR = 2 - , UNMAPPED = 4 - , MATE_UNMAPPED = 8 - , REVERSE = 16 - , MATE_REVERSE = 32 - , READ_1 = 64 - , READ_2 = 128 - , SECONDARY = 256 - , QC_FAILED = 512 - , DUPLICATE = 1024 - }; -}; - -// ---------------------------------------------------------------- -// Auxiliary data structs & typedefs - -struct CigarOp { - - // data members - char Type; // Operation type (MIDNSHP) - uint32_t Length; // Operation length (number of bases) - - // constructor - CigarOp(const char type = '\0', - const uint32_t length = 0) - : Type(type) - , Length(length) - { } -}; - -struct RefData { - - // data 
members - std::string RefName; // Name of reference sequence - int32_t RefLength; // Length of reference sequence - bool RefHasAlignments; // True if BAM file contains alignments mapped to reference sequence - - // constructor - RefData(const int32_t& length = 0, - bool ok = false) - : RefLength(length) - , RefHasAlignments(ok) - { } -}; - -typedef std::vector<RefData> RefVector; -typedef std::vector<BamAlignment> BamAlignmentVector; - -struct BamRegion { - - // data members - int LeftRefID; - int LeftPosition; - int RightRefID; - int RightPosition; - - // constructor - BamRegion(const int& leftID = -1, - const int& leftPos = -1, - const int& rightID = -1, - const int& rightPos = -1) - : LeftRefID(leftID) - , LeftPosition(leftPos) - , RightRefID(rightID) - , RightPosition(rightPos) - { } -}; - -// ---------------------------------------------------------------- -// Added: 3-35-2010 DWB -// Fixed: Routines to provide endian-correctness -// ---------------------------------------------------------------- - -// returns true if system is big endian -inline bool SystemIsBigEndian(void) { - const uint16_t one = 0x0001; - return ((*(char*) &one) == 0 ); -} - -// swaps endianness of 16-bit value 'in place' -inline void SwapEndian_16(int16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -inline void SwapEndian_16(uint16_t& x) { - x = ((x >> 8) | (x << 8)); -} - -// swaps endianness of 32-bit value 'in-place' -inline void SwapEndian_32(int32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -inline void SwapEndian_32(uint32_t& x) { - x = ( (x >> 24) | - ((x << 8) & 0x00FF0000) | - ((x >> 8) & 0x0000FF00) | - (x << 24) - ); -} - -// swaps endianness of 64-bit value 'in-place' -inline void SwapEndian_64(int64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - 
((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -inline void SwapEndian_64(uint64_t& x) { - x = ( (x >> 56) | - ((x << 40) & 0x00FF000000000000ll) | - ((x << 24) & 0x0000FF0000000000ll) | - ((x << 8) & 0x000000FF00000000ll) | - ((x >> 8) & 0x00000000FF000000ll) | - ((x >> 24) & 0x0000000000FF0000ll) | - ((x >> 40) & 0x000000000000FF00ll) | - (x << 56) - ); -} - -// swaps endianness of 'next 2 bytes' in a char buffer (in-place) -inline void SwapEndian_16p(char* data) { - uint16_t& value = (uint16_t&)*data; - SwapEndian_16(value); -} - -// swaps endianness of 'next 4 bytes' in a char buffer (in-place) -inline void SwapEndian_32p(char* data) { - uint32_t& value = (uint32_t&)*data; - SwapEndian_32(value); -} - -// swaps endianness of 'next 8 bytes' in a char buffer (in-place) -inline void SwapEndian_64p(char* data) { - uint64_t& value = (uint64_t&)*data; - SwapEndian_64(value); -} - -// ---------------------------------------------------------------- -// BamAlignment member methods - -// constructors & destructor -inline BamAlignment::BamAlignment(void) { } - -inline BamAlignment::BamAlignment(const BamAlignment& other) - : Name(other.Name) - , Length(other.Length) - , QueryBases(other.QueryBases) - , AlignedBases(other.AlignedBases) - , Qualities(other.Qualities) - , TagData(other.TagData) - , RefID(other.RefID) - , Position(other.Position) - , Bin(other.Bin) - , MapQuality(other.MapQuality) - , AlignmentFlag(other.AlignmentFlag) - , CigarData(other.CigarData) - , MateRefID(other.MateRefID) - , MatePosition(other.MatePosition) - , InsertSize(other.InsertSize) - , SupportData(other.SupportData) -{ } - -inline BamAlignment::~BamAlignment(void) { } - -// Queries against alignment flags -inline bool BamAlignment::IsDuplicate(void) const { return ( (AlignmentFlag & DUPLICATE) != 0 ); } -inline bool BamAlignment::IsFailedQC(void) const { return ( (AlignmentFlag & QC_FAILED) != 0 ); } -inline bool BamAlignment::IsFirstMate(void) const { return ( (AlignmentFlag & 
READ_1) != 0 ); } -inline bool BamAlignment::IsMapped(void) const { return ( (AlignmentFlag & UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateMapped(void) const { return ( (AlignmentFlag & MATE_UNMAPPED) == 0 ); } -inline bool BamAlignment::IsMateReverseStrand(void) const { return ( (AlignmentFlag & MATE_REVERSE) != 0 ); } -inline bool BamAlignment::IsPaired(void) const { return ( (AlignmentFlag & PAIRED) != 0 ); } -inline bool BamAlignment::IsPrimaryAlignment(void) const { return ( (AlignmentFlag & SECONDARY) == 0 ); } -inline bool BamAlignment::IsProperPair(void) const { return ( (AlignmentFlag & PROPER_PAIR) != 0 ); } -inline bool BamAlignment::IsReverseStrand(void) const { return ( (AlignmentFlag & REVERSE) != 0 ); } -inline bool BamAlignment::IsSecondMate(void) const { return ( (AlignmentFlag & READ_2) != 0 ); } - -// Manipulate alignment flags -inline void BamAlignment::SetIsDuplicate(bool ok) { if (ok) AlignmentFlag |= DUPLICATE; else AlignmentFlag &= ~DUPLICATE; } -inline void BamAlignment::SetIsFailedQC(bool ok) { if (ok) AlignmentFlag |= QC_FAILED; else AlignmentFlag &= ~QC_FAILED; } -inline void BamAlignment::SetIsFirstMate(bool ok) { if (ok) AlignmentFlag |= READ_1; else AlignmentFlag &= ~READ_1; } -inline void BamAlignment::SetIsMateUnmapped(bool ok) { if (ok) AlignmentFlag |= MATE_UNMAPPED; else AlignmentFlag &= ~MATE_UNMAPPED; } -inline void BamAlignment::SetIsMateReverseStrand(bool ok) { if (ok) AlignmentFlag |= MATE_REVERSE; else AlignmentFlag &= ~MATE_REVERSE; } -inline void BamAlignment::SetIsPaired(bool ok) { if (ok) AlignmentFlag |= PAIRED; else AlignmentFlag &= ~PAIRED; } -inline void BamAlignment::SetIsProperPair(bool ok) { if (ok) AlignmentFlag |= PROPER_PAIR; else AlignmentFlag &= ~PROPER_PAIR; } -inline void BamAlignment::SetIsReverseStrand(bool ok) { if (ok) AlignmentFlag |= REVERSE; else AlignmentFlag &= ~REVERSE; } -inline void BamAlignment::SetIsSecondaryAlignment(bool ok) { if (ok) AlignmentFlag |= SECONDARY; else AlignmentFlag 
&= ~SECONDARY; } -inline void BamAlignment::SetIsSecondMate(bool ok) { if (ok) AlignmentFlag |= READ_2; else AlignmentFlag &= ~READ_2; } -inline void BamAlignment::SetIsUnmapped(bool ok) { if (ok) AlignmentFlag |= UNMAPPED; else AlignmentFlag &= ~UNMAPPED; } - -// calculates alignment end position, based on starting position and CIGAR operations -inline -int BamAlignment::GetEndPosition(bool usePadded) const { - - // initialize alignment end to starting position - int alignEnd = Position; - - // iterate over cigar operations - std::vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); - std::vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter) { - const char cigarType = (*cigarIter).Type; - if ( cigarType == 'M' || cigarType == 'D' || cigarType == 'N' ) { - alignEnd += (*cigarIter).Length; - } - else if ( usePadded && cigarType == 'I' ) { - alignEnd += (*cigarIter).Length; - } - } - return alignEnd; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, copy tag data to temp buffer - std::string newTag = tag + type + value; - const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, 
appends newTag + null-term - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(unsigned int)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) { - return AddTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return 
false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag already exists, return false - // use EditTag explicitly instead - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) return false; - - // otherwise, convert value to string - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - - // copy original tag data to temp buffer - std::string newTag = tag + type; - const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new float - char originalTagData[newTagDataLength]; - memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term - - // append newTag - strcat(originalTagData + tagDataLength, newTag.data()); - memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(float)); - - // store temp buffer back in TagData - const char* newTagData = (const char*)originalTagData; - TagData.assign(newTagData, newTagDataLength); - - // return success - return true; -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type != "Z" && type != "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + value.size()]; - - // copy original tag data up til desired tag - const unsigned int 
beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - const unsigned int dataLength = strlen(value.c_str()); - memcpy(newTagData + beginningTagDataLength, (char*)value.c_str(), dataLength+1 ); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + dataLength + 1; - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "f" || type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - 
// copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { unsigned int value; char valueBuffer[sizeof(unsigned int)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(unsigned int)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(unsigned int); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) { - return EditTag(tag, type, (const uint32_t&)value); -} - -inline -bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { - - if ( SupportData.HasCoreOnly ) return false; - if ( tag.size() != 2 || type.size() != 1 ) return false; - if ( type == "Z" || type == "H" ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - - unsigned int newTagDataLength = 0; - unsigned int 
numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - // make sure array is more than big enough - char newTagData[originalTagDataLength + sizeof(value)]; - - // copy original tag data up til desired tag - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // copy new VALUE in place of current tag data - union { float value; char valueBuffer[sizeof(float)]; } un; - un.value = value; - memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData - 1; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); - - // ensure null-terminator - newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; - - // save new tag data - TagData.assign(newTagData, endTagOffset + endTagDataLength); - return true; - } - - // tag not found, attempt AddTag - else return AddTag(tag, type, value); -} - -// get "NM" tag data - originally contributed by Aaron Quinlan -// stores data in 'editDistance', returns success/fail -inline -bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { - return GetTag("NM", (uint32_t&)editDistance); -} - -// get "RG" tag data -// stores data in 'readGroup', returns success/fail -inline -bool BamAlignment::GetReadGroup(std::string& readGroup) const { - return GetTag("RG", readGroup); 
-} - -inline -bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - const unsigned int dataLength = strlen(pTagData); - destination.clear(); - destination.resize(dataLength); - memcpy( (char*)destination.data(), pTagData, dataLength ); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch (type) { - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type for integer destination (float or var-length strings) - case 'f': - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0; - memcpy(&destination, 
pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { - return GetTag(tag, (uint32_t&)destination); -} - -inline -bool BamAlignment::GetTag(const std::string& tag, float& destination) const { - - // make sure tag data exists - if ( SupportData.HasCoreOnly || TagData.empty() ) - return false; - - // localize the tag data - char* pTagData = (char*)TagData.data(); - const unsigned int tagDataLength = TagData.size(); - unsigned int numBytesParsed = 0; - - // if tag found, determine data byte-length, store data in readGroup, return success - if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { - //pTagData += numBytesParsed; - - // determine data byte-length - const char type = *(pTagData - 1); - int destinationLength = 0; - switch(type) { - - // 1 byte data - case 'A': - case 'c': - case 'C': - destinationLength = 1; - break; - - // 2 byte data - case 's': - case 'S': - destinationLength = 2; - break; - - // 4 byte data - case 'f': - case 'i': - case 'I': - destinationLength = 4; - break; - - // unsupported type (var-length strings) - case 'Z': - case 'H': - printf("ERROR: Cannot store tag of type %c in integer destination\n", type); - return false; - - // unknown tag type - default: - printf("ERROR: Unknown tag storage class encountered: [%c]\n", type); - return false; - } - - // store in destination - destination = 0.0; - memcpy(&destination, pTagData, destinationLength); - return true; - } - - // tag not found, return failure - return false; -} - -inline -bool BamAlignment::RemoveTag(const std::string& tag) { - - // BamAlignments fetched using BamReader::GetNextAlignmentCore() are not allowed - // also, return false if no data present to remove - if ( SupportData.HasCoreOnly || TagData.empty() ) return false; - - // localize the tag data - char* pOriginalTagData = (char*)TagData.data(); - char* pTagData = 
pOriginalTagData; - const unsigned int originalTagDataLength = TagData.size(); - unsigned int newTagDataLength = 0; - unsigned int numBytesParsed = 0; - - // if tag found, store data in readGroup, return success - if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { - - char newTagData[originalTagDataLength]; - - // copy original tag data up til desired tag - pTagData -= 3; - numBytesParsed -= 3; - const unsigned int beginningTagDataLength = numBytesParsed; - newTagDataLength += beginningTagDataLength; - memcpy(newTagData, pOriginalTagData, numBytesParsed); - - // skip to next tag (if tag for removal is last, return true) - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return true; - - // copy everything from current tag (the next one after tag for removal) to end - const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); - const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; - memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); - - // save new tag data - TagData.assign(newTagData, beginningTagDataLength + endTagDataLength); - return true; - } - - // tag not found, no removal - return failure - return false; -} - -inline -bool BamAlignment::FindTag(const std::string& tag, char* &pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) { - - while ( numBytesParsed < tagDataLength ) { - - const char* pTagType = pTagData; - const char* pTagStorageType = pTagData + 2; - pTagData += 3; - numBytesParsed += 3; - - // check the current tag, return true on match - if ( std::strncmp(pTagType, tag.c_str(), 2) == 0 ) - return true; - - // get the storage class and find the next tag - if ( *pTagStorageType == '\0' ) return false; - if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; - if ( *pTagData == '\0' ) 
return false; - } - - // checked all tags, none match - return false; -} - -inline -bool BamAlignment::SkipToNextTag(const char storageType, char* &pTagData, unsigned int& numBytesParsed) { - - switch(storageType) { - - case 'A': - case 'c': - case 'C': - ++numBytesParsed; - ++pTagData; - break; - - case 's': - case 'S': - numBytesParsed += 2; - pTagData += 2; - break; - - case 'f': - case 'i': - case 'I': - numBytesParsed += 4; - pTagData += 4; - break; - - case 'Z': - case 'H': - while(*pTagData) { - ++numBytesParsed; - ++pTagData; - } - // increment for null-terminator - ++numBytesParsed; - ++pTagData; - break; - - default: - // error case - printf("ERROR: Unknown tag storage class encountered: [%c]\n", storageType); - return false; - } - - // return success - return true; -} - -} // namespace BamTools - -#endif // BAMAUX_H diff --git a/src/utils/BamTools/BamIndex.cpp b/src/utils/BamTools/BamIndex.cpp deleted file mode 100644 index a3fa09e9a11fb307e106fe965b1cbd346c8c2bc4..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamIndex.cpp +++ /dev/null @@ -1,926 +0,0 @@ -// *************************************************************************** -// BamIndex.cpp (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). 
-// *************************************************************************** - -#include <cstdio> -#include <cstdlib> -#include <algorithm> -// #include <iostream> -#include <map> -#include "BamIndex.h" -#include "BamReader.h" -#include "BGZF.h" -using namespace std; -using namespace BamTools; - -// ------------------------------- -// BamIndex implementation - -BamIndex::BamIndex(BamTools::BgzfData* bgzf, BamTools::BamReader* reader, bool isBigEndian) - : m_BGZF(bgzf) - , m_reader(reader) - , m_isBigEndian(isBigEndian) -{ - if ( m_reader && m_reader->IsOpen() ) - m_references = m_reader->GetReferenceData(); -} - -bool BamIndex::HasAlignments(const int& referenceID) { - - // return false if invalid ID - if ( (referenceID < 0) || (referenceID >= (int)m_references.size()) ) - return false; - - // else return status of reference (has alignments?) - else - return m_references.at(referenceID).RefHasAlignments; -} - -// ######################################################################################### -// ######################################################################################### - -// ------------------------------- -// BamDefaultIndex structs & typedefs - -namespace BamTools { - -// -------------------------------------------------- -// BamDefaultIndex data structures & typedefs -struct Chunk { - - // data members - uint64_t Start; - uint64_t Stop; - - // constructor - Chunk(const uint64_t& start = 0, - const uint64_t& stop = 0) - : Start(start) - , Stop(stop) - { } -}; - -bool ChunkLessThan(const Chunk& lhs, const Chunk& rhs) { - return lhs.Start < rhs.Start; -} - -typedef vector<Chunk> ChunkVector; -typedef map<uint32_t, ChunkVector> BamBinMap; -typedef vector<uint64_t> LinearOffsetVector; - -struct ReferenceIndex { - - // data members - BamBinMap Bins; - LinearOffsetVector Offsets; - - // constructor - ReferenceIndex(const BamBinMap& binMap = BamBinMap(), - const LinearOffsetVector& offsets = LinearOffsetVector()) - : Bins(binMap) - , 
Offsets(offsets) - { } -}; - -typedef vector<ReferenceIndex> BamDefaultIndexData; - -} // namespace BamTools - -// ------------------------------- -// BamDefaultIndex implementation - -struct BamDefaultIndex::BamDefaultIndexPrivate { - - // ------------------------- - // data members - - BamDefaultIndexData m_indexData; - BamDefaultIndex* m_parent; - - // ------------------------- - // ctor & dtor - - BamDefaultIndexPrivate(BamDefaultIndex* parent) : m_parent(parent) { } - ~BamDefaultIndexPrivate(void) { } - - // ------------------------- - // internal methods - - // calculate bins that overlap region - int BinsFromRegion(const BamTools::BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[BamTools::MAX_BIN]); - // saves BAM bin entry for index - void InsertBinEntry(BamBinMap& binMap, const uint32_t& saveBin, const uint64_t& saveOffset, const uint64_t& lastOffset); - // saves linear offset entry for index - void InsertLinearOffset(LinearOffsetVector& offsets, const BamAlignment& bAlignment, const uint64_t& lastOffset); - // simplifies index by merging 'chunks' - void MergeChunks(void); - -}; - -BamDefaultIndex::BamDefaultIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamDefaultIndexPrivate(this); -} - -BamDefaultIndex::~BamDefaultIndex(void) { - d->m_indexData.clear(); - delete d; - d = 0; -} - -// calculate bins that overlap region -int BamDefaultIndex::BamDefaultIndexPrivate::BinsFromRegion(const BamRegion& region, const bool isRightBoundSpecified, uint16_t bins[MAX_BIN]) { - - // get region boundaries - uint32_t begin = (unsigned int)region.LeftPosition; - uint32_t end; - - // if right bound specified AND left&right bounds are on same reference - // OK to use right bound position - if ( isRightBoundSpecified && ( region.LeftRefID == region.RightRefID ) ) - end = (unsigned int)region.RightPosition; - - // otherwise, use end of left bound reference as cutoff - else - end = (unsigned 
int)m_parent->m_references.at(region.LeftRefID).RefLength - 1; - - // initialize list, bin '0' always a valid bin - int i = 0; - bins[i++] = 0; - - // get rest of bins that contain this region - unsigned int k; - for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { bins[i++] = k; } - for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { bins[i++] = k; } - for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { bins[i++] = k; } - for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { bins[i++] = k; } - for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { bins[i++] = k; } - - // return number of bins stored - return i; -} - -bool BamDefaultIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // get reference count, reserve index space - int numReferences = (int)m_references.size(); - for ( int i = 0; i < numReferences; ++i ) { - d->m_indexData.push_back(ReferenceIndex()); - } - - // sets default constant for bin, ID, offset, coordinate variables - const uint32_t defaultValue = 0xffffffffu; - - // bin data - uint32_t saveBin(defaultValue); - uint32_t lastBin(defaultValue); - - // reference ID data - int32_t saveRefID(defaultValue); - int32_t lastRefID(defaultValue); - - // offset data - uint64_t saveOffset = m_BGZF->Tell(); - uint64_t lastOffset = saveOffset; - - // coordinate data - int32_t lastCoordinate = defaultValue; - - BamAlignment bAlignment; - while ( m_reader->GetNextAlignmentCore(bAlignment) ) { - - // change of chromosome, save ID, reset bin - if ( lastRefID != bAlignment.RefID ) { - lastRefID = bAlignment.RefID; - lastBin = defaultValue; - } - - // if lastCoordinate greater than BAM position - file not sorted properly - else if ( lastCoordinate > bAlignment.Position ) { - printf("BAM file not properly sorted:\n"); - printf("Alignment %s : %d > %d on reference (id = %d)", 
bAlignment.Name.c_str(), lastCoordinate, bAlignment.Position, bAlignment.RefID); - exit(1); - } - - // if valid reference && BAM bin spans some minimum cutoff (smaller bin ids span larger regions) - if ( (bAlignment.RefID >= 0) && (bAlignment.Bin < 4681) ) { - - // save linear offset entry (matched to BAM entry refID) - ReferenceIndex& refIndex = d->m_indexData.at(bAlignment.RefID); - LinearOffsetVector& offsets = refIndex.Offsets; - d->InsertLinearOffset(offsets, bAlignment, lastOffset); - } - - // if current BamAlignment bin != lastBin, "then possibly write the binning index" - if ( bAlignment.Bin != lastBin ) { - - // if not first time through - if ( saveBin != defaultValue ) { - - // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // update saveOffset - saveOffset = lastOffset; - - // update bin values - saveBin = bAlignment.Bin; - lastBin = bAlignment.Bin; - - // update saveRefID - saveRefID = bAlignment.RefID; - - // if invalid RefID, break out (why?) 
- if ( saveRefID < 0 ) { break; } - } - - // make sure that current file pointer is beyond lastOffset - if ( m_BGZF->Tell() <= (int64_t)lastOffset ) { - printf("Error in BGZF offsets.\n"); - exit(1); - } - - // update lastOffset - lastOffset = m_BGZF->Tell(); - - // update lastCoordinate - lastCoordinate = bAlignment.Position; - } - - // save any leftover BAM data (as long as refID is valid) - if ( saveRefID >= 0 ) { - // save Bam bin entry - ReferenceIndex& refIndex = d->m_indexData.at(saveRefID); - BamBinMap& binMap = refIndex.Bins; - d->InsertBinEntry(binMap, saveBin, saveOffset, lastOffset); - } - - // simplify index by merging chunks - d->MergeChunks(); - - // iterate through references in index - // store whether reference has data & - // sort offsets in linear offset vector - BamDefaultIndexData::iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = d->m_indexData.end(); - for ( int i = 0; indexIter != indexEnd; ++indexIter, ++i ) { - - // get reference index data - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& binMap = refIndex.Bins; - LinearOffsetVector& offsets = refIndex.Offsets; - - // store whether reference has alignments or no - m_references[i].RefHasAlignments = ( binMap.size() > 0 ); - - // sort linear offsets - sort(offsets.begin(), offsets.end()); - } - - // rewind file pointer to beginning of alignments, return success/fail - return m_reader->Rewind(); -} - -bool BamDefaultIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { - - // calculate which bins overlap this region - uint16_t* bins = (uint16_t*)calloc(MAX_BIN, 2); - int numBins = d->BinsFromRegion(region, isRightBoundSpecified, bins); - - // get bins for this reference - const ReferenceIndex& refIndex = d->m_indexData.at(region.LeftRefID); - const BamBinMap& binMap = refIndex.Bins; - - // get minimum offset to consider - const LinearOffsetVector& linearOffsets = refIndex.Offsets; - uint64_t 
minOffset = ( (unsigned int)(region.LeftPosition>>BAM_LIDX_SHIFT) >= linearOffsets.size() ) ? 0 : linearOffsets.at(region.LeftPosition>>BAM_LIDX_SHIFT); - - // store all alignment 'chunk' starts (file offsets) for bins in this region - for ( int i = 0; i < numBins; ++i ) { - - const uint16_t binKey = bins[i]; - map<uint32_t, ChunkVector>::const_iterator binIter = binMap.find(binKey); - if ( (binIter != binMap.end()) && ((*binIter).first == binKey) ) { - - const ChunkVector& chunks = (*binIter).second; - std::vector<Chunk>::const_iterator chunksIter = chunks.begin(); - std::vector<Chunk>::const_iterator chunksEnd = chunks.end(); - for ( ; chunksIter != chunksEnd; ++chunksIter) { - - // if valid chunk found, store its file offset - const Chunk& chunk = (*chunksIter); - if ( chunk.Stop > minOffset ) - offsets.push_back( chunk.Start ); - } - } - } - - // clean up memory - free(bins); - - // sort the offsets before returning - sort(offsets.begin(), offsets.end()); - - // return whether any offsets were found - return ( offsets.size() != 0 ); -} - -// saves BAM bin entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertBinEntry(BamBinMap& binMap, - const uint32_t& saveBin, - const uint64_t& saveOffset, - const uint64_t& lastOffset) -{ - // look up saveBin - BamBinMap::iterator binIter = binMap.find(saveBin); - - // create new chunk - Chunk newChunk(saveOffset, lastOffset); - - // if entry doesn't exist - if ( binIter == binMap.end() ) { - ChunkVector newChunks; - newChunks.push_back(newChunk); - binMap.insert( pair<uint32_t, ChunkVector>(saveBin, newChunks)); - } - - // otherwise - else { - ChunkVector& binChunks = (*binIter).second; - binChunks.push_back( newChunk ); - } -} - -// saves linear offset entry for index -void BamDefaultIndex::BamDefaultIndexPrivate::InsertLinearOffset(LinearOffsetVector& offsets, - const BamAlignment& bAlignment, - const uint64_t& lastOffset) -{ - // get converted offsets - int beginOffset = bAlignment.Position >> 
BAM_LIDX_SHIFT; - int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; - - // resize vector if necessary - int oldSize = offsets.size(); - int newSize = endOffset + 1; - if ( oldSize < newSize ) - offsets.resize(newSize, 0); - - // store offset - for( int i = beginOffset + 1; i <= endOffset; ++i ) { - if ( offsets[i] == 0 ) - offsets[i] = lastOffset; - } -} - -bool BamDefaultIndex::Load(const string& filename) { - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BAI\1", 4) ) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); - return false; - } - - // get number of reference sequences - uint32_t numRefSeqs; - elementsRead = fread(&numRefSeqs, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numRefSeqs); } - - // intialize space for BamDefaultIndexData data structure - d->m_indexData.reserve(numRefSeqs); - - // iterate over reference sequences - for ( unsigned int i = 0; i < numRefSeqs; ++i ) { - - // get number of bins for this reference sequence - int32_t numBins; - elementsRead = fread(&numBins, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numBins); } - - if ( numBins > 0 ) { - RefData& refEntry = m_references[i]; - refEntry.RefHasAlignments = true; - } - - // intialize BinVector - BamBinMap binMap; - - // iterate over bins for that reference sequence - for ( int j = 0; j < numBins; ++j ) { - - // get binID - uint32_t binID; - elementsRead = fread(&binID, 4, 1, indexStream); - - // get number of regionChunks in this bin - uint32_t numChunks; - elementsRead = fread(&numChunks, 4, 1, 
indexStream); - - if ( m_isBigEndian ) { - SwapEndian_32(binID); - SwapEndian_32(numChunks); - } - - // intialize ChunkVector - ChunkVector regionChunks; - regionChunks.reserve(numChunks); - - // iterate over regionChunks in this bin - for ( unsigned int k = 0; k < numChunks; ++k ) { - - // get chunk boundaries (left, right) - uint64_t left; - uint64_t right; - elementsRead = fread(&left, 8, 1, indexStream); - elementsRead = fread(&right, 8, 1, indexStream); - - if ( m_isBigEndian ) { - SwapEndian_64(left); - SwapEndian_64(right); - } - - // save ChunkPair - regionChunks.push_back( Chunk(left, right) ); - } - - // sort chunks for this bin - sort( regionChunks.begin(), regionChunks.end(), ChunkLessThan ); - - // save binID, chunkVector for this bin - binMap.insert( pair<uint32_t, ChunkVector>(binID, regionChunks) ); - } - - // load linear index for this reference sequence - - // get number of linear offsets - int32_t numLinearOffsets; - elementsRead = fread(&numLinearOffsets, 4, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numLinearOffsets); } - - // intialize LinearOffsetVector - LinearOffsetVector offsets; - offsets.reserve(numLinearOffsets); - - // iterate over linear offsets for this reference sequeence - uint64_t linearOffset; - for ( int j = 0; j < numLinearOffsets; ++j ) { - // read a linear offset & store - elementsRead = fread(&linearOffset, 8, 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } - offsets.push_back(linearOffset); - } - - // sort linear offsets - sort( offsets.begin(), offsets.end() ); - - // store index data for that reference sequence - d->m_indexData.push_back( ReferenceIndex(binMap, offsets) ); - } - - // close index file (.bai) and return - fclose(indexStream); - return true; -} - -// merges 'alignment chunks' in BAM bin (used for index building) -void BamDefaultIndex::BamDefaultIndexPrivate::MergeChunks(void) { - - // iterate over reference enties - BamDefaultIndexData::iterator indexIter = 
m_indexData.begin(); - BamDefaultIndexData::iterator indexEnd = m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - // get BAM bin map for this reference - ReferenceIndex& refIndex = (*indexIter); - BamBinMap& bamBinMap = refIndex.Bins; - - // iterate over BAM bins - BamBinMap::iterator binIter = bamBinMap.begin(); - BamBinMap::iterator binEnd = bamBinMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get chunk vector for this bin - ChunkVector& binChunks = (*binIter).second; - if ( binChunks.size() == 0 ) { continue; } - - ChunkVector mergedChunks; - mergedChunks.push_back( binChunks[0] ); - - // iterate over chunks - int i = 0; - ChunkVector::iterator chunkIter = binChunks.begin(); - ChunkVector::iterator chunkEnd = binChunks.end(); - for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { - - // get 'currentChunk' based on numeric index - Chunk& currentChunk = mergedChunks[i]; - - // get iteratorChunk based on vector iterator - Chunk& iteratorChunk = (*chunkIter); - - // if currentChunk.Stop(shifted) == iterator Chunk.Start(shifted) - if ( currentChunk.Stop>>16 == iteratorChunk.Start>>16 ) { - - // set currentChunk.Stop to iteratorChunk.Stop - currentChunk.Stop = iteratorChunk.Stop; - } - - // otherwise - else { - // set currentChunk + 1 to iteratorChunk - mergedChunks.push_back(iteratorChunk); - ++i; - } - } - - // saved merged chunk vector - (*binIter).second = mergedChunks; - } - } -} - -// writes in-memory index data out to file -// N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) -bool BamDefaultIndex::Write(const std::string& bamFilename) { - - string indexFilename = bamFilename + ".bai"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - printf("ERROR: Could not open file to save index.\n"); - return false; - } - - // write BAM index header - fwrite("BAI\1", 1, 4, indexStream); - - // write number of reference sequences - int32_t numReferenceSeqs = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numReferenceSeqs); } - fwrite(&numReferenceSeqs, 4, 1, indexStream); - - // iterate over reference sequences - BamDefaultIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamDefaultIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const ReferenceIndex& refIndex = (*indexIter); - const BamBinMap& binMap = refIndex.Bins; - const LinearOffsetVector& offsets = refIndex.Offsets; - - // write number of bins - int32_t binCount = binMap.size(); - if ( m_isBigEndian ) { SwapEndian_32(binCount); } - fwrite(&binCount, 4, 1, indexStream); - - // iterate over bins - BamBinMap::const_iterator binIter = binMap.begin(); - BamBinMap::const_iterator binEnd = binMap.end(); - for ( ; binIter != binEnd; ++binIter ) { - - // get bin data (key and chunk vector) - uint32_t binKey = (*binIter).first; - const ChunkVector& binChunks = (*binIter).second; - - // save BAM bin key - if ( m_isBigEndian ) { SwapEndian_32(binKey); } - fwrite(&binKey, 4, 1, indexStream); - - // save chunk count - int32_t chunkCount = binChunks.size(); - if ( m_isBigEndian ) { SwapEndian_32(chunkCount); } - fwrite(&chunkCount, 4, 1, indexStream); - - // iterate over chunks - ChunkVector::const_iterator chunkIter = binChunks.begin(); - ChunkVector::const_iterator chunkEnd = binChunks.end(); - for ( ; chunkIter != chunkEnd; ++chunkIter ) { - - // get current 
chunk data - const Chunk& chunk = (*chunkIter); - uint64_t start = chunk.Start; - uint64_t stop = chunk.Stop; - - if ( m_isBigEndian ) { - SwapEndian_64(start); - SwapEndian_64(stop); - } - - // save chunk offsets - fwrite(&start, 8, 1, indexStream); - fwrite(&stop, 8, 1, indexStream); - } - } - - // write linear offsets size - int32_t offsetSize = offsets.size(); - if ( m_isBigEndian ) { SwapEndian_32(offsetSize); } - fwrite(&offsetSize, 4, 1, indexStream); - - // iterate over linear offsets - LinearOffsetVector::const_iterator offsetIter = offsets.begin(); - LinearOffsetVector::const_iterator offsetEnd = offsets.end(); - for ( ; offsetIter != offsetEnd; ++offsetIter ) { - - // write linear offset value - uint64_t linearOffset = (*offsetIter); - if ( m_isBigEndian ) { SwapEndian_64(linearOffset); } - fwrite(&linearOffset, 8, 1, indexStream); - } - } - - // flush buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; -} - -// ######################################################################################### -// ######################################################################################### - -// ------------------------------------- -// BamToolsIndex implementation - -namespace BamTools { - -struct BamToolsIndexEntry { - - // data members - int64_t Offset; - int RefID; - int Position; - - // ctor - BamToolsIndexEntry(const uint64_t& offset = 0, - const int& id = -1, - const int& position = -1) - : Offset(offset) - , RefID(id) - , Position(position) - { } -}; - -typedef vector<BamToolsIndexEntry> BamToolsIndexData; - -} // namespace BamTools - -struct BamToolsIndex::BamToolsIndexPrivate { - - // ------------------------- - // data members - BamToolsIndexData m_indexData; - BamToolsIndex* m_parent; - int32_t m_blockSize; - - // ------------------------- - // ctor & dtor - - BamToolsIndexPrivate(BamToolsIndex* parent) - : m_parent(parent) - , m_blockSize(1000) - { } - - ~BamToolsIndexPrivate(void) { } - - 
// ------------------------- - // internal methods -}; - -BamToolsIndex::BamToolsIndex(BgzfData* bgzf, BamReader* reader, bool isBigEndian) - : BamIndex(bgzf, reader, isBigEndian) -{ - d = new BamToolsIndexPrivate(this); -} - -BamToolsIndex::~BamToolsIndex(void) { - delete d; - d = 0; -} - -bool BamToolsIndex::Build(void) { - - // be sure reader & BGZF file are valid & open for reading - if ( m_reader == 0 || m_BGZF == 0 || !m_BGZF->IsOpen ) - return false; - - // move file pointer to beginning of alignments - m_reader->Rewind(); - - // plow through alignments, store block offsets - int32_t currentBlockCount = 0; - int64_t blockStartOffset = m_BGZF->Tell(); - int blockStartId = -1; - int blockStartPosition = -1; - BamAlignment al; - while ( m_reader->GetNextAlignmentCore(al) ) { - - // set reference flag - m_references[al.RefID].RefHasAlignments = true; - - // if beginning of block, save first alignment's refID & position - if ( currentBlockCount == 0 ) { - blockStartId = al.RefID; - blockStartPosition = al.Position; - } - - // increment block counter - ++currentBlockCount; - - // if block is full, get offset for next block, reset currentBlockCount - if ( currentBlockCount == d->m_blockSize ) { - - d->m_indexData.push_back( BamToolsIndexEntry(blockStartOffset, blockStartId, blockStartPosition) ); - blockStartOffset = m_BGZF->Tell(); - currentBlockCount = 0; - } - } - - return m_reader->Rewind(); -} - -// N.B. 
- ignores isRightBoundSpecified -bool BamToolsIndex::GetOffsets(const BamRegion& region, const bool isRightBoundSpecified, vector<int64_t>& offsets) { - - // return false if no index data present - if ( d->m_indexData.empty() ) return false; - - // clear any prior data - offsets.clear(); - - // calculate nearest index to jump to - int64_t previousOffset = -1; - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++indexIter ) { - - const BamToolsIndexEntry& entry = (*indexIter); - - // check if we are 'past' beginning of desired region - // if so, we will break out & use previously stored offset - if ( entry.RefID > region.LeftRefID ) break; - if ( (entry.RefID == region.LeftRefID) && (entry.Position > region.LeftPosition) ) break; - - // not past desired region, so store current entry offset in previousOffset - previousOffset = entry.Offset; - } - - // no index was found - if ( previousOffset == -1 ) - return false; - - // store offset & return success - offsets.push_back(previousOffset); - return true; -} - -bool BamToolsIndex::Load(const string& filename) { - - // open index file, abort on error - FILE* indexStream = fopen(filename.c_str(), "rb"); - if( !indexStream ) { - printf("ERROR: Unable to open the BAM index file %s for reading.\n", filename.c_str()); - return false; - } - - // set placeholder to receive input byte count (suppresses compiler warnings) - size_t elementsRead = 0; - - // see if index is valid BAM index - char magic[4]; - elementsRead = fread(magic, 1, 4, indexStream); - if ( strncmp(magic, "BTI\1", 4) ) { - printf("Problem with index file - invalid format.\n"); - fclose(indexStream); - return false; - } - - // read in block size - elementsRead = fread(&d->m_blockSize, sizeof(d->m_blockSize), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(d->m_blockSize); } - - // read in number of offsets - uint32_t numOffsets; - 
elementsRead = fread(&numOffsets, sizeof(numOffsets), 1, indexStream); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } - - // reserve space for index data - d->m_indexData.reserve(numOffsets); - - // iterate over index entries - for ( unsigned int i = 0; i < numOffsets; ++i ) { - - uint64_t offset; - int id; - int position; - - // read in data - elementsRead = fread(&offset, sizeof(offset), 1, indexStream); - elementsRead = fread(&id, sizeof(id), 1, indexStream); - elementsRead = fread(&position, sizeof(position), 1, indexStream); - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); - } - - // save reference index entry - d->m_indexData.push_back( BamToolsIndexEntry(offset, id, position) ); - - // set reference flag - m_references[id].RefHasAlignments = true; // what about sparse references? wont be able to set flag? - } - - // close index file and return - fclose(indexStream); - return true; -} - -// writes in-memory index data out to file -// N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) -bool BamToolsIndex::Write(const std::string& bamFilename) { - - string indexFilename = bamFilename + ".bti"; - FILE* indexStream = fopen(indexFilename.c_str(), "wb"); - if ( indexStream == 0 ) { - printf("ERROR: Could not open file to save index.\n"); - return false; - } - - // write BAM index header - fwrite("BTI\1", 1, 4, indexStream); - - // write block size - int32_t blockSize = d->m_blockSize; - if ( m_isBigEndian ) { SwapEndian_32(blockSize); } - fwrite(&blockSize, sizeof(blockSize), 1, indexStream); - - // write number of offset entries - uint32_t numOffsets = d->m_indexData.size(); - if ( m_isBigEndian ) { SwapEndian_32(numOffsets); } - fwrite(&numOffsets, sizeof(numOffsets), 1, indexStream); - - // iterate over offset entries - BamToolsIndexData::const_iterator indexIter = d->m_indexData.begin(); - BamToolsIndexData::const_iterator indexEnd = d->m_indexData.end(); - for ( ; indexIter != indexEnd; ++ indexIter ) { - - // get reference index data - const BamToolsIndexEntry& entry = (*indexIter); - - // copy entry data - uint64_t offset = entry.Offset; - int id = entry.RefID; - int position = entry.Position; - - // swap endian-ness if necessary - if ( m_isBigEndian ) { - SwapEndian_64(offset); - SwapEndian_32(id); - SwapEndian_32(position); - } - - // write the reference index entry - fwrite(&offset, sizeof(offset), 1, indexStream); - fwrite(&id, sizeof(id), 1, indexStream); - fwrite(&position, sizeof(position), 1, indexStream); - } - - // flush file buffer, close file, and return success - fflush(indexStream); - fclose(indexStream); - return true; -} diff --git a/src/utils/BamTools/BamIndex.h b/src/utils/BamTools/BamIndex.h deleted file mode 100644 index 83138b6f162e27cea28c50e028259176d85080ff..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamIndex.h +++ /dev/null @@ -1,120 +0,0 @@ -// 
*************************************************************************** -// BamIndex.h (c) 2009 Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Provides index functionality - both for the default (standardized) BAM -// index format (.bai) as well as a BamTools-specific (nonstandard) index -// format (.bti). -// *************************************************************************** - -#ifndef BAM_INDEX_H -#define BAM_INDEX_H - -#include <string> -#include <vector> -#include "BamAux.h" - -namespace BamTools { - -class BamReader; -class BgzfData; - -// -------------------------------------------------- -// BamIndex base class -class BamIndex { - - public: - BamIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - virtual ~BamIndex(void) { } - - public: - // creates index data (in-memory) from current reader data - virtual bool Build(void) =0; - // calculates offset(s) for a given region - virtual bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets) =0; - // loads existing data from file into memory - virtual bool Load(const std::string& filename) =0; - // returns whether reference has alignments or no - virtual bool HasAlignments(const int& referenceID); - // writes in-memory index data out to file - // N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) - virtual bool Write(const std::string& bamFilename) =0; - - protected: - BamTools::BgzfData* m_BGZF; - BamTools::BamReader* m_reader; - BamTools::RefVector m_references; - bool m_isBigEndian; -}; - -// -------------------------------------------------- -// BamDefaultIndex class -// -// implements default (per SAM/BAM spec) index file ops -class BamDefaultIndex : public BamIndex { - - - // ctor & dtor - public: - BamDefaultIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - ~BamDefaultIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. 
- (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // internal implementation - private: - struct BamDefaultIndexPrivate; - BamDefaultIndexPrivate* d; -}; - -// -------------------------------------------------- -// BamToolsIndex class -// -// implements BamTools-specific index file ops -class BamToolsIndex : public BamIndex { - - // ctor & dtor - public: - BamToolsIndex(BamTools::BgzfData* bgzf, - BamTools::BamReader* reader, - bool isBigEndian); - ~BamToolsIndex(void); - - // interface (implements BamIndex virtual methods) - public: - // creates index data (in-memory) from current reader data - bool Build(void); - // calculates offset(s) for a given region - bool GetOffsets(const BamTools::BamRegion& region, const bool isRightBoundSpecified, std::vector<int64_t>& offsets); - // loads existing data from file into memory - bool Load(const std::string& filename); - // writes in-memory index data out to file - // N.B. - (this is the original BAM filename, method will modify it to use applicable extension) - bool Write(const std::string& bamFilename); - - // internal implementation - private: - struct BamToolsIndexPrivate; - BamToolsIndexPrivate* d; -}; - -} // namespace BamTools - -#endif // BAM_INDEX_H diff --git a/src/utils/BamTools/BamMultiReader.cpp b/src/utils/BamTools/BamMultiReader.cpp deleted file mode 100644 index 11d48daff295b0d602b78edfb1c24104637614e3..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamMultiReader.cpp +++ /dev/null @@ -1,420 +0,0 @@ -// *************************************************************************** -// BamMultiReader.cpp (c) 2010 Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. 
-// --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files. -// -// This functionality allows applications to work on very large sets of files -// without requiring intermediate merge, sort, and index steps for each file -// subset. It also improves the performance of our merge system as it -// precludes the need to sort merged files. -// *************************************************************************** - -// C++ includes -#include <algorithm> -#include <iterator> -#include <string> -#include <vector> -#include <iostream> -#include <sstream> - -// BamTools includes -#include "BGZF.h" -#include "BamMultiReader.h" -using namespace BamTools; -using namespace std; - -// ----------------------------------------------------- -// BamMultiReader implementation -// ----------------------------------------------------- - -// constructor -BamMultiReader::BamMultiReader(void) - : CurrentRefID(0) - , CurrentLeft(0) -{ } - -// destructor -BamMultiReader::~BamMultiReader(void) { - Close(); // close the bam files - // clean up reader objects - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - delete it->first; - delete it->second; - } -} - -// close the BAM files -void BamMultiReader::Close(void) { - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - reader->Close(); // close the reader - } -} - -// updates the reference id stored in the BamMultiReader -// to reflect the current state of the readers -void BamMultiReader::UpdateReferenceID(void) { - // 
the alignments are sorted by position, so the first alignment will always have the lowest reference ID - if (alignments.begin()->second.second->RefID != CurrentRefID) { - // get the next reference id - // while there aren't any readers at the next ref id - // increment the ref id - int nextRefID = CurrentRefID; - while (alignments.begin()->second.second->RefID != nextRefID) { - ++nextRefID; - } - //cerr << "updating reference id from " << CurrentRefID << " to " << nextRefID << endl; - CurrentRefID = nextRefID; - } -} - -// checks if any readers still have alignments -bool BamMultiReader::HasOpenReaders() { - return alignments.size() > 0; -} - -// get next alignment among all files -bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { - - // bail out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignment(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// get next alignment among all files without parsing character data from alignments -bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { - - // bail 
out if we are at EOF in all files, means no more alignments to process - if (!HasOpenReaders()) - return false; - - // when all alignments have stepped into a new target sequence, update our - // current reference sequence id - UpdateReferenceID(); - - // our lowest alignment and reader will be at the front of our alignment index - BamAlignment* alignment = alignments.begin()->second.second; - BamReader* reader = alignments.begin()->second.first; - - // now that we have the lowest alignment in the set, save it by copy to our argument - nextAlignment = BamAlignment(*alignment); - //memcpy(&nextAlignment, alignment, sizeof(BamAlignment)); - - // remove this alignment index entry from our alignment index - alignments.erase(alignments.begin()); - - // and add another entry if we can get another alignment from the reader - if (reader->GetNextAlignmentCore(*alignment)) { - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { // do nothing - //cerr << "reached end of file " << lowestReader->GetFilename() << endl; - } - - return true; - -} - -// jumps to specified region(refID, leftBound) in BAM files, returns success/fail -bool BamMultiReader::Jump(int refID, int position) { - - //if ( References.at(refID).RefHasAlignments && (position <= References.at(refID).RefLength) ) { - CurrentRefID = refID; - CurrentLeft = position; - - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Jump(refID, position); - if (!result) { - cerr << "ERROR: could not jump " << reader->GetFilename() << " to " << refID << ":" << position << endl; - exit(1); - } - } - if (result) UpdateAlignments(); - return result; -} - -bool BamMultiReader::SetRegion(const int& leftRefID, const int& leftPosition, const int& rightRefID, const int& rightPosition) { - - BamRegion region(leftRefID, leftPosition, 
rightRefID, rightPosition); - - return SetRegion(region); - -} - -bool BamMultiReader::SetRegion(const BamRegion& region) { - - Region = region; - - // NB: While it may make sense to track readers in which we can - // successfully SetRegion, In practice a failure of SetRegion means "no - // alignments here." It makes sense to simply accept the failure, - // UpdateAlignments(), and continue. - - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - it->first->SetRegion(region); - } - - UpdateAlignments(); - - return true; - -} - -void BamMultiReader::UpdateAlignments(void) { - // Update Alignments - alignments.clear(); - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* br = it->first; - BamAlignment* ba = it->second; - if (br->GetNextAlignment(*ba)) { - alignments.insert(make_pair(make_pair(ba->RefID, ba->Position), - make_pair(br, ba))); - } else { - // assume BamReader end of region / EOF - } - } -} - -// opens BAM files -bool BamMultiReader::Open(const vector<string> filenames, bool openIndexes, bool coreMode, bool useDefaultIndex) { - - // for filename in filenames - fileNames = filenames; // save filenames in our multireader - for (vector<string>::const_iterator it = filenames.begin(); it != filenames.end(); ++it) { - string filename = *it; - BamReader* reader = new BamReader; - - bool openedOK = true; - if (openIndexes) { - if (useDefaultIndex) - openedOK = reader->Open(filename, filename + ".bai"); - else - openedOK = reader->Open(filename, filename + ".bti"); - } else { - openedOK = reader->Open(filename); // for merging, jumping is disallowed - } - - // if file opened ok, check that it can be read - if ( openedOK ) { - - bool fileOK = true; - BamAlignment* alignment = new BamAlignment; - if (coreMode) { - fileOK &= reader->GetNextAlignmentCore(*alignment); - } else { - fileOK &= reader->GetNextAlignment(*alignment); - } - - if 
(fileOK) { - readers.push_back(make_pair(reader, alignment)); // store pointers to our readers for cleanup - alignments.insert(make_pair(make_pair(alignment->RefID, alignment->Position), - make_pair(reader, alignment))); - } else { - cerr << "WARNING: could not read first alignment in " << filename << ", ignoring file" << endl; - // if only file available & could not be read, return failure - if ( filenames.size() == 1 ) return false; - } - - } - - // TODO; any more error handling on openedOK ?? - else - return false; - } - - // files opened ok, at least one alignment could be read, - // now need to check that all files use same reference data - ValidateReaders(); - return true; -} - -void BamMultiReader::PrintFilenames(void) { - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - cout << reader->GetFilename() << endl; - } -} - -// for debugging -void BamMultiReader::DumpAlignmentIndex(void) { - for (AlignmentIndex::const_iterator it = alignments.begin(); it != alignments.end(); ++it) { - cerr << it->first.first << ":" << it->first.second << " " << it->second.first->GetFilename() << endl; - } -} - -// returns BAM file pointers to beginning of alignment data -bool BamMultiReader::Rewind(void) { - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->Rewind(); - } - return result; -} - -// saves index data to BAM index files (".bai"/".bti") where necessary, returns success/fail -bool BamMultiReader::CreateIndexes(bool useDefaultIndex) { - bool result = true; - for (vector<pair<BamReader*, BamAlignment*> >::iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - result &= reader->CreateIndex(useDefaultIndex); - } - return result; -} - -// makes a virtual, unified header for all the bam files in the multireader 
-const string BamMultiReader::GetHeaderText(void) const { - - string mergedHeader = ""; - map<string, bool> readGroups; - - // foreach extraction entry (each BAM file) - for (vector<pair<BamReader*, BamAlignment*> >::const_iterator rs = readers.begin(); rs != readers.end(); ++rs) { - - map<string, bool> currentFileReadGroups; - - BamReader* reader = rs->first; - - stringstream header(reader->GetHeaderText()); - vector<string> lines; - string item; - while (getline(header, item)) - lines.push_back(item); - - for (vector<string>::const_iterator it = lines.begin(); it != lines.end(); ++it) { - - // get next line from header, skip if empty - string headerLine = *it; - if ( headerLine.empty() ) { continue; } - - // if first file, save HD & SQ entries - if ( rs == readers.begin() ) { - if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) { - mergedHeader.append(headerLine.c_str()); - mergedHeader.append(1, '\n'); - } - } - - // (for all files) append RG entries if they are unique - if ( headerLine.find("@RG") == 0 ) { - stringstream headerLineSs(headerLine); - string part, readGroupPart, readGroup; - while(std::getline(headerLineSs, part, '\t')) { - stringstream partSs(part); - string subtag; - std::getline(partSs, subtag, ':'); - if (subtag == "ID") { - std::getline(partSs, readGroup, ':'); - break; - } - } - if (readGroups.find(readGroup) == readGroups.end()) { // prevents duplicate @RG entries - mergedHeader.append(headerLine.c_str() ); - mergedHeader.append(1, '\n'); - readGroups[readGroup] = true; - currentFileReadGroups[readGroup] = true; - } else { - // warn iff we are reading one file and discover duplicated @RG tags in the header - // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags - if (currentFileReadGroups.find(readGroup) != currentFileReadGroups.end()) { - cerr << "WARNING: duplicate @RG tag " << readGroup - << " entry in header of " << reader->GetFilename() << endl; - } - } - } - } - } - - // 
return merged header text - return mergedHeader; -} - -// ValidateReaders checks that all the readers point to BAM files representing -// alignments against the same set of reference sequences, and that the -// sequences are identically ordered. If these checks fail the operation of -// the multireader is undefined, so we force program exit. -void BamMultiReader::ValidateReaders(void) const { - int firstRefCount = readers.front().first->GetReferenceCount(); - BamTools::RefVector firstRefData = readers.front().first->GetReferenceData(); - for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) { - BamReader* reader = it->first; - BamTools::RefVector currentRefData = reader->GetReferenceData(); - BamTools::RefVector::const_iterator f = firstRefData.begin(); - BamTools::RefVector::const_iterator c = currentRefData.begin(); - if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) { - cerr << "ERROR: mismatched number of references in " << reader->GetFilename() - << " expected " << firstRefCount - << " reference sequences but only found " << reader->GetReferenceCount() << endl; - exit(1); - } - // this will be ok; we just checked above that we have identically-sized sets of references - // here we simply check if they are all, in fact, equal in content - while (f != firstRefData.end()) { - if (f->RefName != c->RefName || f->RefLength != c->RefLength) { - cerr << "ERROR: mismatched references found in " << reader->GetFilename() - << " expected: " << endl; - for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - cerr << "but found: " << endl; - for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a) - cerr << a->RefName << " " << a->RefLength << endl; - exit(1); - } - ++f; ++c; - } - } -} - -// NB: The following functions assume that we 
have identical references for all -// BAM files. We enforce this by invoking the above validation function -// (ValidateReaders) to verify that our reference data is the same across all -// files on Open, so we will not encounter a situation in which there is a -// mismatch and we are still live. - -// returns the number of reference sequences -const int BamMultiReader::GetReferenceCount(void) const { - return readers.front().first->GetReferenceCount(); -} - -// returns vector of reference objects -const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { - return readers.front().first->GetReferenceData(); -} - -const int BamMultiReader::GetReferenceID(const string& refName) const { - return readers.front().first->GetReferenceID(refName); -} diff --git a/src/utils/BamTools/BamMultiReader.h b/src/utils/BamTools/BamMultiReader.h deleted file mode 100644 index bd36d7160e0d18aee8d63c8699262c4a296695bc..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamMultiReader.h +++ /dev/null @@ -1,133 +0,0 @@ -// *************************************************************************** -// BamMultiReader.h (c) 2010 Erik Garrison -// Marth Lab, Department of Biology, Boston College -// All rights reserved. 
-// --------------------------------------------------------------------------- -// Last modified: 20 July 2010 (DB) -// --------------------------------------------------------------------------- -// Functionality for simultaneously reading multiple BAM files -// *************************************************************************** - -#ifndef BAMMULTIREADER_H -#define BAMMULTIREADER_H - -// C++ includes -#include <string> -#include <map> -#include <utility> // for pair -#include <sstream> - -using namespace std; - -// BamTools includes -#include "BamAux.h" -#include "BamReader.h" - -namespace BamTools { - -// index mapping reference/position pairings to bamreaders and their alignments -typedef multimap<pair<int, int>, pair<BamReader*, BamAlignment*> > AlignmentIndex; - - -class BamMultiReader { - - // constructor / destructor - public: - BamMultiReader(void); - ~BamMultiReader(void); - - // public interface - public: - - // positioning - int CurrentRefID; - int CurrentLeft; - - // region under analysis, specified using SetRegion - BamRegion Region; - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM files - void Close(void); - - // opens BAM files (and optional BAM index files, if provided) - // @openIndexes - triggers index opening, useful for suppressing - // error messages during merging of files in which we may not have - // indexes. 
- // @coreMode - setup our first alignments using GetNextAlignmentCore(); - // also useful for merging - bool Open(const vector<string> filenames, bool openIndexes = true, bool coreMode = false, bool useDefaultIndex = true); - - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - - // sets the target region - bool SetRegion(const BamRegion& region); - bool SetRegion(const int&, const int&, const int&, const int&); // convenience function to above - - // returns file pointers to beginning of alignments - bool Rewind(void); - - // ---------------------- - // access alignment data - // ---------------------- - // updates the reference id marker to match the lower limit of our readers - void UpdateReferenceID(void); - - // retrieves next available alignment (returns success/fail) from all files - bool GetNextAlignment(BamAlignment&); - // retrieves next available alignment (returns success/fail) from all files - // and populates the support data with information about the alignment - // *** BUT DOES NOT PARSE CHARACTER DATA FROM THE ALIGNMENT - bool GetNextAlignmentCore(BamAlignment&); - // ... should this be private? 
- bool HasOpenReaders(void); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // returns unified SAM header text for all files - const string GetHeaderText(void) const; - // returns number of reference sequences - const int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector GetReferenceData(void) const; - // returns reference id (used for BamMultiReader::Jump()) for the given reference name - const int GetReferenceID(const std::string& refName) const; - // validates that we have a congruent set of BAM files that are aligned against the same reference sequences - void ValidateReaders() const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM files which lack them, saves to files (default = bamFilename + ".bai") - bool CreateIndexes(bool useDefaultIndex = true); - - //const int GetReferenceID(const string& refName) const; - - // utility - void PrintFilenames(void); - void DumpAlignmentIndex(void); - void UpdateAlignments(void); // updates our alignment cache - - // private implementation - private: - - // the set of readers and alignments which we operate on, maintained throughout the life of this class - vector<pair<BamReader*, BamAlignment*> > readers; - - // readers and alignments sorted by reference id and position, to keep track of the lowest (next) alignment - // when a reader reaches EOF, its entry is removed from this index - AlignmentIndex alignments; - - vector<string> fileNames; -}; - -} // namespace BamTools - -#endif // BAMMULTIREADER_H diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp deleted file mode 100644 index d6355769144f5d2e7f628251942a04afc672205b..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamReader.cpp +++ /dev/null @@ -1,773 +0,0 @@ -// *************************************************************************** -// BamReader.cpp (c) 2009 
Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 15 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. -// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -// C++ includes -#include <algorithm> -#include <iterator> -#include <string> -#include <vector> -#include <iostream> - -// BamTools includes -#include "BGZF.h" -#include "BamReader.h" -#include "BamIndex.h" -using namespace BamTools; -using namespace std; - -struct BamReader::BamReaderPrivate { - - // ------------------------------- - // structs, enums, typedefs - // ------------------------------- - enum RegionState { BEFORE_REGION = 0 - , WITHIN_REGION - , AFTER_REGION - }; - - // ------------------------------- - // data members - // ------------------------------- - - // general file data - BgzfData mBGZF; - string HeaderText; - //BamIndex Index; - BamIndex* NewIndex; - RefVector References; - bool IsIndexLoaded; - int64_t AlignmentsBeginOffset; - string Filename; - string IndexFilename; - - // system data - bool IsBigEndian; - - // user-specified region values - BamRegion Region; - bool IsLeftBoundSpecified; - bool IsRightBoundSpecified; - - bool IsRegionSpecified; - int CurrentRefID; - int CurrentLeft; - - // parent BamReader - BamReader* Parent; - - // BAM character constants - const char* DNA_LOOKUP; - const char* CIGAR_LOOKUP; - - // ------------------------------- - // constructor & destructor - // ------------------------------- - BamReaderPrivate(BamReader* parent); - ~BamReaderPrivate(void); - - // ------------------------------- - // 
"public" interface - // ------------------------------- - - // file operations - void Close(void); - bool Jump(int refID, int position = 0); - bool Open(const string& filename, const string& indexFilename = ""); - bool Rewind(void); - bool SetRegion(const BamRegion& region); - - // access alignment data - bool GetNextAlignment(BamAlignment& bAlignment); - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // access auxiliary data - int GetReferenceID(const string& refName) const; - - // index operations - bool CreateIndex(bool useDefaultIndex); - - // ------------------------------- - // internal methods - // ------------------------------- - - // *** reading alignments and auxiliary data *** // - - // fills out character data for BamAlignment data - bool BuildCharData(BamAlignment& bAlignment); - // checks to see if alignment overlaps current region - RegionState IsOverlap(BamAlignment& bAlignment); - // retrieves header text from BAM file - void LoadHeaderData(void); - // retrieves BAM alignment under file pointer - bool LoadNextAlignment(BamAlignment& bAlignment); - // builds reference data structure from BAM file - void LoadReferenceData(void); - - // *** index file handling *** // - - // clear out inernal index data structure - void ClearIndex(void); - // loads index from BAM index file - bool LoadIndex(void); -}; - -// ----------------------------------------------------- -// BamReader implementation (wrapper around BRPrivate) -// ----------------------------------------------------- -// constructor -BamReader::BamReader(void) { - d = new BamReaderPrivate(this); -} - -// destructor -BamReader::~BamReader(void) { - delete d; - d = 0; -} - -// file operations -void BamReader::Close(void) { d->Close(); } -bool BamReader::IsOpen(void) const { return d->mBGZF.IsOpen; } -bool BamReader::Jump(int refID, int position) { - d->Region.LeftRefID = refID; - d->Region.LeftPosition = position; - d->IsLeftBoundSpecified = true; - d->IsRightBoundSpecified = false; - 
return d->Jump(refID, position); -} -bool BamReader::Open(const string& filename, const string& indexFilename) { return d->Open(filename, indexFilename); } -bool BamReader::Rewind(void) { return d->Rewind(); } -bool BamReader::SetRegion(const BamRegion& region) { return d->SetRegion(region); } -bool BamReader::SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound) { - return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) ); -} - -// access alignment data -bool BamReader::GetNextAlignment(BamAlignment& bAlignment) { return d->GetNextAlignment(bAlignment); } -bool BamReader::GetNextAlignmentCore(BamAlignment& bAlignment) { return d->GetNextAlignmentCore(bAlignment); } - -// access auxiliary data -const string BamReader::GetHeaderText(void) const { return d->HeaderText; } -int BamReader::GetReferenceCount(void) const { return d->References.size(); } -const RefVector& BamReader::GetReferenceData(void) const { return d->References; } -int BamReader::GetReferenceID(const string& refName) const { return d->GetReferenceID(refName); } -const std::string BamReader::GetFilename(void) const { return d->Filename; } - -// index operations -bool BamReader::CreateIndex(bool useDefaultIndex) { return d->CreateIndex(useDefaultIndex); } - -// ----------------------------------------------------- -// BamReaderPrivate implementation -// ----------------------------------------------------- - -// constructor -BamReader::BamReaderPrivate::BamReaderPrivate(BamReader* parent) - : NewIndex(0) - , IsIndexLoaded(false) - , AlignmentsBeginOffset(0) - , IsLeftBoundSpecified(false) - , IsRightBoundSpecified(false) - , IsRegionSpecified(false) - , CurrentRefID(0) - , CurrentLeft(0) - , Parent(parent) - , DNA_LOOKUP("=ACMGRSVTWYHKDBN") - , CIGAR_LOOKUP("MIDNSHP") -{ - IsBigEndian = SystemIsBigEndian(); -} - -// destructor -BamReader::BamReaderPrivate::~BamReaderPrivate(void) { - Close(); -} - -bool 
BamReader::BamReaderPrivate::BuildCharData(BamAlignment& bAlignment) { - - // calculate character lengths/offsets - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - const unsigned int cigarDataOffset = bAlignment.SupportData.QueryNameLength; - const unsigned int seqDataOffset = bAlignment.SupportData.QueryNameLength + (bAlignment.SupportData.NumCigarOperations * 4); - const unsigned int qualDataOffset = seqDataOffset + (bAlignment.SupportData.QuerySequenceLength+1)/2; - const unsigned int tagDataOffset = qualDataOffset + bAlignment.SupportData.QuerySequenceLength; - const unsigned int tagDataLength = dataLength - tagDataOffset; - - // set up char buffers - const char* allCharData = bAlignment.SupportData.AllCharData.data(); - uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); - const char* seqData = ((const char*)allCharData) + seqDataOffset; - const char* qualData = ((const char*)allCharData) + qualDataOffset; - char* tagData = ((char*)allCharData) + tagDataOffset; - - // store alignment name (depends on null char as terminator) - bAlignment.Name.assign((const char*)(allCharData)); - - // save CigarOps - CigarOp op; - bAlignment.CigarData.clear(); - bAlignment.CigarData.reserve(bAlignment.SupportData.NumCigarOperations); - for (unsigned int i = 0; i < bAlignment.SupportData.NumCigarOperations; ++i) { - - // swap if necessary - if ( IsBigEndian ) { SwapEndian_32(cigarData[i]); } - - // build CigarOp structure - op.Length = (cigarData[i] >> BAM_CIGAR_SHIFT); - op.Type = CIGAR_LOOKUP[ (cigarData[i] & BAM_CIGAR_MASK) ]; - - // save CigarOp - bAlignment.CigarData.push_back(op); - } - - - // save query sequence - bAlignment.QueryBases.clear(); - bAlignment.QueryBases.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleBase = DNA_LOOKUP[ ( ( seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ]; - bAlignment.QueryBases.append(1, 
singleBase); - } - - // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character - bAlignment.Qualities.clear(); - bAlignment.Qualities.reserve(bAlignment.SupportData.QuerySequenceLength); - for (unsigned int i = 0; i < bAlignment.SupportData.QuerySequenceLength; ++i) { - char singleQuality = (char)(qualData[i]+33); - bAlignment.Qualities.append(1, singleQuality); - } - - // if QueryBases is empty (and this is a allowed case) - if ( bAlignment.QueryBases.empty() ) - bAlignment.AlignedBases = bAlignment.QueryBases; - - // if QueryBases contains data, then build AlignedBases using CIGAR data - else { - - // resize AlignedBases - bAlignment.AlignedBases.clear(); - bAlignment.AlignedBases.reserve(bAlignment.SupportData.QuerySequenceLength); - - // iterate over CigarOps - int k = 0; - vector<CigarOp>::const_iterator cigarIter = bAlignment.CigarData.begin(); - vector<CigarOp>::const_iterator cigarEnd = bAlignment.CigarData.end(); - for ( ; cigarIter != cigarEnd; ++cigarIter ) { - - const CigarOp& op = (*cigarIter); - switch(op.Type) { - - case ('M') : - case ('I') : - bAlignment.AlignedBases.append(bAlignment.QueryBases.substr(k, op.Length)); // for 'M', 'I' - write bases - // fall through - - case ('S') : - k += op.Length; // for 'S' - soft clip, skip over query bases - break; - - case ('D') : - bAlignment.AlignedBases.append(op.Length, '-'); // for 'D' - write gap character - break; - - case ('P') : - bAlignment.AlignedBases.append( op.Length, '*' ); // for 'P' - write padding character - break; - - case ('N') : - bAlignment.AlignedBases.append( op.Length, 'N' ); // for 'N' - write N's, skip bases in original query sequence - break; - - case ('H') : - break; // for 'H' - hard clip, do nothing to AlignedBases, move to next op - - default: - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - } - - // ----------------------- - // Added: 3-25-2010 DB - // Fixed: endian-correctness for tag data - // 
----------------------- - if ( IsBigEndian ) { - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. "RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i += sizeof(uint16_t); - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i += sizeof(uint32_t); - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i += sizeof(uint64_t); - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - printf("ERROR: Invalid tag value type\n"); // shouldn't get here - exit(1); - } - } - } - - // store TagData - bAlignment.TagData.clear(); - bAlignment.TagData.resize(tagDataLength); - memcpy((char*)bAlignment.TagData.data(), tagData, tagDataLength); - - // clear the core-only flag - bAlignment.SupportData.HasCoreOnly = false; - - // return success - return true; -} - -// clear index data structure -void BamReader::BamReaderPrivate::ClearIndex(void) { - delete NewIndex; - NewIndex = 0; -} - -// closes the BAM file -void BamReader::BamReaderPrivate::Close(void) { - - // close BGZF file stream - mBGZF.Close(); - - // clear out index data - ClearIndex(); - - // clear out header data - HeaderText.clear(); - - // clear out region flags - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - IsRegionSpecified = false; -} - -// create BAM index from BAM file (keep structure in memory) and write to default index output file -bool BamReader::BamReaderPrivate::CreateIndex(bool useDefaultIndex) { - - // clear out prior index data - ClearIndex(); - - // create default index - if ( useDefaultIndex ) - NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - // create BamTools 'custom' index - else - NewIndex = new BamToolsIndex(&mBGZF, Parent, 
IsBigEndian); - - bool ok = true; - ok &= NewIndex->Build(); - ok &= NewIndex->Write(Filename); - - // return success/fail - return ok; -} - -// get next alignment (from specified region, if given) -bool BamReader::BamReaderPrivate::GetNextAlignment(BamAlignment& bAlignment) { - - // if valid alignment found, attempt to parse char data, and return success/failure - if ( GetNextAlignmentCore(bAlignment) ) - return BuildCharData(bAlignment); - - // no valid alignment found - else - return false; -} - -// retrieves next available alignment core data (returns success/fail) -// ** DOES NOT parse any character data (read name, bases, qualities, tag data) -// these can be accessed, if necessary, from the supportData -// useful for operations requiring ONLY positional or other alignment-related information -bool BamReader::BamReaderPrivate::GetNextAlignmentCore(BamAlignment& bAlignment) { - - // if valid alignment available - if ( LoadNextAlignment(bAlignment) ) { - - // set core-only flag - bAlignment.SupportData.HasCoreOnly = true; - - // if region not specified, return success - if ( !IsLeftBoundSpecified ) return true; - - // determine region state (before, within, after) - BamReader::BamReaderPrivate::RegionState state = IsOverlap(bAlignment); - - // if alignment lies after region, return false - if ( state == AFTER_REGION ) - return false; - - while ( state != WITHIN_REGION ) { - // if no valid alignment available (likely EOF) return failure - if ( !LoadNextAlignment(bAlignment) ) return false; - // if alignment lies after region, return false (no available read within region) - state = IsOverlap(bAlignment); - if ( state == AFTER_REGION) return false; - - } - - // return success (alignment found that overlaps region) - return true; - } - - // no valid alignment - else - return false; -} - -// returns RefID for given RefName (returns References.size() if not found) -int BamReader::BamReaderPrivate::GetReferenceID(const string& refName) const { - - // retrieve names 
from reference data - vector<string> refNames; - RefVector::const_iterator refIter = References.begin(); - RefVector::const_iterator refEnd = References.end(); - for ( ; refIter != refEnd; ++refIter) { - refNames.push_back( (*refIter).RefName ); - } - - // return 'index-of' refName ( if not found, returns refNames.size() ) - return distance(refNames.begin(), find(refNames.begin(), refNames.end(), refName)); -} - -// returns region state - whether alignment ends before, overlaps, or starts after currently specified region -// this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true -BamReader::BamReaderPrivate::RegionState BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { - - // -------------------------------------------------- - // check alignment start against right bound cutoff - - // if full region of interest was given - if ( IsRightBoundSpecified ) { - - // read starts on right bound reference, but AFTER right bound position - if ( bAlignment.RefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) - return AFTER_REGION; - - // if read starts on reference AFTER right bound, return false - if ( bAlignment.RefID > Region.RightRefID ) - return AFTER_REGION; - } - - // -------------------------------------------------------- - // no right bound given OR read starts before right bound - // so, check if it overlaps left bound - - // if read starts on left bound reference AND after left boundary, return success - if ( bAlignment.RefID == Region.LeftRefID && bAlignment.Position >= Region.LeftPosition) - return WITHIN_REGION; - - // if read is on any reference sequence before left bound, return false - if ( bAlignment.RefID < Region.LeftRefID ) - return BEFORE_REGION; - - // -------------------------------------------------------- - // read is on left bound reference, but starts before left bound position - - // if it overlaps, return WITHIN_REGION - if ( bAlignment.GetEndPosition() >= 
Region.LeftPosition ) - return WITHIN_REGION; - // else begins before left bound position - else - return BEFORE_REGION; -} - -// jumps to specified region(refID, leftBound) in BAM file, returns success/fail -bool BamReader::BamReaderPrivate::Jump(int refID, int position) { - - // ----------------------------------------------------------------------- - // check for existing index - if ( NewIndex == 0 ) return false; - // see if reference has alignments - if ( !NewIndex->HasAlignments(refID) ) return false; - // make sure position is valid - if ( position > References.at(refID).RefLength ) return false; - - // determine possible offsets - vector<int64_t> offsets; - if ( !NewIndex->GetOffsets(Region, IsRightBoundSpecified, offsets) ) { - printf("ERROR: Could not jump: unable to calculate offset for specified region.\n"); - return false; - } - - // iterate through offsets - BamAlignment bAlignment; - bool result = true; - for ( vector<int64_t>::const_iterator o = offsets.begin(); o != offsets.end(); ++o) { - - // attempt seek & load first available alignment - result &= mBGZF.Seek(*o); - LoadNextAlignment(bAlignment); - - // if this alignment corresponds to desired position - // return success of seeking back to 'current offset' - if ( (bAlignment.RefID == refID && bAlignment.Position + bAlignment.Length > position) || (bAlignment.RefID > refID) ) { - if ( o != offsets.begin() ) --o; - return mBGZF.Seek(*o); - } - } - - return result; -} - -// load BAM header data -void BamReader::BamReaderPrivate::LoadHeaderData(void) { - - // check to see if proper BAM header - char buffer[4]; - if (mBGZF.Read(buffer, 4) != 4) { - printf("Could not read header type\n"); - exit(1); - } - - if (strncmp(buffer, "BAM\001", 4)) { - printf("wrong header type!\n"); - exit(1); - } - - // get BAM header text length - mBGZF.Read(buffer, 4); - unsigned int headerTextLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(headerTextLength); } - - // get BAM header 
text - char* headerText = (char*)calloc(headerTextLength + 1, 1); - mBGZF.Read(headerText, headerTextLength); - HeaderText = (string)((const char*)headerText); - - // clean up calloc-ed temp variable - free(headerText); -} - -// load existing index data from BAM index file (".bai"), return success/fail -bool BamReader::BamReaderPrivate::LoadIndex(void) { - - // clear out any existing index data - ClearIndex(); - - // skip if index file empty - if ( IndexFilename.empty() ) - return false; - - // check supplied filename for index type - size_t defaultExtensionFound = IndexFilename.find(".bai"); - size_t customExtensionFound = IndexFilename.find(".bti"); - - // if SAM/BAM default (".bai") - if ( defaultExtensionFound != string::npos ) - NewIndex = new BamDefaultIndex(&mBGZF, Parent, IsBigEndian); - - // if BamTools custom index (".bti") - else if ( customExtensionFound != string::npos ) - NewIndex = new BamToolsIndex(&mBGZF, Parent, IsBigEndian); - - // else unknown - else { - printf("ERROR: Unknown index file extension.\n"); - return false; - } - - // return success of loading index data - return NewIndex->Load(IndexFilename); -} - -// populates BamAlignment with alignment data under file pointer, returns success/fail -bool BamReader::BamReaderPrivate::LoadNextAlignment(BamAlignment& bAlignment) { - - // read in the 'block length' value, make sure it's not zero - char buffer[4]; - mBGZF.Read(buffer, 4); - bAlignment.SupportData.BlockLength = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(bAlignment.SupportData.BlockLength); } - if ( bAlignment.SupportData.BlockLength == 0 ) { return false; } - - // read in core alignment data, make sure the right size of data was read - char x[BAM_CORE_SIZE]; - if ( mBGZF.Read(x, BAM_CORE_SIZE) != BAM_CORE_SIZE ) { return false; } - - if ( IsBigEndian ) { - for ( int i = 0; i < BAM_CORE_SIZE; i+=sizeof(uint32_t) ) { - SwapEndian_32p(&x[i]); - } - } - - // set BamAlignment 'core' and 'support' data - 
bAlignment.RefID = BgzfData::UnpackSignedInt(&x[0]); - bAlignment.Position = BgzfData::UnpackSignedInt(&x[4]); - - unsigned int tempValue = BgzfData::UnpackUnsignedInt(&x[8]); - bAlignment.Bin = tempValue >> 16; - bAlignment.MapQuality = tempValue >> 8 & 0xff; - bAlignment.SupportData.QueryNameLength = tempValue & 0xff; - - tempValue = BgzfData::UnpackUnsignedInt(&x[12]); - bAlignment.AlignmentFlag = tempValue >> 16; - bAlignment.SupportData.NumCigarOperations = tempValue & 0xffff; - - bAlignment.SupportData.QuerySequenceLength = BgzfData::UnpackUnsignedInt(&x[16]); - bAlignment.MateRefID = BgzfData::UnpackSignedInt(&x[20]); - bAlignment.MatePosition = BgzfData::UnpackSignedInt(&x[24]); - bAlignment.InsertSize = BgzfData::UnpackSignedInt(&x[28]); - - // set BamAlignment length - bAlignment.Length = bAlignment.SupportData.QuerySequenceLength; - - // read in character data - make sure proper data size was read - bool readCharDataOK = false; - const unsigned int dataLength = bAlignment.SupportData.BlockLength - BAM_CORE_SIZE; - char* allCharData = (char*)calloc(sizeof(char), dataLength); - - if ( mBGZF.Read(allCharData, dataLength) == (signed int)dataLength) { - - // store 'allCharData' in supportData structure - bAlignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); - - // set success flag - readCharDataOK = true; - } - - free(allCharData); - return readCharDataOK; -} - -// loads reference data from BAM file -void BamReader::BamReaderPrivate::LoadReferenceData(void) { - - // get number of reference sequences - char buffer[4]; - mBGZF.Read(buffer, 4); - unsigned int numberRefSeqs = BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(numberRefSeqs); } - if (numberRefSeqs == 0) { return; } - References.reserve((int)numberRefSeqs); - - // iterate over all references in header - for (unsigned int i = 0; i != numberRefSeqs; ++i) { - - // get length of reference name - mBGZF.Read(buffer, 4); - unsigned int refNameLength = 
BgzfData::UnpackUnsignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refNameLength); } - char* refName = (char*)calloc(refNameLength, 1); - - // get reference name and reference sequence length - mBGZF.Read(refName, refNameLength); - mBGZF.Read(buffer, 4); - int refLength = BgzfData::UnpackSignedInt(buffer); - if ( IsBigEndian ) { SwapEndian_32(refLength); } - - // store data for reference - RefData aReference; - aReference.RefName = (string)((const char*)refName); - aReference.RefLength = refLength; - References.push_back(aReference); - - // clean up calloc-ed temp variable - free(refName); - } -} - -// opens BAM file (and index) -bool BamReader::BamReaderPrivate::Open(const string& filename, const string& indexFilename) { - - Filename = filename; - IndexFilename = indexFilename; - - // open the BGZF file for reading, return false on failure - if ( !mBGZF.Open(filename, "rb") ) - return false; - - // retrieve header text & reference data - LoadHeaderData(); - LoadReferenceData(); - - // store file offset of first alignment - AlignmentsBeginOffset = mBGZF.Tell(); - - // open index file & load index data (if exists) - if ( !IndexFilename.empty() ) - LoadIndex(); - - // return success - return true; -} - -// returns BAM file pointer to beginning of alignment data -bool BamReader::BamReaderPrivate::Rewind(void) { - - // rewind to first alignment - if ( !mBGZF.Seek(AlignmentsBeginOffset) ) return false; - - // retrieve first alignment data - BamAlignment al; - if ( !LoadNextAlignment(al) ) return false; - - // reset default region info using first alignment in file - Region.LeftRefID = al.RefID; - Region.LeftPosition = al.Position; - Region.RightRefID = -1; - Region.RightPosition = -1; - IsLeftBoundSpecified = false; - IsRightBoundSpecified = false; - - // rewind back to before first alignment - // return success/fail of seek - return mBGZF.Seek(AlignmentsBeginOffset); -} - -// sets a region of interest (with left & right bound reference/position) -// attempts a 
Jump() to left bound as well -// returns success/failure of Jump() -bool BamReader::BamReaderPrivate::SetRegion(const BamRegion& region) { - - // save region of interest - Region = region; - - // set flags - if ( region.LeftRefID >= 0 && region.LeftPosition >= 0 ) - IsLeftBoundSpecified = true; - if ( region.RightRefID >= 0 && region.RightPosition >= 0 ) - IsRightBoundSpecified = true; - - // attempt jump to beginning of region, return success/fail of Jump() - return Jump( Region.LeftRefID, Region.LeftPosition ); -} diff --git a/src/utils/BamTools/BamReader.h b/src/utils/BamTools/BamReader.h deleted file mode 100644 index a56316efcafd41e7ab33ab07b71316d12d49b158..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamReader.h +++ /dev/null @@ -1,98 +0,0 @@ -// *************************************************************************** -// BamReader.h (c) 2009 Derek Barnett, Michael Str�mberg -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 9 July 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. 
-// --------------------------------------------------------------------------- -// Provides the basic functionality for reading BAM files -// *************************************************************************** - -#ifndef BAMREADER_H -#define BAMREADER_H - -// C++ includes -#include <string> - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - -class BamReader { - - // constructor / destructor - public: - BamReader(void); - ~BamReader(void); - - // public interface - public: - - // ---------------------- - // BAM file operations - // ---------------------- - - // close BAM file - void Close(void); - // returns whether reader is open for reading or not - bool IsOpen(void) const; - // performs random-access jump to reference, position - bool Jump(int refID, int position = 0); - // opens BAM file (and optional BAM index file, if provided) - bool Open(const std::string& filename, const std::string& indexFilename = ""); - // returns file pointer to beginning of alignments - bool Rewind(void); - // sets a region of interest (with left & right bound reference/position) - // attempts a Jump() to left bound as well - // returns success/failure of Jump() - bool SetRegion(const BamRegion& region); - bool SetRegion(const int& leftRefID, const int& leftBound, const int& rightRefID, const int& rightBound); - - // ---------------------- - // access alignment data - // ---------------------- - - // retrieves next available alignment (returns success/fail) - bool GetNextAlignment(BamAlignment& bAlignment); - - // retrieves next available alignment core data (returns success/fail) - // ** DOES NOT parse any character data (read name, bases, qualities, tag data) - // these can be accessed, if necessary, from the supportData - // useful for operations requiring ONLY positional or other alignment-related information - bool GetNextAlignmentCore(BamAlignment& bAlignment); - - // ---------------------- - // access auxiliary data - // ---------------------- - - // 
returns SAM header text - const std::string GetHeaderText(void) const; - // returns number of reference sequences - int GetReferenceCount(void) const; - // returns vector of reference objects - const BamTools::RefVector& GetReferenceData(void) const; - // returns reference id (used for BamReader::Jump()) for the given reference name - int GetReferenceID(const std::string& refName) const; - // returns the name of the file associated with this BamReader - const std::string GetFilename(void) const; - - // ---------------------- - // BAM index operations - // ---------------------- - - // creates index for BAM file, saves to file (default = bamFilename + ".bai") - bool CreateIndex(bool useDefaultIndex = true); - - // private implementation - private: - struct BamReaderPrivate; - BamReaderPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMREADER_H diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp deleted file mode 100644 index 49e223b29e45e6e370fd058794c20bdbc0f630f9..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamWriter.cpp +++ /dev/null @@ -1,432 +0,0 @@ -// *************************************************************************** -// BamWriter.cpp (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. 
-// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#include <iostream> - -#include "BGZF.h" -#include "BamWriter.h" -using namespace BamTools; -using namespace std; - -struct BamWriter::BamWriterPrivate { - - // data members - BgzfData mBGZF; - bool IsBigEndian; - - // constructor / destructor - BamWriterPrivate(void) { - IsBigEndian = SystemIsBigEndian(); - } - - ~BamWriterPrivate(void) { - mBGZF.Close(); - } - - // "public" interface - void Close(void); - bool Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed); - void SaveAlignment(const BamAlignment& al); - - // internal methods - const unsigned int CalculateMinimumBin(const int begin, int end) const; - void CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar); - void EncodeQuerySequence(const string& query, string& encodedQuery); -}; - -// ----------------------------------------------------- -// BamWriter implementation -// ----------------------------------------------------- - -// constructor -BamWriter::BamWriter(void) { - d = new BamWriterPrivate; -} - -// destructor -BamWriter::~BamWriter(void) { - delete d; - d = 0; -} - -// closes the alignment archive -void BamWriter::Close(void) { - d->Close(); -} - -// opens the alignment archive -bool BamWriter::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { - return d->Open(filename, samHeader, referenceSequences, isWriteUncompressed); -} - -// saves the alignment to the alignment archive -void BamWriter::SaveAlignment(const BamAlignment& al) { - d->SaveAlignment(al); -} - -// ----------------------------------------------------- -// BamWriterPrivate implementation -// ----------------------------------------------------- - -// closes 
the alignment archive -void BamWriter::BamWriterPrivate::Close(void) { - mBGZF.Close(); -} - -// calculates minimum bin for a BAM alignment interval -const unsigned int BamWriter::BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { - --end; - if( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); - if( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); - if( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); - if( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); - if( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); - return 0; -} - -// creates a cigar string from the supplied alignment -void BamWriter::BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) { - - // initialize - const unsigned int numCigarOperations = cigarOperations.size(); - packedCigar.resize(numCigarOperations * BT_SIZEOF_INT); - - // pack the cigar data into the string - unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); - - unsigned int cigarOp; - vector<CigarOp>::const_iterator coIter; - for(coIter = cigarOperations.begin(); coIter != cigarOperations.end(); ++coIter) { - - switch(coIter->Type) { - case 'M': - cigarOp = BAM_CMATCH; - break; - case 'I': - cigarOp = BAM_CINS; - break; - case 'D': - cigarOp = BAM_CDEL; - break; - case 'N': - cigarOp = BAM_CREF_SKIP; - break; - case 'S': - cigarOp = BAM_CSOFT_CLIP; - break; - case 'H': - cigarOp = BAM_CHARD_CLIP; - break; - case 'P': - cigarOp = BAM_CPAD; - break; - default: - printf("ERROR: Unknown cigar operation found: %c\n", coIter->Type); - exit(1); - } - - *pPackedCigar = coIter->Length << BAM_CIGAR_SHIFT | cigarOp; - pPackedCigar++; - } -} - -// encodes the supplied query sequence into 4-bit notation -void BamWriter::BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { - - // prepare the encoded query string - const unsigned int queryLen = query.size(); - const unsigned int encodedQueryLen = 
(unsigned int)((queryLen / 2.0) + 0.5); - encodedQuery.resize(encodedQueryLen); - char* pEncodedQuery = (char*)encodedQuery.data(); - const char* pQuery = (const char*)query.data(); - - unsigned char nucleotideCode; - bool useHighWord = true; - - while(*pQuery) { - - switch(*pQuery) { - - case '=': - nucleotideCode = 0; - break; - - case 'A': - nucleotideCode = 1; - break; - - case 'C': - nucleotideCode = 2; - break; - - case 'G': - nucleotideCode = 4; - break; - - case 'T': - nucleotideCode = 8; - break; - - case 'N': - nucleotideCode = 15; - break; - - default: - printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); - exit(1); - } - - // pack the nucleotide code - if(useHighWord) { - *pEncodedQuery = nucleotideCode << 4; - useHighWord = false; - } else { - *pEncodedQuery |= nucleotideCode; - pEncodedQuery++; - useHighWord = true; - } - - // increment the query position - pQuery++; - } -} - -// opens the alignment archive -bool BamWriter::BamWriterPrivate::Open(const string& filename, const string& samHeader, const RefVector& referenceSequences, bool isWriteUncompressed) { - - // open the BGZF file for writing, return failure if error - if ( !mBGZF.Open(filename, "wb", isWriteUncompressed) ) - return false; - - // ================ - // write the header - // ================ - - // write the BAM signature - const unsigned char SIGNATURE_LENGTH = 4; - const char* BAM_SIGNATURE = "BAM\1"; - mBGZF.Write(BAM_SIGNATURE, SIGNATURE_LENGTH); - - // write the SAM header text length - uint32_t samHeaderLen = samHeader.size(); - if (IsBigEndian) SwapEndian_32(samHeaderLen); - mBGZF.Write((char*)&samHeaderLen, BT_SIZEOF_INT); - - // write the SAM header text - if(samHeaderLen > 0) - mBGZF.Write(samHeader.data(), samHeaderLen); - - // write the number of reference sequences - uint32_t numReferenceSequences = referenceSequences.size(); - if (IsBigEndian) SwapEndian_32(numReferenceSequences); - 
mBGZF.Write((char*)&numReferenceSequences, BT_SIZEOF_INT); - - // ============================= - // write the sequence dictionary - // ============================= - - RefVector::const_iterator rsIter; - for(rsIter = referenceSequences.begin(); rsIter != referenceSequences.end(); rsIter++) { - - // write the reference sequence name length - uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; - if (IsBigEndian) SwapEndian_32(referenceSequenceNameLen); - mBGZF.Write((char*)&referenceSequenceNameLen, BT_SIZEOF_INT); - - // write the reference sequence name - mBGZF.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); - - // write the reference sequence length - int32_t referenceLength = rsIter->RefLength; - if (IsBigEndian) SwapEndian_32(referenceLength); - mBGZF.Write((char*)&referenceLength, BT_SIZEOF_INT); - } - - // return success - return true; -} - -// saves the alignment to the alignment archive -void BamWriter::BamWriterPrivate::SaveAlignment(const BamAlignment& al) { - - // if BamAlignment contains only the core data and a raw char data buffer - // (as a result of BamReader::GetNextAlignmentCore()) - if ( al.SupportData.HasCoreOnly ) { - - // write the block size - unsigned int blockSize = al.SupportData.BlockLength; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (al.Bin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; - buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; - buffer[4] = al.SupportData.QuerySequenceLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the raw char data - 
mBGZF.Write((char*)al.SupportData.AllCharData.data(), al.SupportData.BlockLength-BAM_CORE_SIZE); - } - - // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc - // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) - else { - - // calculate char lengths - const unsigned int nameLength = al.Name.size() + 1; - const unsigned int numCigarOperations = al.CigarData.size(); - const unsigned int queryLength = al.QueryBases.size(); - const unsigned int tagDataLength = al.TagData.size(); - - // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) - // force calculation of Bin before storing - const int endPosition = al.GetEndPosition(); - const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); - - // create our packed cigar string - string packedCigar; - CreatePackedCigar(al.CigarData, packedCigar); - const unsigned int packedCigarLength = packedCigar.size(); - - // encode the query - string encodedQuery; - EncodeQuerySequence(al.QueryBases, encodedQuery); - const unsigned int encodedQueryLength = encodedQuery.size(); - - // write the block size - const unsigned int dataBlockSize = nameLength + packedCigarLength + encodedQueryLength + queryLength + tagDataLength; - unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; - if (IsBigEndian) SwapEndian_32(blockSize); - mBGZF.Write((char*)&blockSize, BT_SIZEOF_INT); - - // assign the BAM core data - uint32_t buffer[8]; - buffer[0] = al.RefID; - buffer[1] = al.Position; - buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; - buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; - buffer[4] = queryLength; - buffer[5] = al.MateRefID; - buffer[6] = al.MatePosition; - buffer[7] = al.InsertSize; - - // swap BAM core endian-ness, if necessary - if ( IsBigEndian ) { - for ( int i = 0; i < 8; ++i ) - SwapEndian_32(buffer[i]); - } - - // write the BAM core - 
mBGZF.Write((char*)&buffer, BAM_CORE_SIZE); - - // write the query name - mBGZF.Write(al.Name.c_str(), nameLength); - - // write the packed cigar - if ( IsBigEndian ) { - - char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); - memcpy(cigarData, packedCigar.data(), packedCigarLength); - - for (unsigned int i = 0; i < packedCigarLength; ++i) { - if ( IsBigEndian ) - SwapEndian_32p(&cigarData[i]); - } - - mBGZF.Write(cigarData, packedCigarLength); - free(cigarData); - } - else - mBGZF.Write(packedCigar.data(), packedCigarLength); - - // write the encoded query sequence - mBGZF.Write(encodedQuery.data(), encodedQueryLength); - - // write the base qualities - string baseQualities(al.Qualities); - char* pBaseQualities = (char*)al.Qualities.data(); - for(unsigned int i = 0; i < queryLength; i++) { - pBaseQualities[i] -= 33; - } - mBGZF.Write(pBaseQualities, queryLength); - - // write the read group tag - if ( IsBigEndian ) { - - char* tagData = (char*)calloc(sizeof(char), tagDataLength); - memcpy(tagData, al.TagData.data(), tagDataLength); - - int i = 0; - while ( (unsigned int)i < tagDataLength ) { - - i += 2; // skip tag type (e.g. 
"RG", "NM", etc) - uint8_t type = toupper(tagData[i]); // lower & upper case letters have same meaning - ++i; // skip value type - - switch (type) { - - case('A') : - case('C') : - ++i; - break; - - case('S') : - SwapEndian_16p(&tagData[i]); - i+=2; // sizeof(uint16_t) - break; - - case('F') : - case('I') : - SwapEndian_32p(&tagData[i]); - i+=4; // sizeof(uint32_t) - break; - - case('D') : - SwapEndian_64p(&tagData[i]); - i+=8; // sizeof(uint64_t) - break; - - case('H') : - case('Z') : - while (tagData[i]) { ++i; } - ++i; // increment one more for null terminator - break; - - default : - printf("ERROR: Invalid tag value type\n"); // shouldn't get here - free(tagData); - exit(1); - } - } - - mBGZF.Write(tagData, tagDataLength); - free(tagData); - } - else - mBGZF.Write(al.TagData.data(), tagDataLength); - } -} diff --git a/src/utils/BamTools/BamWriter.h b/src/utils/BamTools/BamWriter.h deleted file mode 100644 index 20e3ffc2750d266609b3be0d3464bcfa61886a5d..0000000000000000000000000000000000000000 --- a/src/utils/BamTools/BamWriter.h +++ /dev/null @@ -1,52 +0,0 @@ -// *************************************************************************** -// BamWriter.h (c) 2009 Michael Str�mberg, Derek Barnett -// Marth Lab, Department of Biology, Boston College -// All rights reserved. -// --------------------------------------------------------------------------- -// Last modified: 17 August 2010 (DB) -// --------------------------------------------------------------------------- -// Uses BGZF routines were adapted from the bgzf.c code developed at the Broad -// Institute. 
-// --------------------------------------------------------------------------- -// Provides the basic functionality for producing BAM files -// *************************************************************************** - -#ifndef BAMWRITER_H -#define BAMWRITER_H - -// C++ includes -#include <string> - -// BamTools includes -#include "BamAux.h" - -namespace BamTools { - -class BamWriter { - - // constructor/destructor - public: - BamWriter(void); - ~BamWriter(void); - - // public interface - public: - // closes the alignment archive - void Close(void); - // opens the alignment archive - bool Open(const std::string& filename, - const std::string& samHeader, - const BamTools::RefVector& referenceSequences, - bool writeUncompressed = false); - // saves the alignment to the alignment archive - void SaveAlignment(const BamTools::BamAlignment& al); - - // private implementation - private: - struct BamWriterPrivate; - BamWriterPrivate* d; -}; - -} // namespace BamTools - -#endif // BAMWRITER_H diff --git a/src/utils/BamTools/LICENSE b/src/utils/BamTools/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..eaee1fd08118a9b646620a70728532c91026adf9 --- /dev/null +++ b/src/utils/BamTools/LICENSE @@ -0,0 +1,22 @@ +The MIT License + +Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, Michael Stromberg + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/src/utils/BamTools/Makefile b/src/utils/BamTools/Makefile index 2df3843649ad55ed99a23a83539f66e5435495ae..2e01ce06834256f255584840ed280d002afc91f6 100644 --- a/src/utils/BamTools/Makefile +++ b/src/utils/BamTools/Makefile @@ -1,26 +1,59 @@ -OBJ_DIR = ../../../obj/ -BIN_DIR = ../../../bin/ -UTILITIES_DIR = ../ - -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/gzstream/ -I$(UTILITIES_DIR)/fileType/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= BamReader.cpp BamWriter.cpp BGZF.cpp BamAncillary.cpp BamIndex.cpp -OBJECTS= $(SOURCES:.cpp=.o) -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) - -all: $(BUILT_OBJECTS) - -.PHONY: all - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -clean: - @echo "Cleaning up." 
- @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* - +# ------------------- +# define our includes +# ------------------- +OBJ_DIR = ../../../obj/ +INCLUDES = -Isrc/ -Iinclude/ + +# ---------------------------------- +# define our source and object files +# ---------------------------------- + +SOURCES= src/api/BamAlignment.cpp \ + src/api/BamMultiReader.cpp \ + src/api/BamReader.cpp \ + src/api/BamWriter.cpp \ + src/api/SamHeader.cpp \ + src/api/SamProgram.cpp \ + src/api/SamProgramChain.cpp \ + src/api/SamReadGroup.cpp \ + src/api/SamReadGroupDictionary.cpp \ + src/api/SamSequence.cpp \ + src/api/SamSequenceDictionary.cpp \ + src/api/internal/BamHeader_p.cpp \ + src/api/internal/BamIndexFactory_p.cpp \ + src/api/internal/BamMultiReader_p.cpp \ + src/api/internal/BamRandomAccessController_p.cpp \ + src/api/internal/BamReader_p.cpp \ + src/api/internal/BamStandardIndex_p.cpp \ + src/api/internal/BamToolsIndex_p.cpp \ + src/api/internal/BamWriter_p.cpp \ + src/api/internal/BgzfStream_p.cpp \ + src/api/internal/SamFormatParser_p.cpp \ + src/api/internal/SamFormatPrinter_p.cpp \ + src/api/internal/SamHeaderValidator_p.cpp + +# ---------------------------------- +# define our source and object files +# ---------------------------------- +OBJECTS= $(SOURCES:.cpp=.o) +LIBRARY=libbamtools.a + +all: $(LIBRARY) + +.PHONY: all + +$(LIBRARY): $(OBJECTS) + @echo " * linking $(LIBRARY)" + ar cr lib/$@ $^ + +$(OBJECTS): $(SOURCES) + @echo " * compiling" $(*F).cpp + @$(CXX) -c $(*D)/$(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) -o $(*D)/$(*F).o + +clean: + @echo "Cleaning up." 
+ @rm -f lib/* + @rm -f src/api/*.o + @rm -f src/api/internal/*.o + .PHONY: clean \ No newline at end of file diff --git a/src/utils/BamTools/README b/src/utils/BamTools/README new file mode 100644 index 0000000000000000000000000000000000000000..1780ede32a7abf818f2a5cf725e0e5a9ade71b1e --- /dev/null +++ b/src/utils/BamTools/README @@ -0,0 +1,61 @@ +-------------------------------------------------------------------------------- +README : BAMTOOLS +-------------------------------------------------------------------------------- + +BamTools provides both a programmer's API and an end-user's toolkit for handling +BAM files. + +I. Learn More + +II. License + +III. Acknowledgements + +IV. Contact + +-------------------------------------------------------------------------------- +I. Learn More: +-------------------------------------------------------------------------------- + +Installation steps, tutorial, API documentation, etc. are all now available +through the BamTools project wiki: + +https://github.com/pezmaster31/bamtools/wiki + +Join the mailing list(s) to stay informed of updates or get involved with +contributing: + +https://github.com/pezmaster31/bamtools/wiki/Mailing-lists + +-------------------------------------------------------------------------------- +II. License : +-------------------------------------------------------------------------------- + +Both the BamTools API and toolkit are released under the MIT License. +Copyright (c) 2009-2010 Derek Barnett, Erik Garrison, Gabor Marth, + Michael Stromberg + +See included file LICENSE for details. + +-------------------------------------------------------------------------------- +III. Acknowledgements : +-------------------------------------------------------------------------------- + + * Aaron Quinlan for several key feature ideas and bug fix contributions + * Baptiste Lepilleur for the public-domain JSON parser (JsonCPP) + * Heng Li, author of SAMtools - the original C-language BAM API/toolkit. 
+ +-------------------------------------------------------------------------------- +IV. Contact : +-------------------------------------------------------------------------------- + +Feel free to contact me with any questions, comments, suggestions, bug reports, + etc. + +Derek Barnett +Marth Lab +Biology Dept., Boston College + +Email: barnetde@bc.edu +Project Websites: http://github.com/pezmaster31/bamtools (ACTIVE SUPPORT) + http://sourceforge.net/projects/bamtools (major updates only) diff --git a/src/utils/BamTools/src/api/BamAlignment.cpp b/src/utils/BamTools/src/api/BamAlignment.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c6665598a01159a0d20ab89d3998b15c4222543f --- /dev/null +++ b/src/utils/BamTools/src/api/BamAlignment.cpp @@ -0,0 +1,2402 @@ +// *************************************************************************** +// BamAlignment.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#include <api/BamAlignment.h> +#include <api/BamConstants.h> +using namespace BamTools; + +#include <cctype> +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <exception> +#include <iostream> +#include <map> +#include <utility> +using namespace std; + +/*! \class BamTools::BamAlignment + \brief The main BAM alignment data structure. + + Provides methods to query/modify BAM alignment data fields. +*/ +/*! \var BamAlignment::Name + \brief read name +*/ +/*! \var BamAlignment::Length + \brief length of query sequence +*/ +/*! \var BamAlignment::QueryBases + \brief 'original' sequence (as reported from sequencing machine) +*/ +/*! 
\var BamAlignment::AlignedBases
    \brief 'aligned' sequence (includes any indels, padding, clipping)
*/
/*! \var BamAlignment::Qualities
    \brief FASTQ qualities (ASCII characters, not numeric values)
*/
/*! \var BamAlignment::TagData
    \brief tag data (use the provided methods to query/modify)
*/
/*! \var BamAlignment::RefID
    \brief ID number for reference sequence
*/
/*! \var BamAlignment::Position
    \brief position (0-based) where alignment starts
*/
/*! \var BamAlignment::Bin
    \brief BAM (standard) index bin number for this alignment
*/
/*! \var BamAlignment::MapQuality
    \brief mapping quality score
*/
/*! \var BamAlignment::AlignmentFlag
    \brief alignment bit-flag (use the provided methods to query/modify)
*/
/*! \var BamAlignment::CigarData
    \brief CIGAR operations for this alignment
*/
/*! \var BamAlignment::MateRefID
    \brief ID number for reference sequence where alignment's mate was aligned
*/
/*! \var BamAlignment::MatePosition
    \brief position (0-based) where alignment's mate starts
*/
/*! \var BamAlignment::InsertSize
    \brief mate-pair insert size
*/
/*! \var BamAlignment::Filename
    \brief name of BAM file which this alignment comes from
*/

/*! \fn BamAlignment::BamAlignment(void)
    \brief Default constructor.

    Sets RefID, Position, MateRefID and MatePosition to -1 and every other
    numeric core field (Length, Bin, MapQuality, AlignmentFlag, InsertSize)
    to 0. String and container members are default-constructed (empty).

    Length, Bin, MapQuality and AlignmentFlag are zeroed explicitly because
    the copy constructor reads them; leaving them uninitialized would make
    copying a default-constructed alignment read indeterminate values.
*/
BamAlignment::BamAlignment(void)
    // initializer order mirrors the copy constructor's member order
    : Length(0)
    , RefID(-1)
    , Position(-1)
    , Bin(0)
    , MapQuality(0)
    , AlignmentFlag(0)
    , MateRefID(-1)
    , MatePosition(-1)
    , InsertSize(0)
{ }

/*! 
\fn BamAlignment::BamAlignment(const BamAlignment& other) + \brief copy constructor +*/ +BamAlignment::BamAlignment(const BamAlignment& other) + : Name(other.Name) + , Length(other.Length) + , QueryBases(other.QueryBases) + , AlignedBases(other.AlignedBases) + , Qualities(other.Qualities) + , TagData(other.TagData) + , RefID(other.RefID) + , Position(other.Position) + , Bin(other.Bin) + , MapQuality(other.MapQuality) + , AlignmentFlag(other.AlignmentFlag) + , CigarData(other.CigarData) + , MateRefID(other.MateRefID) + , MatePosition(other.MatePosition) + , InsertSize(other.InsertSize) + , Filename(other.Filename) + , SupportData(other.SupportData) +{ } + +/*! \fn BamAlignment::~BamAlignment(void) + \brief destructor +*/ +BamAlignment::~BamAlignment(void) { } + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) + \brief Adds a field with string data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param type 1-character tag type (must be "Z" or "H") + \param value string data to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const std::string& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for string value + if ( !IsValidSize(tag, type) ) return false; + if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING && + type.at(0) != Constants::BAM_TAG_TYPE_HEX + ) + { + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // otherwise, copy tag data to temp buffer + string newTag = tag + type + value; + const int newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); // removes original null-term, appends newTag + null-term + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) + \brief Adds a field with unsigned integer data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. + + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "f", "Z", "H", or "B") + \param value unsigned int data to store + + \return \c true if the \b new tag was added successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const uint32_t& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for uint32_t value + if ( !IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT || + type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { + return false; + } + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag already exists, return false + // use EditTag explicitly instead + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // otherwise, convert value to string + union { uint32_t value; char valueBuffer[sizeof(uint32_t)]; } un; + un.value = value; + + // copy original tag data to temp buffer + string newTag = tag + type; + const int newTagDataLength = tagDataLength + newTag.size() + 4; // leave room for new integer + char originalTagData[newTagDataLength]; + memcpy(originalTagData, TagData.c_str(), tagDataLength + 1); // '+1' for TagData null-term + + // append newTag + strcat(originalTagData + tagDataLength, newTag.data()); + memcpy(originalTagData + tagDataLength + newTag.size(), un.valueBuffer, sizeof(uint32_t)); + + // store temp buffer back in TagData + const char* newTagData = (const char*)originalTagData; + TagData.assign(newTagData, newTagDataLength); + + // return success + return true; +} + +/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) + \brief Adds a field with signed integer data to the BAM tags. + + Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead. 
+
+    \param tag   2-character tag name
+    \param type  1-character tag type (must NOT be "f", "Z", "H", or "B")
+    \param value signed int data to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const int32_t& value) {
+    // same bit pattern, stored through the unsigned overload;
+    // a value cast avoids reinterpreting the reference itself
+    return AddTag(tag, type, static_cast<uint32_t>(value));
+}
+
+/*! \fn bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value)
+    \brief Adds a field with floating-point data to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag   2-character tag name
+    \param type  1-character tag type (must NOT be "Z", "H", or "B")
+    \param value float data to store
+
+    \note The payload is always the sizeof(float) bytes of \a value, whatever type code is given.
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const float& value) {
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // validate tag/type size & that type is OK for float value
+    if ( !IsValidSize(tag, type) ) return false;
+    const char typeCode = type.at(0);
+    if ( typeCode == Constants::BAM_TAG_TYPE_STRING ||
+         typeCode == Constants::BAM_TAG_TYPE_HEX    ||
+         typeCode == Constants::BAM_TAG_TYPE_ARRAY
+       )
+    {
+        return false;
+    }
+    // NOTE(review): integer type codes are still accepted here, as before, and
+    // receive a 4-byte float payload - confirm whether callers rely on that or
+    // whether only the float type code should be allowed.
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // append <tag><type><float bytes> directly to TagData; this replaces the
+    // variable-length stack array + strcat() sequence with length-explicit
+    // appends, so nothing depends on terminators or on stack space
+    TagData.reserve(tagDataLength + tag.size() + type.size() + sizeof(float));
+    TagData.append(tag);
+    TagData.append(type);
+    TagData.append((const char*)&value, sizeof(float));
+
+    // return success
+    return true;
+}
+
+/*! \fn bool AddTag(const std::string& tag, const std::vector<uint8_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of uint8_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint8_t>& values) {
+
+    // Appends a new numeric-array tag record to TagData.
+    // Bytes appended: <tag><array marker><element type><int32 count><elements>.
+    // Returns false, leaving TagData untouched, when the character data has not
+    // been parsed, when the tag name has the wrong length, or when the tag exists.
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_UINT8;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length. It is binary data, not a C
+    // string: strcat() stopped at the first zero byte of the element count
+    // (leaving the remaining count bytes uninitialised) or, when no byte was
+    // zero, kept reading past the end of newTagBase.
+    const size_t elementBytes = values.size() * sizeof(uint8_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag (vector storage is contiguous)
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*!
\fn bool AddTag(const std::string& tag, const std::vector<int8_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of int8_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<int8_t>& values) {
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_INT8;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length. It is binary data, not a C
+    // string: strcat() stopped at the first zero byte of the element count
+    // (leaving the remaining count bytes uninitialised) or, when no byte was
+    // zero, kept reading past the end of newTagBase.
+    const size_t elementBytes = values.size() * sizeof(int8_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag (vector storage is contiguous)
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*! \fn bool AddTag(const std::string& tag, const std::vector<uint16_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of uint16_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint16_t>& values) {
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_UINT16;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length (binary data, not a C string;
+    // strcat() truncated or over-read it depending on the count's bytes).
+    const size_t elementBytes = values.size() * sizeof(uint16_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag, in host byte order as before
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*! \fn bool AddTag(const std::string& tag, const std::vector<int16_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of int16_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<int16_t>& values) {
+
+    // Appends a new numeric-array tag record to TagData.
+    // Bytes appended: <tag><array marker><element type><int32 count><elements>.
+    // Returns false, leaving TagData untouched, when the character data has not
+    // been parsed, when the tag name has the wrong length, or when the tag exists.
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_INT16;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length. It is binary data, not a C
+    // string: strcat() stopped at the first zero byte of the element count
+    // (leaving the remaining count bytes uninitialised) or, when no byte was
+    // zero, kept reading past the end of newTagBase.
+    const size_t elementBytes = values.size() * sizeof(int16_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag, in host byte order as before
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*!
\fn bool AddTag(const std::string& tag, const std::vector<uint32_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of uint32_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<uint32_t>& values) {
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_UINT32;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length. It is binary data, not a C
+    // string: strcat() stopped at the first zero byte of the element count
+    // (leaving the remaining count bytes uninitialised) or, when no byte was
+    // zero, kept reading past the end of newTagBase.
+    const size_t elementBytes = values.size() * sizeof(uint32_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag, in host byte order as before
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*! \fn bool AddTag(const std::string& tag, const std::vector<int32_t>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of int32_t values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<int32_t>& values) {
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_INT32;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length (binary data, not a C string;
+    // strcat() truncated or over-read it depending on the count's bytes).
+    const size_t elementBytes = values.size() * sizeof(int32_t);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag, in host byte order as before
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*! \fn bool AddTag(const std::string& tag, const std::vector<float>& values);
+    \brief Adds a numeric array field to the BAM tags.
+
+    Does NOT modify an existing tag - use \link BamAlignment::EditTag() \endlink instead.
+
+    \param tag    2-character tag name
+    \param values vector of float values to store
+
+    \return \c true if the \b new tag was added successfully
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::AddTag(const std::string& tag, const std::vector<float>& values) {
+
+    // Appends a new numeric-array tag record to TagData.
+    // Bytes appended: <tag><array marker><element type><int32 count><elements>.
+    // Returns false, leaving TagData untouched, when the character data has not
+    // been parsed, when the tag name has the wrong length, or when the tag exists.
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // check for valid tag length
+    if ( tag.size() != Constants::BAM_TAG_TAGSIZE ) return false;
+
+    // localize the tag data
+    char* pTagData = (char*)TagData.data();
+    const unsigned int tagDataLength = TagData.size();
+    unsigned int numBytesParsed = 0;
+
+    // if tag already exists, return false
+    // use EditTag explicitly instead
+    if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) )
+        return false;
+
+    // build new tag's base information
+    char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
+    memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
+    newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
+    newTagBase[3] = Constants::BAM_TAG_TYPE_FLOAT;
+
+    // add number of array elements to newTagBase
+    const int32_t numElements = values.size();
+    memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
+
+    // Append the header with an explicit length. It is binary data, not a C
+    // string: strcat() stopped at the first zero byte of the element count
+    // (leaving the remaining count bytes uninitialised) or, when no byte was
+    // zero, kept reading past the end of newTagBase.
+    const size_t elementBytes = values.size() * sizeof(float);
+    TagData.reserve(tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE + elementBytes);
+    TagData.append(newTagBase, Constants::BAM_TAG_ARRAYBASE_SIZE);
+
+    // add vector elements to tag, in host representation as before
+    if ( !values.empty() )
+        TagData.append((const char*)&values[0], elementBytes);
+
+    // return success
+    return true;
+}
+
+/*!
\fn bool BamAlignment::BuildCharData(void)
+    \brief Populates alignment string fields (read name, bases, qualities, tag data).
+
+    An alignment retrieved using BamReader::GetNextAlignmentCore() lacks this data.
+    Using that method makes parsing much quicker when only positional data is required.
+
+    However, if you later want to access the character data fields from such an alignment,
+    use this method to populate those fields. Provides ability to do 'lazy evaluation' of
+    alignment parsing.
+
+    On a big-endian system the multi-byte tag values are byte-swapped in place, inside
+    SupportData.AllCharData, before being copied into TagData.
+
+    \return \c true if character data populated successfully (or was already available to begin with);
+            \c false if an unknown binary array element type is met while byte-swapping tags
+            (the core-only flag is then left set). An invalid CIGAR operation or an invalid
+            tag value type prints an error and calls exit(1).
+*/
+bool BamAlignment::BuildCharData(void) {
+
+    // skip if char data already parsed
+    if ( !SupportData.HasCoreOnly )
+        return true;
+
+    // check system endianness
+    bool IsBigEndian = BamTools::SystemIsBigEndian();
+
+    // calculate character lengths/offsets
+    // (offsets are relative to the start of AllCharData: name, then 4 bytes per
+    //  CIGAR op, then bases packed two per byte, then one quality byte per base)
+    const unsigned int dataLength     = SupportData.BlockLength - Constants::BAM_CORE_SIZE;
+    const unsigned int seqDataOffset  = SupportData.QueryNameLength + (SupportData.NumCigarOperations * 4);
+    const unsigned int qualDataOffset = seqDataOffset + (SupportData.QuerySequenceLength+1)/2;
+    const unsigned int tagDataOffset  = qualDataOffset + SupportData.QuerySequenceLength;
+    // unsigned arithmetic: only meaningful when hasTagData (below) is true
+    const unsigned int tagDataLength  = dataLength - tagDataOffset;
+
+    // check offsets to see what char data exists
+    const bool hasSeqData  = ( seqDataOffset  < dataLength );
+    const bool hasQualData = ( qualDataOffset < dataLength );
+    const bool hasTagData  = ( tagDataOffset  < dataLength );
+
+    // set up char buffers
+    // (tagData drops constness because the big-endian branch swaps bytes in place)
+    const char* allCharData = SupportData.AllCharData.data();
+    const char* seqData     = ( hasSeqData  ? (((const char*)allCharData) + seqDataOffset)  : (const char*)0 );
+    const char* qualData    = ( hasQualData ? (((const char*)allCharData) + qualDataOffset) : (const char*)0 );
+          char* tagData     = ( hasTagData  ? (((char*)allCharData) + tagDataOffset)        : (char*)0 );
+
+    // store alignment name (relies on null char in name as terminator)
+    Name.assign((const char*)(allCharData));
+
+    // save query sequence
+    // (even i reads the high nibble of its byte, odd i the low nibble;
+    //  the nibble indexes the base lookup table)
+    QueryBases.clear();
+    if ( hasSeqData ) {
+        QueryBases.reserve(SupportData.QuerySequenceLength);
+        for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) {
+            char singleBase = Constants::BAM_DNA_LOOKUP[ ( (seqData[(i/2)] >> (4*(1-(i%2)))) & 0xf ) ];
+            QueryBases.append(1, singleBase);
+        }
+    }
+
+    // save qualities, converting from numeric QV to 'FASTQ-style' ASCII character
+    // (adds the offset 33 to each stored quality byte)
+    Qualities.clear();
+    if ( hasQualData ) {
+        Qualities.reserve(SupportData.QuerySequenceLength);
+        for (unsigned int i = 0; i < SupportData.QuerySequenceLength; ++i) {
+            char singleQuality = (char)(qualData[i]+33);
+            Qualities.append(1, singleQuality);
+        }
+    }
+
+    // clear previous AlignedBases
+    AlignedBases.clear();
+
+    // if QueryBases has data, build AlignedBases using CIGAR data
+    // otherwise, AlignedBases will remain empty (this case IS allowed)
+    if ( !QueryBases.empty() ) {
+
+        // reserve room in AlignedBases (capacity hint only; it grows as needed)
+        AlignedBases.reserve(SupportData.QuerySequenceLength);
+
+        // iterate over CigarOps; k is the read position within QueryBases
+        int k = 0;
+        vector<CigarOp>::const_iterator cigarIter = CigarData.begin();
+        vector<CigarOp>::const_iterator cigarEnd  = CigarData.end();
+        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
+            const CigarOp& op = (*cigarIter);
+
+            switch (op.Type) {
+
+                // for 'M', 'I', '=', 'X' - write bases
+                case (Constants::BAM_CIGAR_MATCH_CHAR)    :
+                case (Constants::BAM_CIGAR_INS_CHAR)      :
+                case (Constants::BAM_CIGAR_SEQMATCH_CHAR) :
+                case (Constants::BAM_CIGAR_MISMATCH_CHAR) :
+                    AlignedBases.append(QueryBases.substr(k, op.Length));
+                    // fall through (deliberate: the bases just written are consumed below)
+
+                // for 'S' - soft clip, do not write bases
+                // but increment placeholder 'k'
+                case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) :
+                    k += op.Length;
+                    break;
+
+                // for 'D' - write gap character
+                case (Constants::BAM_CIGAR_DEL_CHAR) :
+                    AlignedBases.append(op.Length, Constants::BAM_DNA_DEL);
+                    break;
+
+                // for 'P' - write padding character
+                case (Constants::BAM_CIGAR_PAD_CHAR) :
+                    AlignedBases.append( op.Length, Constants::BAM_DNA_PAD );
+                    break;
+
+                // for 'N' - write N's; k is not advanced, no query bases are consumed
+                case (Constants::BAM_CIGAR_REFSKIP_CHAR) :
+                    AlignedBases.append( op.Length, Constants::BAM_DNA_N );
+                    break;
+
+                // for 'H' - hard clip, do nothing to AlignedBases, move to next op
+                case (Constants::BAM_CIGAR_HARDCLIP_CHAR) :
+                    break;
+
+                // shouldn't get here; terminates the whole process
+                default:
+                    cerr << "BamAlignment ERROR: invalid CIGAR operation type: "
+                         << op.Type << endl;
+                    exit(1);
+            }
+        }
+    }
+
+    // save tag data
+    TagData.clear();
+    if ( hasTagData ) {
+
+        // on big-endian hosts, walk the tag records and swap every multi-byte
+        // value in place; i is the byte offset within tagData
+        // NOTE(review): the walk trusts the record contents - no check that a
+        // value, string terminator or array stays within tagDataLength
+        if ( IsBigEndian ) {
+            int i = 0;
+            while ( (unsigned int)i < tagDataLength ) {
+
+                i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.)
+                const char type = tagData[i];    // get tag type at position i
+                ++i;                             // move i past tag type
+
+                switch (type) {
+
+                    case(Constants::BAM_TAG_TYPE_ASCII) :
+                    case(Constants::BAM_TAG_TYPE_INT8)  :
+                    case(Constants::BAM_TAG_TYPE_UINT8) :
+                        // no endian swapping necessary for single-byte data
+                        ++i;
+                        break;
+
+                    case(Constants::BAM_TAG_TYPE_INT16)  :
+                    case(Constants::BAM_TAG_TYPE_UINT16) :
+                        BamTools::SwapEndian_16p(&tagData[i]);
+                        i += sizeof(uint16_t);
+                        break;
+
+                    case(Constants::BAM_TAG_TYPE_FLOAT)  :
+                    case(Constants::BAM_TAG_TYPE_INT32)  :
+                    case(Constants::BAM_TAG_TYPE_UINT32) :
+                        BamTools::SwapEndian_32p(&tagData[i]);
+                        i += sizeof(uint32_t);
+                        break;
+
+                    case(Constants::BAM_TAG_TYPE_HEX) :
+                    case(Constants::BAM_TAG_TYPE_STRING) :
+                        // no endian swapping necessary for hex-string/string data
+                        while ( tagData[i] )
+                            ++i;
+                        // increment one more for null terminator
+                        ++i;
+                        break;
+
+                    case(Constants::BAM_TAG_TYPE_ARRAY) :
+
+                    {
+                        // read array type
+                        const char arrayType = tagData[i];
+                        ++i;
+
+                        // swap endian-ness of number of elements in place, then retrieve for loop
+                        BamTools::SwapEndian_32p(&tagData[i]);
+                        int32_t numElements;
+                        memcpy(&numElements, &tagData[i], sizeof(uint32_t));
+                        i += sizeof(uint32_t);
+
+                        // swap endian-ness of array elements
+                        for ( int j = 0; j < numElements; ++j ) {
+                            switch (arrayType) {
+                                case (Constants::BAM_TAG_TYPE_INT8)  :
+                                case (Constants::BAM_TAG_TYPE_UINT8) :
+                                    // no endian-swapping necessary
+                                    ++i;
+                                    break;
+                                case (Constants::BAM_TAG_TYPE_INT16)  :
+                                case (Constants::BAM_TAG_TYPE_UINT16) :
+                                    BamTools::SwapEndian_16p(&tagData[i]);
+                                    i += sizeof(uint16_t);
+                                    break;
+                                case (Constants::BAM_TAG_TYPE_FLOAT)  :
+                                case (Constants::BAM_TAG_TYPE_INT32)  :
+                                case (Constants::BAM_TAG_TYPE_UINT32) :
+                                    BamTools::SwapEndian_32p(&tagData[i]);
+                                    i += sizeof(uint32_t);
+                                    break;
+                                default:
+                                    // error case: returns with HasCoreOnly still set and
+                                    // with the tags handled so far already swapped in place
+                                    cerr << "BamAlignment ERROR: unknown binary array type encountered: "
+                                         << arrayType << endl;
+                                    return false;
+                            }
+                        }
+
+                        break;
+                    }
+
+                    // shouldn't get here; terminates the whole process
+                    default :
+                        cerr << "BamAlignment ERROR: invalid tag value type: "
+                             << type << endl;
+                        exit(1);
+                }
+            }
+        }
+
+        // store tagData in alignment (raw byte copy, embedded NULs included)
+        TagData.resize(tagDataLength);
+        memcpy((char*)TagData.data(), tagData, tagDataLength);
+    }
+
+    // clear the core-only flag
+    SupportData.HasCoreOnly = false;
+
+    // return success
+    return true;
+}
+
+/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value)
+    \brief Edits a BAM tag field containing string data.
+
+    If \a tag does not exist, a new entry is created.
+
+    \param tag   2-character tag name
+    \param type  1-character tag type (must be "Z" or "H")
+    \param value string data to store
+
+    \return \c true if the tag was modified/created successfully
+
+    \sa BamAlignment::RemoveTag()
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const std::string& value) {
+
+    // Replaces (or creates) a string/hex-string tag by removing any existing
+    // record for 'tag' and appending a fresh one - the same remove-then-add
+    // strategy the numeric-array overloads of EditTag use. An edited tag
+    // therefore ends up as the last record in TagData.
+    //
+    // This replaces an in-place splice through a variable-length stack array
+    // that could store its terminator outside the buffer when the old value
+    // was shorter than two characters, and that returned true without saving
+    // anything when SkipToNextTag() failed.
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // validate tag/type size & that type is OK for string value
+    // (done before removing anything, so a rejected edit leaves TagData intact)
+    if ( !IsValidSize(tag, type) ) return false;
+    if ( type.at(0) != Constants::BAM_TAG_TYPE_STRING &&
+         type.at(0) != Constants::BAM_TAG_TYPE_HEX )
+        return false;
+
+    // remove existing tag if present
+    if ( HasTag(tag) )
+        RemoveTag(tag);
+
+    // add tag record with new value
+    return AddTag(tag, type, value);
+}
+
+/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value)
+    \brief Edits a BAM tag field containing unsigned integer data.
+
+    If \a tag does not exist, a new entry is created.
+    An existing entry is removed and re-added, so it is stored with \a type and
+    becomes the last tag of the alignment.
+
+    \param tag   2-character tag name
+    \param type  1-character tag type (must NOT be "f", "Z", "H", or "B")
+    \param value unsigned integer data to store
+
+    \return \c true if the tag was modified/created successfully
+
+    \sa BamAlignment::RemoveTag()
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const uint32_t& value) {
+
+    // The former in-place edit always overwrote four bytes at the value
+    // position and left the stored type byte alone, so 'type' was ignored and
+    // a narrower stored type ended up with stray bytes after it. It also
+    // shared the out-of-bounds terminator store and the "return true without
+    // saving" path of the string overload. Removing and re-adding avoids all
+    // of that.
+
+    // skip if core data not parsed
+    if ( SupportData.HasCoreOnly ) return false;
+
+    // validate tag/type size & that type is OK for uint32_t value
+    // (done before removing anything, so a rejected edit leaves TagData intact)
+    if ( !IsValidSize(tag, type) ) return false;
+    if ( type.at(0) == Constants::BAM_TAG_TYPE_FLOAT  ||
+         type.at(0) == Constants::BAM_TAG_TYPE_STRING ||
+         type.at(0) == Constants::BAM_TAG_TYPE_HEX    ||
+         type.at(0) == Constants::BAM_TAG_TYPE_ARRAY
+       )
+    {
+        return false;
+    }
+
+    // remove existing tag if present
+    if ( HasTag(tag) )
+        RemoveTag(tag);
+
+    // add tag record with new value
+    return AddTag(tag, type, value);
+}
+
+/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value)
+    \brief Edits a BAM tag field containing signed integer data.
+
+    If \a tag does not exist, a new entry is created.
+
+    \param tag   2-character tag name
+    \param type  1-character tag type (must NOT be "f", "Z", "H", or "B")
+    \param value signed integer data to store
+
+    \return \c true if the tag was modified/created successfully
+
+    \sa BamAlignment::RemoveTag()
+    \sa \samSpecURL for more details on reserved tag names, supported tag types, etc.
+*/
+bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const int32_t& value) {
+    // same bit pattern, handled by the unsigned overload;
+    // a value cast avoids reinterpreting the reference itself
+    return EditTag(tag, type, static_cast<uint32_t>(value));
+}
+
+/*! \fn bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value)
+    \brief Edits a BAM tag field containing floating-point data.
+
+    If \a tag does not exist, a new entry is created.
+ + \param tag 2-character tag name + \param type 1-character tag type (must NOT be "Z", "H", or "B") + \param value float data to store + + \return \c true if the tag was modified/created successfully + + \sa BamAlignment::RemoveTag() + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const float& value) { + + // skip if core data not parsed + if ( SupportData.HasCoreOnly ) return false; + + // validate tag/type size & that type is OK for float value + if ( !IsValidSize(tag, type) ) return false; + if ( type.at(0) == Constants::BAM_TAG_TYPE_STRING || + type.at(0) == Constants::BAM_TAG_TYPE_HEX || + type.at(0) == Constants::BAM_TAG_TYPE_ARRAY + ) + { + return false; + } + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + // make sure array is more than big enough + char newTagData[originalTagDataLength + sizeof(value)]; + + // copy original tag data up til desired tag + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // copy new @value in place of current tag data + union { float value; char valueBuffer[sizeof(float)]; } un; + un.value = value; + memcpy(newTagData + beginningTagDataLength, un.valueBuffer, sizeof(float)); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData - 1; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = 
(numBytesParsed - beginningTagDataLength); + const unsigned int endTagOffset = beginningTagDataLength + sizeof(float); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + endTagOffset, pTagData, endTagDataLength); + + // ensure null-terminator + newTagData[ endTagOffset + endTagDataLength + 1 ] = 0; + + // save new tag data + TagData.assign(newTagData, endTagOffset + endTagDataLength); + return true; + } + + // tag not found, attempt AddTag + else return AddTag(tag, type, value); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<uint8_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint8_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint8_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<int8_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int8_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<int8_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<uint16_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint16_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint16_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<int16_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int16_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<int16_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! 
\fn bool EditTag(const std::string& tag, const std::vector<uint32_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of uint32_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<uint32_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<int32_t>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of int32_t values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<int32_t>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool EditTag(const std::string& tag, const std::vector<float>& values); + \brief Edits a BAM tag field containing a numeric array. + + If \a tag does not exist, a new entry is created. + + \param tag 2-character tag name + \param value vector of float values to store + + \return \c true if the tag was modified/created successfully + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. 
+*/ +bool BamAlignment::EditTag(const std::string& tag, const std::vector<float>& values) { + + // can't do anything if TagData not parsed + if ( SupportData.HasCoreOnly ) + return false; + + // remove existing tag if present + if ( HasTag(tag) ) + RemoveTag(tag); + + // add tag record with new values + return AddTag(tag, values); +} + +/*! \fn bool BamAlignment::FindTag(const std::string& tag, char*& pTagData, const unsigned int& tagDataLength, unsigned int& numBytesParsed) + \internal + + Searches for requested tag in BAM tag data. + + \param tag requested 2-character tag name + \param pTagData pointer to current position in BamAlignment::TagData + \param tagDataLength length of BamAlignment::TagData + \param numBytesParsed number of bytes parsed so far + + \return \c true if found + + \post If \a tag is found, \a pTagData will point to the byte where the tag data begins. + \a numBytesParsed will correspond to the position in the full TagData string. + +*/ +bool BamAlignment::FindTag(const std::string& tag, + char*& pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) const +{ + + while ( numBytesParsed < tagDataLength ) { + + const char* pTagType = pTagData; + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + + // check the current tag, return true on match + if ( strncmp(pTagType, tag.c_str(), 2) == 0 ) + return true; + + // get the storage class and find the next tag + if ( *pTagStorageType == '\0' ) return false; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) return false; + if ( *pTagData == '\0' ) return false; + } + + // checked all tags, none match + return false; +} + +/*! \fn bool BamAlignment::GetEditDistance(uint32_t& editDistance) const + \brief Retrieves value of edit distance tag ("NM"). 
+ + \deprecated Instead use BamAlignment::GetTag() + \code + BamAlignment::GetTag("NM", editDistance); + \endcode + + \param editDistance destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetEditDistance(uint32_t& editDistance) const { + return GetTag("NM", (uint32_t&)editDistance); +} + +/*! \fn int BamAlignment::GetEndPosition(bool usePadded = false, bool zeroBased = true) const + \brief Calculates alignment end position, based on starting position and CIGAR data. + + \param usePadded Inserted bases affect reported position. Default is false, so that reported + position stays 'sync-ed' with reference coordinates. + \param zeroBased Return (BAM standard) 0-based coordinate. Setting this to false can be useful + when using BAM data with half-open formats (e.g. BED). + + \return alignment end position +*/ +int BamAlignment::GetEndPosition(bool usePadded, bool zeroBased) const { + + // initialize alignment end to starting position + int alignEnd = Position; + + // iterate over cigar operations + vector<CigarOp>::const_iterator cigarIter = CigarData.begin(); + vector<CigarOp>::const_iterator cigarEnd = CigarData.end(); + for ( ; cigarIter != cigarEnd; ++cigarIter) { + const char cigarType = (*cigarIter).Type; + const uint32_t& cigarLength = (*cigarIter).Length; + + if ( cigarType == Constants::BAM_CIGAR_MATCH_CHAR || + cigarType == Constants::BAM_CIGAR_DEL_CHAR || + cigarType == Constants::BAM_CIGAR_REFSKIP_CHAR ) + alignEnd += cigarLength; + else if ( usePadded && cigarType == Constants::BAM_CIGAR_INS_CHAR ) + alignEnd += cigarLength; + } + + // adjust for zero-based coordinates, if requested + if ( zeroBased ) alignEnd -= 1; + + // return result + return alignEnd; +} + +/*! \fn bool BamAlignment::GetReadGroup(std::string& readGroup) const + \brief Retrieves value of read group tag ("RG"). 
+ + \deprecated Instead use BamAlignment::GetTag() + \code + BamAlignment::GetTag("RG", readGroup); + \endcode + + \param readGroup destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetReadGroup(std::string& readGroup) const { + return GetTag("RG", readGroup); +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const + \brief Retrieves the string value associated with a BAM tag. + + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::string& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + const unsigned int dataLength = strlen(pTagData); + destination.clear(); + destination.resize(dataLength); + memcpy( (char*)destination.data(), pTagData, dataLength ); + return true; + } + + // tag not found, return failure + return false; +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const + \brief Retrieves the unsigned integer value associated with a BAM tag. 
+ + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, uint32_t& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + + // 1 byte data + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + destinationLength = 1; + break; + + // 2 byte data + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + destinationLength = 2; + break; + + // 4 byte data + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + destinationLength = 4; + break; + + // unsupported type for integer destination (float or var-length strings) + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: cannot store tag of type " << type + << " in integer destination" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; + return false; + } + + // store in destination + destination = 0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const + \brief Retrieves the signed integer value associated with a BAM tag. 
+ + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, int32_t& destination) const { + return GetTag(tag, (uint32_t&)destination); +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, float& destination) const + \brief Retrieves the floating-point value associated with a BAM tag. + + \param tag 2-character tag name + \param destination destination for retrieved value + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, float& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if tag found + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // determine data byte-length + const char type = *(pTagData - 1); + int destinationLength = 0; + switch (type) { + + // 1 byte data + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + destinationLength = 1; + break; + + // 2 byte data + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + destinationLength = 2; + break; + + // 4 byte data + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + destinationLength = 4; + break; + + // unsupported type (var-length strings) + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: cannot store tag of type " << type + << " in float destination" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown tag type encountered: " + << type << endl; + return false; + } + + // store 
in destination + destination = 0.0; + memcpy(&destination, pTagData, destinationLength); + return true; + } + + // tag not found, return failure + return false; +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<uint32_t>& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector<uint32_t>& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for integer destination (float or var-length data) + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case 
(Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in integer value" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + uint32_t value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(uint32_t)); + pTagData += sizeof(uint32_t); + destination.push_back(value); + } + + // return success + return false; +} + +/*! \fn bool BamAlignment::GetTag(const std::string& tag, std::vector<int32_t>& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector<int32_t>& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case 
(Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for integer destination (float or var-length data) + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in integer value" << endl; + return false; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + int32_t value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(int32_t)); + pTagData += sizeof(int32_t); + destination.push_back(value); + } + + // return success + return false; + +} + +/*! 
\fn bool BamAlignment::GetTag(const std::string& tag, std::vector<float>& destination) const + \brief Retrieves the numeric array data associated with a BAM tag + + \param tag 2-character tag name + \param destination destination for retrieved data + + \return \c true if found +*/ +bool BamAlignment::GetTag(const std::string& tag, std::vector<float>& destination) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // return false if tag not found + if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) + return false; + + // check that tag is array type + const char tagType = *(pTagData - 1); + if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) { + cerr << "BamAlignment ERROR: Cannot store non-array data from tag: " + << tag << " in array destination" << endl; + return false; + } + + // calculate length of each element in tag's array + const char elementType = *pTagData; + ++pTagData; + int elementLength = 0; + switch ( elementType ) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + elementLength = sizeof(uint8_t); + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + elementLength = sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + elementLength = sizeof(uint32_t); + break; + + // unsupported type for float destination (var-length data) + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + cerr << "BamAlignment ERROR: array element type: " << elementType + << " cannot be stored in float value" << endl; + return false; + + // unknown tag type + default: + 
cerr << "BamAlignment ERROR: unknown element type encountered: " + << elementType << endl; + return false; + } + + // get number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(int32_t)); + pTagData += 4; + destination.clear(); + destination.reserve(numElements); + + // read in elements + float value; + for ( int i = 0 ; i < numElements; ++i ) { + memcpy(&value, pTagData, sizeof(float)); + pTagData += sizeof(float); + destination.push_back(value); + } + + // return success + return false; +} + +/*! \fn bool BamAlignment::GetTagType(const std::string& tag, char& type) const + \brief Retrieves the BAM tag type-code associated with requested tag name. + + \param tag 2-character tag name + \param type destination for the retrieved (1-character) tag type + + \return \c true if found + \sa \samSpecURL for more details on reserved tag names, supported tag types, etc. +*/ +bool BamAlignment::GetTagType(const std::string& tag, char& type) const { + + // make sure tag data exists + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // lookup tag + if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) { + + // retrieve tag type code + type = *(pTagData - 1); + + // validate that type is a proper BAM tag type + switch (type) { + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + case (Constants::BAM_TAG_TYPE_ARRAY) : + return true; + + // unknown tag type + default: + cerr << "BamAlignment ERROR: unknown tag type 
encountered: " + << type << endl; + return false; + } + } + + // tag not found, return failure + return false; +} + +/*! \fn bool BamAlignment::HasTag(const std::string& tag) const + \brief Returns true if alignment has a record for requested tag. + \param tag 2-character tag name + \return \c true if alignment has a record for tag +*/ +bool BamAlignment::HasTag(const std::string& tag) const { + + // return false if no tag data present + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data for lookup + char* pTagData = (char*)TagData.data(); + const unsigned int tagDataLength = TagData.size(); + unsigned int numBytesParsed = 0; + + // if result of tag lookup + return FindTag(tag, pTagData, tagDataLength, numBytesParsed); +} + +/*! \fn bool BamAlignment::IsDuplicate(void) const + \return \c true if this read is a PCR duplicate +*/ +bool BamAlignment::IsDuplicate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_DUPLICATE) != 0 ); +} + +/*! \fn bool BamAlignment::IsFailedQC(void) const + \return \c true if this read failed quality control +*/ +bool BamAlignment::IsFailedQC(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_QC_FAILED) != 0 ); +} + +/*! \fn bool BamAlignment::IsFirstMate(void) const + \return \c true if alignment is first mate on paired-end read +*/ +bool BamAlignment::IsFirstMate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_1) != 0 ); +} + +/*! \fn bool BamAlignment::IsMapped(void) const + \return \c true if alignment is mapped +*/ +bool BamAlignment::IsMapped(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_UNMAPPED) == 0 ); +} + +/*! \fn bool BamAlignment::IsMateMapped(void) const + \return \c true if alignment's mate is mapped +*/ +bool BamAlignment::IsMateMapped(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_UNMAPPED) == 0 ); +} + +/*! 
\fn bool BamAlignment::IsMateReverseStrand(void) const + \return \c true if alignment's mate mapped to reverse strand +*/ +bool BamAlignment::IsMateReverseStrand(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND) != 0 ); +} + +/*! \fn bool BamAlignment::IsPaired(void) const + \return \c true if alignment part of paired-end read +*/ +bool BamAlignment::IsPaired(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PAIRED) != 0 ); +} + +/*! \fn bool BamAlignment::IsPrimaryAlignment(void) const + \return \c true if reported position is primary alignment +*/ +bool BamAlignment::IsPrimaryAlignment(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_SECONDARY) == 0 ); +} + +/*! \fn bool BamAlignment::IsProperPair(void) const + \return \c true if alignment is part of read that satisfied paired-end resolution +*/ +bool BamAlignment::IsProperPair(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_PROPER_PAIR) != 0 ); +} + +/*! \fn bool BamAlignment::IsReverseStrand(void) const + \return \c true if alignment mapped to reverse strand +*/ +bool BamAlignment::IsReverseStrand(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_REVERSE_STRAND) != 0 ); +} + +/*! \fn bool BamAlignment::IsSecondMate(void) const + \return \c true if alignment is second mate on read +*/ +bool BamAlignment::IsSecondMate(void) const { + return ( (AlignmentFlag & Constants::BAM_ALIGNMENT_READ_2) != 0 ); +} + +/*! \fn bool BamAlignment::IsValidSize(const string& tag, const string& type) const + \internal + + Checks that tag name & type strings are expected sizes. 
+ \a tag should have length + \a type should have length 1 + + \param tag BAM tag name + \param type BAM tag type-code + + \return \c true if both \a tag and \a type are correct sizes +*/ +bool BamAlignment::IsValidSize(const string& tag, const string& type) const { + return (tag.size() == Constants::BAM_TAG_TAGSIZE) && + (type.size() == Constants::BAM_TAG_TYPESIZE); +} + +/*! \fn bool BamAlignment::RemoveTag(const std::string& tag) + \brief Removes field from BAM tags. + + \return \c true if tag was removed successfully (or didn't exist before) +*/ +bool BamAlignment::RemoveTag(const std::string& tag) { + + // skip if no tag data available + if ( SupportData.HasCoreOnly || TagData.empty() ) + return false; + + // localize the tag data + char* pOriginalTagData = (char*)TagData.data(); + char* pTagData = pOriginalTagData; + const unsigned int originalTagDataLength = TagData.size(); + unsigned int newTagDataLength = 0; + unsigned int numBytesParsed = 0; + + // if tag found + if ( FindTag(tag, pTagData, originalTagDataLength, numBytesParsed) ) { + + char newTagData[originalTagDataLength]; + + // copy original tag data up til desired tag + pTagData -= 3; + numBytesParsed -= 3; + const unsigned int beginningTagDataLength = numBytesParsed; + newTagDataLength += beginningTagDataLength; + memcpy(newTagData, pOriginalTagData, numBytesParsed); + + // skip to next tag (if tag for removal is last, return true) + const char* pTagStorageType = pTagData + 2; + pTagData += 3; + numBytesParsed += 3; + if ( !SkipToNextTag(*pTagStorageType, pTagData, numBytesParsed) ) + return true; + + // copy everything from current tag (the next one after tag for removal) to end + const unsigned int skippedDataLength = (numBytesParsed - beginningTagDataLength); + const unsigned int endTagDataLength = originalTagDataLength - beginningTagDataLength - skippedDataLength; + memcpy(newTagData + beginningTagDataLength, pTagData, endTagDataLength ); + + // save new tag data + TagData.assign(newTagData, 
beginningTagDataLength + endTagDataLength); + return true; + } + + // tag not found, no removal - return failure + return false; +} + +/*! \fn void BamAlignment::SetIsDuplicate(bool ok) + \brief Sets value of "PCR duplicate" flag to \a ok. +*/ +void BamAlignment::SetIsDuplicate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_DUPLICATE; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_DUPLICATE; +} + +/*! \fn void BamAlignment::SetIsFailedQC(bool ok) + \brief Sets "failed quality control" flag to \a ok. +*/ +void BamAlignment::SetIsFailedQC(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_QC_FAILED; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_QC_FAILED; +} + +/*! \fn void BamAlignment::SetIsFirstMate(bool ok) + \brief Sets "alignment is first mate" flag to \a ok. +*/ +void BamAlignment::SetIsFirstMate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_1; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_1; +} + +/*! \fn void BamAlignment::SetIsMapped(bool ok) + \brief Sets "alignment is mapped" flag to \a ok. +*/ +void BamAlignment::SetIsMapped(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_UNMAPPED; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_UNMAPPED; +} + +/*! \fn void BamAlignment::SetIsMateMapped(bool ok) + \brief Sets "alignment's mate is mapped" flag to \a ok. +*/ +void BamAlignment::SetIsMateMapped(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_UNMAPPED; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_UNMAPPED; +} + +/*! \fn void BamAlignment::SetIsMateUnmapped(bool ok) + \brief Complement of using SetIsMateMapped(). + \deprecated For sake of symmetry with the query methods + \sa IsMateMapped(), SetIsMateMapped() +*/ +void BamAlignment::SetIsMateUnmapped(bool ok) { + SetIsMateMapped(!ok); +} + +/*! \fn void BamAlignment::SetIsMateReverseStrand(bool ok) + \brief Sets "alignment's mate mapped to reverse strand" flag to \a ok. 
+*/ +void BamAlignment::SetIsMateReverseStrand(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_MATE_REVERSE_STRAND; +} + +/*! \fn void BamAlignment::SetIsPaired(bool ok) + \brief Sets "alignment part of paired-end read" flag to \a ok. +*/ +void BamAlignment::SetIsPaired(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PAIRED; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PAIRED; +} + +/*! \fn void BamAlignment::SetIsPrimaryAlignment(bool ok) + \brief Sets "position is primary alignment" flag to \a ok. +*/ +void BamAlignment::SetIsPrimaryAlignment(bool ok) { + if (ok) AlignmentFlag &= ~Constants::BAM_ALIGNMENT_SECONDARY; + else AlignmentFlag |= Constants::BAM_ALIGNMENT_SECONDARY; +} + +/*! \fn void BamAlignment::SetIsProperPair(bool ok) + \brief Sets "alignment is part of read that satisfied paired-end resolution" flag to \a ok. +*/ +void BamAlignment::SetIsProperPair(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_PROPER_PAIR; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_PROPER_PAIR; +} + +/*! \fn void BamAlignment::SetIsReverseStrand(bool ok) + \brief Sets "alignment mapped to reverse strand" flag to \a ok. +*/ +void BamAlignment::SetIsReverseStrand(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_REVERSE_STRAND; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_REVERSE_STRAND; +} + +/*! \fn void BamAlignment::SetIsSecondaryAlignment(bool ok) + \brief Complement of using SetIsPrimaryAlignment(). + \deprecated For sake of symmetry with the query methods + \sa IsPrimaryAlignment(), SetIsPrimaryAlignment() +*/ +void BamAlignment::SetIsSecondaryAlignment(bool ok) { + SetIsPrimaryAlignment(!ok); +} + +/*! \fn void BamAlignment::SetIsSecondMate(bool ok) + \brief Sets "alignment is second mate on read" flag to \a ok. 
+*/ +void BamAlignment::SetIsSecondMate(bool ok) { + if (ok) AlignmentFlag |= Constants::BAM_ALIGNMENT_READ_2; + else AlignmentFlag &= ~Constants::BAM_ALIGNMENT_READ_2; +} + +/*! \fn void BamAlignment::SetIsUnmapped(bool ok) + \brief Complement of using SetIsMapped(). + \deprecated For sake of symmetry with the query methods + \sa IsMapped(), SetIsMapped() +*/ +void BamAlignment::SetIsUnmapped(bool ok) { + SetIsMapped(!ok); +} + +/*! \fn bool BamAlignment::SkipToNextTag(const char storageType, char*& pTagData, unsigned int& numBytesParsed) + \internal + + Moves to next available tag in tag data string + + \param storageType BAM tag type-code that determines how far to move cursor + \param pTagData pointer to current position (cursor) in tag string + \param numBytesParsed report of how many bytes were parsed (cumulatively) + + \return \c if storageType was a recognized BAM tag type + \post \a pTagData will point to the byte where the next tag data begins. + \a numBytesParsed will correspond to the cursor's position in the full TagData string. 
+*/ +bool BamAlignment::SkipToNextTag(const char storageType, + char*& pTagData, + unsigned int& numBytesParsed) const +{ + switch (storageType) { + + case (Constants::BAM_TAG_TYPE_ASCII) : + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + numBytesParsed += sizeof(uint16_t); + pTagData += sizeof(uint16_t); + break; + + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + break; + + case (Constants::BAM_TAG_TYPE_STRING) : + case (Constants::BAM_TAG_TYPE_HEX) : + while( *pTagData ) { + ++numBytesParsed; + ++pTagData; + } + // increment for null-terminator + ++numBytesParsed; + ++pTagData; + break; + + case (Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = *pTagData; + ++numBytesParsed; + ++pTagData; + + // read number of elements + int32_t numElements; + memcpy(&numElements, pTagData, sizeof(uint32_t)); // already endian-swapped if necessary + numBytesParsed += sizeof(uint32_t); + pTagData += sizeof(uint32_t); + + // calculate number of bytes to skip + int bytesToSkip = 0; + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + bytesToSkip = numElements; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + bytesToSkip = numElements*sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + bytesToSkip = numElements*sizeof(uint32_t); + break; + default: + cerr << "BamAlignment ERROR: unknown binary array type encountered: " + << arrayType << endl; + return false; + } + + // skip binary array contents + numBytesParsed += bytesToSkip; + pTagData += 
bytesToSkip; + break; + } + + default: + cerr << "BamAlignment ERROR: unknown tag type encountered" + << storageType << endl; + return false; + } + + // return success + return true; +} diff --git a/src/utils/BamTools/src/api/BamAlignment.h b/src/utils/BamTools/src/api/BamAlignment.h new file mode 100644 index 0000000000000000000000000000000000000000..7535d93f8f7a8fcd1062348b7d406a4082fa0abe --- /dev/null +++ b/src/utils/BamTools/src/api/BamAlignment.h @@ -0,0 +1,207 @@ +// *************************************************************************** +// BamAlignment.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 22 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the BamAlignment data structure +// *************************************************************************** + +#ifndef BAMALIGNMENT_H +#define BAMALIGNMENT_H + +#include <api/api_global.h> +#include <api/BamAux.h> +#include <string> +#include <vector> + +namespace BamTools { + +// forward declaration of BamAlignment's friend classes +namespace Internal { + class BamReaderPrivate; + class BamWriterPrivate; +} // namespace Internal + +// BamAlignment data structure +struct API_EXPORT BamAlignment { + + // constructors & destructor + public: + BamAlignment(void); + BamAlignment(const BamAlignment& other); + ~BamAlignment(void); + + // queries against alignment flags + public: + bool IsDuplicate(void) const; // returns true if this read is a PCR duplicate + bool IsFailedQC(void) const; // returns true if this read failed quality control + bool IsFirstMate(void) const; // returns true if alignment is first mate on read + bool IsMapped(void) const; // returns true if alignment is mapped + bool IsMateMapped(void) const; // returns true if alignment's mate is mapped + bool IsMateReverseStrand(void) 
const; // returns true if alignment's mate mapped to reverse strand + bool IsPaired(void) const; // returns true if alignment part of paired-end read + bool IsPrimaryAlignment(void) const; // returns true if reported position is primary alignment + bool IsProperPair(void) const; // returns true if alignment is part of read that satisfied paired-end resolution + bool IsReverseStrand(void) const; // returns true if alignment mapped to reverse strand + bool IsSecondMate(void) const; // returns true if alignment is second mate on read + + // manipulate alignment flags + public: + void SetIsDuplicate(bool ok); // sets value of "PCR duplicate" flag + void SetIsFailedQC(bool ok); // sets value of "failed quality control" flag + void SetIsFirstMate(bool ok); // sets value of "alignment is first mate" flag + void SetIsMapped(bool ok); // sets value of "alignment is mapped" flag + void SetIsMateMapped(bool ok); // sets value of "alignment's mate is mapped" flag + void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag + void SetIsPaired(bool ok); // sets value of "alignment part of paired-end read" flag + void SetIsPrimaryAlignment(bool ok); // sets value of "position is primary alignment" flag + void SetIsProperPair(bool ok); // sets value of "alignment is part of read that satisfied paired-end resolution" flag + void SetIsReverseStrand(bool ok); // sets value of "alignment mapped to reverse strand" flag + void SetIsSecondMate(bool ok); // sets value of "alignment is second mate on read" flag + + // legacy methods (consider deprecated, but still available) + void SetIsMateUnmapped(bool ok); // complement of using SetIsMateMapped() + void SetIsSecondaryAlignment(bool ok); // complement of using SetIsPrimaryAlignment() + void SetIsUnmapped(bool ok); // complement of using SetIsMapped() + + // tag data access methods + public: + + // ------------------------------------------------------------------------------------- + // N.B. 
- The following tag access methods may not be used on BamAlignments fetched + // using BamReader::GetNextAlignmentCore(). Attempting to use them will not result in + // error message (to keep output clean) but will ALWAYS return false. Only user-created + // BamAlignments or those retrieved using BamReader::GetNextAlignment() are valid here. + // + // You can call BuildCharData() on such an alignment retrieved by GetNextAlignmentCore(). + // This populates all the character data, and will enable subsequent queries on tag data. + // ------------------------------------------------------------------------------------- + + // adds a tag + bool AddTag(const std::string& tag, const std::string& type, const std::string& value); + bool AddTag(const std::string& tag, const std::string& type, const uint32_t& value); + bool AddTag(const std::string& tag, const std::string& type, const int32_t& value); + bool AddTag(const std::string& tag, const std::string& type, const float& value); + + // adds a "binary array" tag + bool AddTag(const std::string& tag, const std::vector<uint8_t>& values); + bool AddTag(const std::string& tag, const std::vector<int8_t>& values); + bool AddTag(const std::string& tag, const std::vector<uint16_t>& values); + bool AddTag(const std::string& tag, const std::vector<int16_t>& values); + bool AddTag(const std::string& tag, const std::vector<uint32_t>& values); + bool AddTag(const std::string& tag, const std::vector<int32_t>& values); + bool AddTag(const std::string& tag, const std::vector<float>& values); + + // edits a tag + bool EditTag(const std::string& tag, const std::string& type, const std::string& value); + bool EditTag(const std::string& tag, const std::string& type, const uint32_t& value); + bool EditTag(const std::string& tag, const std::string& type, const int32_t& value); + bool EditTag(const std::string& tag, const std::string& type, const float& value); + + // edits a "binary array" tag + bool EditTag(const std::string& tag, const 
std::vector<uint8_t>& values); + bool EditTag(const std::string& tag, const std::vector<int8_t>& values); + bool EditTag(const std::string& tag, const std::vector<uint16_t>& values); + bool EditTag(const std::string& tag, const std::vector<int16_t>& values); + bool EditTag(const std::string& tag, const std::vector<uint32_t>& values); + bool EditTag(const std::string& tag, const std::vector<int32_t>& values); + bool EditTag(const std::string& tag, const std::vector<float>& values); + + // retrieves data for a tag + bool GetTag(const std::string& tag, std::string& destination) const; + bool GetTag(const std::string& tag, uint32_t& destination) const; + bool GetTag(const std::string& tag, int32_t& destination) const; + bool GetTag(const std::string& tag, float& destination) const; + + // retrieves data for a "binary array" tag + bool GetTag(const std::string& tag, std::vector<uint32_t>& destination) const; + bool GetTag(const std::string& tag, std::vector<int32_t>& destination) const; + bool GetTag(const std::string& tag, std::vector<float>& destination) const; + + // retrieves the BAM tag-type character for a tag + bool GetTagType(const std::string& tag, char& type) const; + + // legacy methods (consider deprecated, but still available) + bool GetEditDistance(uint32_t& editDistance) const; // retrieves value of "NM" tag + bool GetReadGroup(std::string& readGroup) const; // retrieves value of "RG" tag + + // returns true if alignment has a record for this tag name + bool HasTag(const std::string& tag) const; + + // removes a tag + bool RemoveTag(const std::string& tag); + + // additional methods + public: + // populates alignment string fields + bool BuildCharData(void); + // calculates alignment end position + int GetEndPosition(bool usePadded = false, bool zeroBased = true) const; + + // public data fields + public: + std::string Name; // read name + int32_t Length; // length of query sequence + std::string QueryBases; // 'original' sequence (as reported from 
sequencing machine) + std::string AlignedBases; // 'aligned' sequence (includes any indels, padding, clipping) + std::string Qualities; // FASTQ qualities (ASCII characters, not numeric values) + std::string TagData; // tag data (use provided methods to query/modify) + int32_t RefID; // ID number for reference sequence + int32_t Position; // position (0-based) where alignment starts + uint16_t Bin; // BAM (standard) index bin number for this alignment + uint16_t MapQuality; // mapping quality score + uint32_t AlignmentFlag; // alignment bit-flag (use provided methods to query/modify) + std::vector<CigarOp> CigarData; // CIGAR operations for this alignment + int32_t MateRefID; // ID number for reference sequence where alignment's mate was aligned + int32_t MatePosition; // position (0-based) where alignment's mate starts + int32_t InsertSize; // mate-pair insert size + std::string Filename; // name of BAM file which this alignment comes from + + //! \cond + // internal utility methods + private: + bool FindTag(const std::string& tag, + char*& pTagData, + const unsigned int& tagDataLength, + unsigned int& numBytesParsed) const; + bool IsValidSize(const std::string& tag, + const std::string& type) const; + bool SkipToNextTag(const char storageType, + char*& pTagData, + unsigned int& numBytesParsed) const; + + // internal data + private: + + struct BamAlignmentSupportData { + + // data members + std::string AllCharData; + uint32_t BlockLength; + uint32_t NumCigarOperations; + uint32_t QueryNameLength; + uint32_t QuerySequenceLength; + bool HasCoreOnly; + + // constructor + BamAlignmentSupportData(void) + : BlockLength(0) + , NumCigarOperations(0) + , QueryNameLength(0) + , QuerySequenceLength(0) + , HasCoreOnly(false) + { } + }; + BamAlignmentSupportData SupportData; + friend class Internal::BamReaderPrivate; + friend class Internal::BamWriterPrivate; + //! 
\endcond +}; + +typedef std::vector<BamAlignment> BamAlignmentVector; + +} // namespace BamTools + +#endif // BAMALIGNMENT_H diff --git a/src/utils/BamTools/src/api/BamAlignment.o b/src/utils/BamTools/src/api/BamAlignment.o new file mode 100644 index 0000000000000000000000000000000000000000..127fd8ca969897a70652833964672c938b36640a Binary files /dev/null and b/src/utils/BamTools/src/api/BamAlignment.o differ diff --git a/src/utils/BamTools/src/api/BamAux.h b/src/utils/BamTools/src/api/BamAux.h new file mode 100644 index 0000000000000000000000000000000000000000..d171e7069db998da82ac346689cea68fee068611 --- /dev/null +++ b/src/utils/BamTools/src/api/BamAux.h @@ -0,0 +1,457 @@ +// *************************************************************************** +// BamAux.h (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 4 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides data structures & utility methods that are used throughout the API. +// *************************************************************************** + +#ifndef BAMAUX_H +#define BAMAUX_H + +#include <api/api_global.h> +#include <fstream> +#include <iostream> +#include <string> +#include <vector> + +/*! \file BamAux.h + + Provides data structures & utility methods that are used throughout the API. +*/ +/*! \namespace BamTools + \brief Contains all BamTools classes & methods. + + The BamTools API contained in this namespace contains classes and methods + for reading, writing, and manipulating BAM alignment files. +*/ +namespace BamTools { + +// ---------------------------------------------------------------- +// CigarOp + +/*! \struct BamTools::CigarOp + \brief Represents a CIGAR alignment operation. 
+ + \sa http://samtools.sourceforge.net/SAM-1.3.pdf for more details on using CIGAR operations. +*/ +struct API_EXPORT CigarOp { + + char Type; //!< CIGAR operation type (MIDNSHP) + uint32_t Length; //!< CIGAR operation length (number of bases) + + //! constructor + CigarOp(const char type = '\0', + const uint32_t& length = 0) + : Type(type) + , Length(length) + { } +}; + +// ---------------------------------------------------------------- +// RefData + +/*! \struct BamTools::RefData + \brief Represents a reference sequence entry +*/ +struct API_EXPORT RefData { + + std::string RefName; //!< name of reference sequence + int32_t RefLength; //!< length of reference sequence + + //! constructor + RefData(const std::string& name = "", + const int32_t& length = 0) + : RefName(name) + , RefLength(length) + { } +}; + +//! convenience typedef for vector of RefData entries +typedef std::vector<RefData> RefVector; + +// ---------------------------------------------------------------- +// BamRegion + +/*! \struct BamTools::BamRegion + \brief Represents a sequential genomic region + + Allowed to span multiple (sequential) references. +*/ +struct API_EXPORT BamRegion { + + int LeftRefID; //!< reference ID for region's left boundary + int LeftPosition; //!< position for region's left boundary + int RightRefID; //!< reference ID for region's right boundary + int RightPosition; //!< position for region's right boundary + + //! constructor + BamRegion(const int& leftID = -1, + const int& leftPos = -1, + const int& rightID = -1, + const int& rightPos = -1) + : LeftRefID(leftID) + , LeftPosition(leftPos) + , RightRefID(rightID) + , RightPosition(rightPos) + { } + + //! copy constructor + BamRegion(const BamRegion& other) + : LeftRefID(other.LeftRefID) + , LeftPosition(other.LeftPosition) + , RightRefID(other.RightRefID) + , RightPosition(other.RightPosition) + { } + + //! 
Clears region boundaries + void clear(void) { + LeftRefID = -1; LeftPosition = -1; + RightRefID = -1; RightPosition = -1; + } + + //! Returns true if region has a left boundary + bool isLeftBoundSpecified(void) const { + return ( LeftRefID >= 0 && LeftPosition >= 0 ); + } + + //! Returns true if region boundaries are not defined + bool isNull(void) const { + return ( !isLeftBoundSpecified() && !isRightBoundSpecified() ); + } + + //! Returns true if region has a right boundary + bool isRightBoundSpecified(void) const { + return ( RightRefID >= 0 && RightPosition >= 0 ); + } +}; + +// ---------------------------------------------------------------- +// General utility methods + +/*! \fn bool FileExists(const std::string& filename) + \brief checks if file exists + + Attempts to open file in a read-only mode. + + \return \c true if file can be opened successfully +*/ +API_EXPORT inline bool FileExists(const std::string& filename) { + std::ifstream f(filename.c_str(), std::ifstream::in); + return !f.fail(); +} + +/*! \fn void SwapEndian_16(int16_t& x) + \brief swaps endianness of signed 16-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_16(int16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +/*! \fn void SwapEndian_16(uint16_t& x) + \brief swaps endianness of unsigned 16-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_16(uint16_t& x) { + x = ((x >> 8) | (x << 8)); +} + +/*! \fn void SwapEndian_32(int32_t& x) + \brief swaps endianness of signed 32-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_32(int32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +/*! \fn void SwapEndian_32(uint32_t& x) + \brief swaps endianness of unsigned 32-bit integer, in place + + Swaps endian representation of value in \a x. 
+*/ +API_EXPORT inline void SwapEndian_32(uint32_t& x) { + x = ( (x >> 24) | + ((x << 8) & 0x00FF0000) | + ((x >> 8) & 0x0000FF00) | + (x << 24) + ); +} + +/*! \fn void SwapEndian_64(int64_t& x) + \brief swaps endianness of signed 64-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_64(int64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +/*! \fn void SwapEndian_64(uint64_t& x) + \brief swaps endianness of unsigned 64-bit integer, in place + + Swaps endian representation of value in \a x. +*/ +API_EXPORT inline void SwapEndian_64(uint64_t& x) { + x = ( (x >> 56) | + ((x << 40) & 0x00FF000000000000ll) | + ((x << 24) & 0x0000FF0000000000ll) | + ((x << 8) & 0x000000FF00000000ll) | + ((x >> 8) & 0x00000000FF000000ll) | + ((x >> 24) & 0x0000000000FF0000ll) | + ((x >> 40) & 0x000000000000FF00ll) | + (x << 56) + ); +} + +/*! \fn void SwapEndian_16p(char* data) + \brief swaps endianness of the next 2 bytes in a buffer, in place + + Swaps endian representation the next 2 bytes in \a data. +*/ +API_EXPORT inline void SwapEndian_16p(char* data) { + uint16_t& value = (uint16_t&)*data; + SwapEndian_16(value); +} + +/*! \fn void SwapEndian_32p(char* data) + \brief swaps endianness of the next 4 bytes in a buffer, in place + + Swaps endian representation the next 4 bytes in \a data. +*/ +API_EXPORT inline void SwapEndian_32p(char* data) { + uint32_t& value = (uint32_t&)*data; + SwapEndian_32(value); +} + +/*! \fn void SwapEndian_64p(char* data) + \brief swaps endianness of the next 8 bytes in a buffer, in place + + Swaps endian representation the next 8 bytes in \a data. 
+*/ +API_EXPORT inline void SwapEndian_64p(char* data) { + uint64_t& value = (uint64_t&)*data; + SwapEndian_64(value); +} + +/*! \fn bool SystemIsBigEndian(void) + \brief checks host architecture's byte order + \return \c true if system uses big-endian ordering +*/ +API_EXPORT inline bool SystemIsBigEndian(void) { + const uint16_t one = 0x0001; + return ((*(char*) &one) == 0 ); +} + +/*! \fn void PackUnsignedInt(char* buffer, unsigned int value) + \brief stores unsigned integer value in a byte buffer + + \param buffer destination buffer + \param value unsigned integer to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedInt(char* buffer, unsigned int value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); + buffer[2] = (char)(value >> 16); + buffer[3] = (char)(value >> 24); +} + +/*! \fn void PackUnsignedShort(char* buffer, unsigned short value) + \brief stores unsigned short integer value in a byte buffer + + \param buffer destination buffer + \param value unsigned short integer to 'pack' in buffer +*/ +API_EXPORT inline void PackUnsignedShort(char* buffer, unsigned short value) { + buffer[0] = (char)value; + buffer[1] = (char)(value >> 8); +} + +/*! \fn double UnpackDouble(const char* buffer) + \brief reads a double value from byte buffer + + \param buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(const char* buffer) { + union { double value; unsigned char valueBuffer[sizeof(double)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + un.valueBuffer[4] = buffer[4]; + un.valueBuffer[5] = buffer[5]; + un.valueBuffer[6] = buffer[6]; + un.valueBuffer[7] = buffer[7]; + return un.value; +} + +/*! \fn double UnpackDouble(char* buffer) + \brief reads a double value from byte buffer + + This is an overloaded function. 
+ + \param buffer source byte buffer + \return the (double) value read from the buffer +*/ +API_EXPORT inline double UnpackDouble(char* buffer) { + return UnpackDouble( (const char*)buffer ); +} + +/*! \fn double UnpackFloat(const char* buffer) + \brief reads a float value from byte buffer + + \param buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(const char* buffer) { + union { float value; unsigned char valueBuffer[sizeof(float)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn double UnpackFloat(char* buffer) + \brief reads a float value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (float) value read from the buffer +*/ +API_EXPORT inline float UnpackFloat(char* buffer) { + return UnpackFloat( (const char*)buffer ); +} + +/*! \fn signed int UnpackSignedInt(const char* buffer) + \brief reads a signed integer value from byte buffer + + \param buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(const char* buffer) { + union { signed int value; unsigned char valueBuffer[sizeof(signed int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn signed int UnpackSignedInt(char* buffer) + \brief reads a signed integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (signed int) value read from the buffer +*/ +API_EXPORT inline signed int UnpackSignedInt(char* buffer) { + return UnpackSignedInt( (const char*) buffer ); +} + +/*! 
\fn signed short UnpackSignedShort(const char* buffer) + \brief reads a signed short integer value from byte buffer + + \param buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(const char* buffer) { + union { signed short value; unsigned char valueBuffer[sizeof(signed short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +/*! \fn signed short UnpackSignedShort(char* buffer) + \brief reads a signed short integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (signed short) value read from the buffer +*/ +API_EXPORT inline signed short UnpackSignedShort(char* buffer) { + return UnpackSignedShort( (const char*)buffer ); +} + +/*! \fn unsigned int UnpackUnsignedInt(const char* buffer) + \brief reads an unsigned integer value from byte buffer + + \param buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(const char* buffer) { + union { unsigned int value; unsigned char valueBuffer[sizeof(unsigned int)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + un.valueBuffer[2] = buffer[2]; + un.valueBuffer[3] = buffer[3]; + return un.value; +} + +/*! \fn unsigned int UnpackUnsignedInt(char* buffer) + \brief reads an unsigned integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (unsigned int) value read from the buffer +*/ +API_EXPORT inline unsigned int UnpackUnsignedInt(char* buffer) { + return UnpackUnsignedInt( (const char*)buffer ); +} + +/*! 
\fn unsigned short UnpackUnsignedShort(const char* buffer) + \brief reads an unsigned short integer value from byte buffer + + \param buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(const char* buffer) { + union { unsigned short value; unsigned char valueBuffer[sizeof(unsigned short)]; } un; + un.value = 0; + un.valueBuffer[0] = buffer[0]; + un.valueBuffer[1] = buffer[1]; + return un.value; +} + +/*! \fn unsigned short UnpackUnsignedShort(char* buffer) + \brief reads an unsigned short integer value from byte buffer + + This is an overloaded function. + + \param buffer source byte buffer + \return the (unsigned short) value read from the buffer +*/ +API_EXPORT inline unsigned short UnpackUnsignedShort(char* buffer) { + return UnpackUnsignedShort( (const char*)buffer ); +} + +} // namespace BamTools + +#endif // BAMAUX_H diff --git a/src/utils/BamTools/src/api/BamConstants.h b/src/utils/BamTools/src/api/BamConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..e433c8e79df37d318a6f4a9c7b94b060494b9a92 --- /dev/null +++ b/src/utils/BamTools/src/api/BamConstants.h @@ -0,0 +1,128 @@ +// *************************************************************************** +// BamConstants.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic constants for handling BAM files. +// *************************************************************************** + +#ifndef BAM_CONSTANTS_H +#define BAM_CONSTANTS_H + +#include <string> + +/*! \namespace BamTools::Constants + \brief Provides basic constants for handling BAM files. 
+*/ + +namespace BamTools { +namespace Constants { + +const int BAM_SIZEOF_INT = 4; + +// header magic number +const char* const BAM_HEADER_MAGIC = "BAM\1"; +const unsigned int BAM_HEADER_MAGIC_LENGTH = 4; + +// BAM alignment core size +const int BAM_CORE_SIZE = 32; +const int BAM_CORE_BUFFER_SIZE = 8; + +// BAM alignment flags +const int BAM_ALIGNMENT_PAIRED = 0x0001; +const int BAM_ALIGNMENT_PROPER_PAIR = 0x0002; +const int BAM_ALIGNMENT_UNMAPPED = 0x0004; +const int BAM_ALIGNMENT_MATE_UNMAPPED = 0x0008; +const int BAM_ALIGNMENT_REVERSE_STRAND = 0x0010; +const int BAM_ALIGNMENT_MATE_REVERSE_STRAND = 0x0020; +const int BAM_ALIGNMENT_READ_1 = 0x0040; +const int BAM_ALIGNMENT_READ_2 = 0x0080; +const int BAM_ALIGNMENT_SECONDARY = 0x0100; +const int BAM_ALIGNMENT_QC_FAILED = 0x0200; +const int BAM_ALIGNMENT_DUPLICATE = 0x0400; + +// CIGAR constants +const char* const BAM_CIGAR_LOOKUP = "MIDNSHP=X"; +const int BAM_CIGAR_MATCH = 0; +const int BAM_CIGAR_INS = 1; +const int BAM_CIGAR_DEL = 2; +const int BAM_CIGAR_REFSKIP = 3; +const int BAM_CIGAR_SOFTCLIP = 4; +const int BAM_CIGAR_HARDCLIP = 5; +const int BAM_CIGAR_PAD = 6; +const int BAM_CIGAR_SEQMATCH = 7; +const int BAM_CIGAR_MISMATCH = 8; + +const char BAM_CIGAR_MATCH_CHAR = 'M'; +const char BAM_CIGAR_INS_CHAR = 'I'; +const char BAM_CIGAR_DEL_CHAR = 'D'; +const char BAM_CIGAR_REFSKIP_CHAR = 'N'; +const char BAM_CIGAR_SOFTCLIP_CHAR = 'S'; +const char BAM_CIGAR_HARDCLIP_CHAR = 'H'; +const char BAM_CIGAR_PAD_CHAR = 'P'; +const char BAM_CIGAR_SEQMATCH_CHAR = '='; +const char BAM_CIGAR_MISMATCH_CHAR = 'X'; + +const int BAM_CIGAR_SHIFT = 4; +const int BAM_CIGAR_MASK = ((1 << BAM_CIGAR_SHIFT) - 1); + +// BAM tag types +const char BAM_TAG_TYPE_ASCII = 'A'; +const char BAM_TAG_TYPE_UINT8 = 'c'; +const char BAM_TAG_TYPE_INT8 = 'C'; +const char BAM_TAG_TYPE_UINT16 = 's'; +const char BAM_TAG_TYPE_INT16 = 'S'; +const char BAM_TAG_TYPE_UINT32 = 'i'; +const char BAM_TAG_TYPE_INT32 = 'I'; +const char BAM_TAG_TYPE_FLOAT = 'f'; +const 
char BAM_TAG_TYPE_STRING = 'Z'; +const char BAM_TAG_TYPE_HEX = 'H'; +const char BAM_TAG_TYPE_ARRAY = 'B'; + +const size_t BAM_TAG_TAGSIZE = 2; +const size_t BAM_TAG_TYPESIZE = 1; +const int BAM_TAG_ARRAYBASE_SIZE = 8; + +// DNA bases +const char* const BAM_DNA_LOOKUP = "=ACMGRSVTWYHKDBN"; +const unsigned char BAM_BASECODE_EQUAL = 0; +const unsigned char BAM_BASECODE_A = 1; +const unsigned char BAM_BASECODE_C = 2; +const unsigned char BAM_BASECODE_G = 4; +const unsigned char BAM_BASECODE_T = 8; +const unsigned char BAM_BASECODE_N = 15; + +const char BAM_DNA_EQUAL = '='; +const char BAM_DNA_A = 'A'; +const char BAM_DNA_C = 'C'; +const char BAM_DNA_G = 'G'; +const char BAM_DNA_T = 'T'; +const char BAM_DNA_N = 'N'; +const char BAM_DNA_DEL = '-'; +const char BAM_DNA_PAD = '*'; + +// zlib constants +const int GZIP_ID1 = 31; +const int GZIP_ID2 = 139; +const int CM_DEFLATE = 8; +const int FLG_FEXTRA = 4; +const int OS_UNKNOWN = 255; +const int BGZF_XLEN = 6; +const int BGZF_ID1 = 66; +const int BGZF_ID2 = 67; +const int BGZF_LEN = 2; +const int GZIP_WINDOW_BITS = -15; +const int Z_DEFAULT_MEM_LEVEL = 8; + +// BGZF constants +const int BGZF_BLOCK_HEADER_LENGTH = 18; +const int BGZF_BLOCK_FOOTER_LENGTH = 8; +const int BGZF_MAX_BLOCK_SIZE = 65536; +const int BGZF_DEFAULT_BLOCK_SIZE = 65536; + +} // namespace Constants +} // namespace BamTools + +#endif // BAM_CONSTANTS_H diff --git a/src/utils/BamTools/src/api/BamIndex.h b/src/utils/BamTools/src/api/BamIndex.h new file mode 100644 index 0000000000000000000000000000000000000000..00a8f0174458f542268944d30aa39997400d0338 --- /dev/null +++ b/src/utils/BamTools/src/api/BamIndex.h @@ -0,0 +1,80 @@ +// *************************************************************************** +// BamIndex.h (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 5 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides basic BAM index interface +// *************************************************************************** + +#ifndef BAM_INDEX_H +#define BAM_INDEX_H + +#include <api/api_global.h> +#include <api/BamAux.h> +#include <string> + +namespace BamTools { + +namespace Internal { + class BamReaderPrivate; +} // namespace Internal + +/*! \class BamTools::BamIndex + \brief Provides methods for generating & loading BAM index files. + + This class straddles the line between public API and internal + implementation detail. Most client code should never have to use this + class directly. + + It is exposed to the public API to allow advanced users to implement + their own custom indexing schemes. + + More documentation on methods & enums coming soon. +*/ + +class API_EXPORT BamIndex { + + // enums + public: + // specify index-caching behavior + enum IndexCacheMode { FullIndexCaching = 0 // store entire index file contents in memory + , LimitedIndexCaching // store only index data for current reference + , NoIndexCaching // do not store any index data between jumps + }; + + // list of supported BamIndex types + enum IndexType { BAMTOOLS = 0 + , STANDARD + }; + + // ctor & dtor + public: + BamIndex(Internal::BamReaderPrivate* reader) : m_reader(reader) { } + virtual ~BamIndex(void) { } + + // index interface + public: + // builds index from associated BAM file & writes out to index file + virtual bool Create(void) =0; // creates index file from BAM file + // returns whether reference has alignments or no + virtual bool HasAlignments(const int& referenceID) const =0; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether 
there are alignments + // available after the jump position + virtual bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) =0; + // loads existing data from file into memory + virtual bool Load(const std::string& filename) =0; + // change the index caching behavior + virtual void SetCacheMode(const BamIndex::IndexCacheMode& mode) =0; + + // data members + protected: + Internal::BamReaderPrivate* m_reader; // copy, not owned +}; + +} // namespace BamTools + +#endif // BAM_INDEX_H diff --git a/src/utils/BamTools/src/api/BamMultiReader.cpp b/src/utils/BamTools/src/api/BamMultiReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..06055df393daae3cb96233eebb5787b21c0b71d2 --- /dev/null +++ b/src/utils/BamTools/src/api/BamMultiReader.cpp @@ -0,0 +1,396 @@ +// *************************************************************************** +// BamMultiReader.cpp (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 15 March 2011 (DB) +// --------------------------------------------------------------------------- +// Convenience class for reading multiple BAM files. +// +// This functionality allows applications to work on very large sets of files +// without requiring intermediate merge, sort, and index steps for each file +// subset. It also improves the performance of our merge system as it +// precludes the need to sort merged files. +// *************************************************************************** + +#include <api/BamMultiReader.h> +#include <api/internal/BamMultiReader_p.h> +using namespace BamTools; + +#include <string> +#include <vector> +using namespace std; + +/*! \class BamTools::BamMultiReader + \brief Convenience class for reading multiple BAM files. +*/ + +/*! 
\fn BamMultiReader::BamMultiReader(void) + \brief constructor +*/ +BamMultiReader::BamMultiReader(void) + : d(new Internal::BamMultiReaderPrivate) +{ } + +/*! \fn BamMultiReader::~BamMultiReader(void) + \brief destructor +*/ +BamMultiReader::~BamMultiReader(void) { + delete d; + d = 0; +} + +/*! \fn void BamMultiReader::Close(void) + \brief Closes all open BAM files. + + Also clears out all header and reference data. + + \sa CloseFile(), IsOpen(), Open(), BamReader::Close() +*/ +void BamMultiReader::Close(void) { + d->Close(); +} + +/*! \fn void BamMultiReader::CloseFile(const std::string& filename) + \brief Closes requested BAM file. + + Leaves any other file(s) open, along with header and reference data. + + \sa Close(), IsOpen(), Open(), BamReader::Close() +*/ +void BamMultiReader::CloseFile(const std::string& filename) { + d->CloseFile(filename); +} + +/*! \fn bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) + \brief Creates index files for the current BAM files. + + \param type file format to create, see BamIndex::IndexType for available formats + \return \c true if index files created OK + \sa LocateIndexes(), OpenIndexes(), BamReader::CreateIndex() +*/ +bool BamMultiReader::CreateIndexes(const BamIndex::IndexType& type) { + return d->CreateIndexes(type); +} + +/*! \fn const std::vector<std::string> BamMultiReader::Filenames(void) const + \brief Returns list of filenames for all open BAM files. + + Retrieved filenames will contain whatever was passed via Open(). + If you need full directory paths here, be sure to include them + when you open the BAM files. + + \returns names of open BAM files. If no files are open, returns an empty vector. + \sa IsOpen(), BamReader::GetFilename() +*/ +const std::vector<std::string> BamMultiReader::Filenames(void) const { + return d->Filenames(); +} + +/*! \fn SamHeader BamMultiReader::GetHeader(void) const + \brief Returns unified SAM-format header for all files + + N.B. 
- Modifying the retrieved text does NOT affect the current + BAM files. These files have been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns header data wrapped in SamHeader object + \sa GetHeaderText(), BamReader::GetHeader() +*/ +SamHeader BamMultiReader::GetHeader(void) const { + return d->GetHeader(); +} + +/*! \fn std::string BamMultiReader::GetHeaderText(void) const + \brief Returns unified SAM-format header text for all files + + N.B. - Modifying the retrieved text does NOT affect the current + BAM files. These files have been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns SAM-formatted header text + \sa GetHeader(), BamReader::GetHeaderText() +*/ +std::string BamMultiReader::GetHeaderText(void) const { + return d->GetHeaderText(); +} + +/*! \fn bool BamMultiReader::GetNextAlignment(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Equivalent to BamReader::GetNextAlignment() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on current SortOrder. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignmentCore(), SetRegion(), SetSortOrder(), BamReader::GetNextAlignment() +*/ +bool BamMultiReader::GetNextAlignment(BamAlignment& nextAlignment) { + return d->GetNextAlignment(nextAlignment); +} + +/*! \fn bool BamMultiReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment. 
+ + Equivalent to BamReader::GetNextAlignmentCore() with respect to what is a valid + overlapping alignment and what data gets populated. + + This method takes care of determining which alignment actually is 'next' + across multiple files, depending on current SortOrder. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa GetNextAlignment(), SetRegion(), SetSortOrder(), BamReader::GetNextAlignmentCore() +*/ +bool BamMultiReader::GetNextAlignmentCore(BamAlignment& nextAlignment) { + return d->GetNextAlignmentCore(nextAlignment); +} + +/*! \fn int BamMultiReader::GetReferenceCount(void) const + \brief Returns number of reference sequences. + \sa BamReader::GetReferenceCount() +*/ +int BamMultiReader::GetReferenceCount(void) const { + return d->GetReferenceCount(); +} + +/*! \fn const RefVector& BamMultiReader::GetReferenceData(void) const + \brief Returns all reference sequence entries. + \sa RefData, BamReader::GetReferenceData() +*/ +const BamTools::RefVector BamMultiReader::GetReferenceData(void) const { + return d->GetReferenceData(); +} + +/*! \fn int BamMultiReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. + + If \a refName is not found, returns -1. + + \sa BamReader::GetReferenceID() +*/ +int BamMultiReader::GetReferenceID(const std::string& refName) const { + return d->GetReferenceID(refName); +} + +/*! \fn bool BamMultiReader::HasIndexes(void) const + \brief Returns \c true if all BAM files have index data available. + \sa BamReader::HasIndex() +*/ +bool BamMultiReader::HasIndexes(void) const { + return d->HasIndexes(); +} + +/*! \fn bool BamMultiReader::HasOpenReaders(void) const + \brief Returns \c true if there are any open BAM files. +*/ +bool BamMultiReader::HasOpenReaders(void) const { + return d->HasOpenReaders(); +} + +/*! 
\fn bool BamMultiReader::IsIndexLoaded(void) const + \brief Returns \c true if all BAM files have index data available. + + \deprecated Instead use HasIndexes() + \cond + See explanation in BamReader.cpp for more details on the deprecation decision. + \endcond +*/ + +bool BamMultiReader::IsIndexLoaded(void) const { + return d->HasIndexes(); +} + +/*! \fn bool BamMultiReader::Jump(int refID, int position) + \brief Performs a random-access jump within current BAM files. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \returns \c true if jump was successful + \sa HasIndexes(), BamReader::Jump() +*/ + +bool BamMultiReader::Jump(int refID, int position) { + return d->Jump(refID, position); +} + +/*! \fn bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) + \brief Looks for index files that match current BAM files. + + Use this function when you need index files, and perhaps have a + preferred index format, but do not depend heavily on which indexes + actually get loaded at runtime. + + For each BAM file, this function will defer to your \a preferredType + whenever possible. However, if an index file of \a preferredType can + not be found, then it will look for any other index file that matches + that BAM file. + + An example case would look like this: + \code + + BamMultiReader reader; + // do setup + + // ensure that all files have an index + if ( !reader.LocateIndexes() ) // opens any existing index files that match our BAM files + reader.CreateIndexes(); // creates index files for BAM files that still lack one + + // do interesting stuff + // ... + + \endcode + + If you want precise control over which index files are loaded, use OpenIndexes() + with the desired index filenames. If that function returns false, you can use + CreateIndexes() to then build index files of the exact requested format. 
+ + \param preferredType desired index file format, see BamIndex::IndexType for available formats + \returns \c true if index files could be found for \b ALL open BAM files + \sa BamReader::LocateIndex() +*/ +bool BamMultiReader::LocateIndexes(const BamIndex::IndexType& preferredType) { + return d->LocateIndexes(preferredType); +} + +/*! \fn bool BamMultiReader::Open(const std::vector<std::string>& filenames) + \brief Opens BAM files. + + N.B. - Opening BAM files will invalidate any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. + Follow this with Jump() or SetRegion() to establish a region of interest. + + \param filenames list of BAM filenames to open + \returns \c true if BAM files were opened successfully + \sa Close(), HasOpenReaders(), OpenFile(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::Open(const std::vector<std::string>& filenames) { + return d->Open(filenames); +} + +/*! \fn bool BamMultiReader::OpenFile(const std::string& filename) + \brief Opens a single BAM file. + + Adds another BAM file to multireader "on-the-fly". + + N.B. - Opening a BAM file invalidates any current region set on the multireader. + All file pointers will be returned to the beginning of the alignment data. + Follow this with Jump() or SetRegion() to establish a region of interest. + + \param filename BAM filename to open + \returns \c true if BAM file was opened successfully + \sa Close(), HasOpenReaders(), Open(), OpenIndexes(), BamReader::Open() +*/ +bool BamMultiReader::OpenFile(const std::string& filename) { + return d->OpenFile(filename); +} + +/*! \fn bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) + \brief Opens index files for current BAM files. + + N.B. - Currently assumes that index filenames match the order (and number) of + BAM files passed to Open(). 
+ + \param indexFilenames list of BAM index file names + \returns \c true if BAM index file was opened & data loaded successfully + \sa LocateIndex(), Open(), SetIndex(), BamReader::OpenIndex() +*/ +bool BamMultiReader::OpenIndexes(const std::vector<std::string>& indexFilenames) { + return d->OpenIndexes(indexFilenames); +} + +/*! \fn void BamMultiReader::PrintFilenames(void) const + \brief Convenience method for printing filenames to stdout. + \deprecated Doesn't really belong as an API function. Clients should + determine how the data is reported. + \sa Filenames(), BamReader::GetFilename() +*/ +void BamMultiReader::PrintFilenames(void) const { + d->PrintFilenames(); +} + +/*! \fn bool BamMultiReader::Rewind(void) + \brief Returns the internal file pointers to the beginning of alignment records. + + Useful for performing multiple sequential passes through BAM files. + Calling this function clears any prior region that may have been set. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion(), BamReader::Rewind() +*/ +bool BamMultiReader::Rewind(void) { + return d->Rewind(); +} + +/*! \fn void BamMultiReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) + \brief Changes the caching behavior of the index data. + + Default mode is BamIndex::LimitedIndexCaching. + + \param mode desired cache mode for index, see BamIndex::IndexCacheMode for + description of the available cache modes + \sa HasIndex(), BamReader::SetIndexCacheMode() +*/ +void BamMultiReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + d->SetIndexCacheMode(mode); +} + +/*! \fn bool BamMultiReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Equivalent to calling BamReader::SetRegion() on all open BAM files. 
+ + \param region desired region-of-interest to activate + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ +bool BamMultiReader::SetRegion(const BamRegion& region) { + return d->SetRegion(region); +} + +/*! \fn bool BamMultiReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest + + This is an overloaded function. + + Equivalent to calling BamReader::SetRegion() on all open BAM files. + + \param leftRefID referenceID of region's left boundary + \param leftPosition position of region's left boundary + \param rightRefID reference ID of region's right boundary + \param rightPosition position of region's right boundary + + \returns \c true if ALL readers set the region successfully + \sa HasIndexes(), Jump(), BamReader::SetRegion() +*/ +bool BamMultiReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) +{ + BamRegion region(leftRefID, leftPosition, rightRefID, rightPosition); + return d->SetRegion(region); +} + +/*! \fn void BamMultiReader::SetSortOrder(const SortOrder& order) + \brief Sets the expected sorting order for reading across multiple BAM files. + + Default is BamMultiReader::SortedByPosition. + + The SortOrder determines how the reader determines which alignment is "next" + from among its open readers. 
+ + \param order expected sort order +*/ +void BamMultiReader::SetSortOrder(const SortOrder& order) { + d->SetSortOrder(order); +} diff --git a/src/utils/BamTools/src/api/BamMultiReader.h b/src/utils/BamTools/src/api/BamMultiReader.h new file mode 100644 index 0000000000000000000000000000000000000000..cc49ec8eb2076469be2585bdf978bd603acec8a3 --- /dev/null +++ b/src/utils/BamTools/src/api/BamMultiReader.h @@ -0,0 +1,127 @@ +// *************************************************************************** +// BamMultiReader.h (c) 2010 Erik Garrison, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 15 March 2011 (DB) +// --------------------------------------------------------------------------- +// Convenience class for reading multiple BAM files. +// *************************************************************************** + +#ifndef BAMMULTIREADER_H +#define BAMMULTIREADER_H + +#include <api/api_global.h> +#include <api/BamReader.h> +#include <map> +#include <sstream> +#include <string> +#include <utility> + +namespace BamTools { + +namespace Internal { + class BamMultiReaderPrivate; +} // namespace Internal + +class API_EXPORT BamMultiReader { + + public: + enum SortOrder { SortedByPosition = 0 + , SortedByReadName + , Unsorted + }; + + // constructor / destructor + public: + BamMultiReader(void); + ~BamMultiReader(void); + + // public interface + public: + + // ---------------------- + // BAM file operations + // ---------------------- + + // closes all open BAM files + void Close(void); + // close only the requested BAM file + void CloseFile(const std::string& filename); + // returns list of filenames for all open BAM files + const std::vector<std::string> Filenames(void) const; + // returns true if multireader has any open BAM files + bool HasOpenReaders(void) const; + // performs random-access jump within current BAM 
files + bool Jump(int refID, int position = 0); + // opens BAM files + bool Open(const std::vector<std::string>& filenames); + // opens a single BAM file, adding to any other current BAM files + bool OpenFile(const std::string& filename); + // returns file pointers to beginning of alignments + bool Rewind(void); + // sets the target region of interest + bool SetRegion(const BamRegion& region); + // sets the target region of interest + bool SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); + + // sets the expected sorting order for reading across multiple BAM files + void SetSortOrder(const SortOrder& order); + + // ---------------------- + // access auxiliary data + // ---------------------- + + // returns unified SAM header for all files + SamHeader GetHeader(void) const; + // returns unified SAM header text for all files + std::string GetHeaderText(void) const; + // returns number of reference sequences + int GetReferenceCount(void) const; + // returns all reference sequence entries. + const BamTools::RefVector GetReferenceData(void) const; + // returns the ID of the reference with this name. 
+ int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates index files for current BAM files + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if all BAM files have index data available + bool HasIndexes(void) const; + // looks for index files that match current BAM files + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens index files for current BAM files. + bool OpenIndexes(const std::vector<std::string>& indexFilenames); + // changes the caching behavior of the index data + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // deprecated methods + public: + // returns \c true if all BAM files have index data available. + bool IsIndexLoaded(void) const; + // convenience method for printing filenames to stdout + void PrintFilenames(void) const; + + // private implementation + private: + Internal::BamMultiReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMMULTIREADER_H diff --git a/src/utils/BamTools/src/api/BamMultiReader.o b/src/utils/BamTools/src/api/BamMultiReader.o new file mode 100644 index 0000000000000000000000000000000000000000..f053ec2ab82d48b489244296b413cd1e6091c4f4 Binary files /dev/null and b/src/utils/BamTools/src/api/BamMultiReader.o differ diff --git a/src/utils/BamTools/src/api/BamReader.cpp b/src/utils/BamTools/src/api/BamReader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..eaa68827f95b06d09dea64f21a3d16277b9b9c69 --- /dev/null +++ b/src/utils/BamTools/src/api/BamReader.cpp @@ -0,0 +1,370 @@ +// *************************************************************************** +// BamReader.cpp (c) 2009 Derek Barnett, Michael Str�mberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 4 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides read access to BAM files. +// *************************************************************************** + +#include <api/BamReader.h> +#include <api/internal/BamReader_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <string> +#include <vector> +using namespace std; + +/*! \class BamTools::BamReader + \brief Provides read access to BAM files. +*/ + +/*! \fn BamReader::BamReader(void) + \brief constructor +*/ +BamReader::BamReader(void) + : d(new BamReaderPrivate(this)) +{ } + +/*! \fn BamReader::~BamReader(void) + \brief destructor +*/ +BamReader::~BamReader(void) { + delete d; + d = 0; +} + +/*! \fn void BamReader::Close(void) + \brief Closes the current BAM file. + + Also clears out all header and reference data. + + \sa IsOpen(), Open() +*/ +void BamReader::Close(void) { + d->Close(); +} + +/*! \fn bool BamReader::CreateIndex(const BamIndex::IndexType& type) + \brief Creates an index file for current BAM file. + + \param type file format to create, see BamIndex::IndexType for available formats + \return \c true if index created OK + \sa LocateIndex(), OpenIndex() +*/ +bool BamReader::CreateIndex(const BamIndex::IndexType& type) { + return d->CreateIndex(type); +} + +/*! \fn const std::string BamReader::GetFilename(void) const + \brief Returns name of current BAM file. + + Retrieved filename will contain whatever was passed via Open(). + If you need full directory paths here, be sure to include them + when you open the BAM file. + + \returns name of open BAM file. If no file is open, returns an empty string. + \sa IsOpen() +*/ +const std::string BamReader::GetFilename(void) const { + return d->Filename(); +} + +/*! 
\fn SamHeader BamReader::GetHeader(void) const + \brief Returns SAM header data. + + Header data is wrapped in a SamHeader object that can be conveniently queried & modified. + + N.B. - Modifying the retrieved SamHeader object does NOT affect the + current BAM file. This file has been opened in a read-only mode. + However, your modified SamHeader object can be used in conjunction with + BamWriter to generate a new BAM file with the appropriate header information. + + \returns header data object + \sa GetHeaderText() +*/ +SamHeader BamReader::GetHeader(void) const { + return d->GetSamHeader(); +} + +/*! \fn std::string BamReader::GetHeaderText(void) const + \brief Returns SAM header data, as SAM-formatted text. + + N.B. - Modifying the retrieved text does NOT affect the current + BAM file. This file has been opened in a read-only mode. However, + your modified header text can be used in conjunction with BamWriter + to generate a new BAM file with the appropriate header information. + + \returns SAM-formatted header text + \sa GetHeader() +*/ +std::string BamReader::GetHeaderText(void) const { + return d->GetHeaderText(); +} + +/*! \fn bool BamReader::GetNextAlignment(BamAlignment& alignment) + \brief Retrieves next available alignment. + + Attempts to read the next alignment record from BAM file, and checks to see + if it overlaps the current region. If no region is currently set, then the + next alignment available is always considered valid. + + If a region has been set, via Jump() or SetRegion(), an alignment is only + considered valid if it overlaps the region. If the actual 'next' alignment record + in the BAM file does not overlap this region, then this function will read sequentially + through the file until the next alignment that overlaps this region is found. + Once the region has been exhausted (i.e. the next alignment loaded is beyond the region), + the function aborts and returns \c false. 
In this case, there is no point to continue + reading, assuming properly sorted alignments. + + This function fully populates all of the alignment's available data fields, + including the string data fields (read name, bases, qualities, tags, filename). + If only positional data (refID, position, CIGAR ops, alignment flags, etc.) + are required, consider using GetNextAlignmentCore() for a significant + performance boost. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found +*/ +bool BamReader::GetNextAlignment(BamAlignment& alignment) { + return d->GetNextAlignment(alignment); +} + +/*! \fn bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) + \brief Retrieves next available alignment, without populating the alignment's string data fields. + + Equivalent to GetNextAlignment() with respect to what is a valid overlapping alignment. + + However, this method does NOT populate the alignment's string data fields + (read name, bases, qualities, tags, filename). This provides a boost in speed + when these fields are not required for every alignment. These fields can be + populated 'lazily' (as needed) by calling BamAlignment::BuildCharData() later. + + \param alignment destination for alignment record data + \returns \c true if a valid alignment was found + \sa SetRegion() +*/ +bool BamReader::GetNextAlignmentCore(BamAlignment& alignment) { + return d->GetNextAlignmentCore(alignment); +} + +/*! \fn int BamReader::GetReferenceCount(void) const + \brief Returns number of reference sequences. +*/ +int BamReader::GetReferenceCount(void) const { + return d->GetReferenceCount(); +} + +/*! \fn const RefVector& BamReader::GetReferenceData(void) const + \brief Returns all reference sequence entries. + \sa RefData +*/ +const RefVector& BamReader::GetReferenceData(void) const { + return d->GetReferenceData(); +} + +/*! 
\fn int BamReader::GetReferenceID(const std::string& refName) const + \brief Returns the ID of the reference with this name. + + If \a refName is not found, returns -1. +*/ +int BamReader::GetReferenceID(const std::string& refName) const { + return d->GetReferenceID(refName); +} + +/*! \fn bool BamReader::HasIndex(void) const + \brief Returns \c true if index data is available. +*/ +bool BamReader::HasIndex(void) const { + return d->HasIndex(); +} + +/*! \fn bool BamReader::IsIndexLoaded(void) const + \brief Returns \c true if index data is available. + + \deprecated Instead use HasIndex() + \cond + Deprecated purely for API semantic clarity - HasIndex() should be clearer + than IsIndexLoaded() in light of the new caching modes that may clear the + index data from memory, but leave the index file open for later random access + seeks. + + For example, what would (IsIndexLoaded() == true) mean when cacheMode has been + explicitly set to NoIndexCaching? This is confusing at best, misleading about + current memory behavior at worst. + \endcond +*/ +bool BamReader::IsIndexLoaded(void) const { + return d->HasIndex(); +} + +/*! \fn bool BamReader::IsOpen(void) const + \brief Returns \c true if a BAM file is open for reading. +*/ +bool BamReader::IsOpen(void) const { + return d->IsOpen(); +} + +/*! \fn bool BamReader::Jump(int refID, int position) + \brief Performs a random-access jump within BAM file. + + This is a convenience method, equivalent to calling SetRegion() + with only a left boundary specified. + + \returns \c true if jump was successful + \sa HasIndex() +*/ +bool BamReader::Jump(int refID, int position) { + return d->SetRegion( BamRegion(refID, position) ); +} + +/*! \fn bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) + \brief Looks in BAM file's directory for a matching index file. 
+ + Use this function when you need an index file, and perhaps have a + preferred index format, but do not depend heavily on which format + actually gets loaded at runtime. + + This function will defer to your \a preferredType whenever possible. + However, if an index file of \a preferredType can not be found, then + it will look for any other index file that corresponds to this BAM file. + + If you want precise control over which index file is loaded, use OpenIndex() + with the desired index filename. If that function returns false, you can use + CreateIndex() to then build an index of the exact requested format. + + \param preferredType desired index file format, see BamIndex::IndexType for available formats + \returns \c true if (any) index file could be found +*/ +bool BamReader::LocateIndex(const BamIndex::IndexType& preferredType) { + return d->LocateIndex(preferredType); +} + +/*! \fn bool BamReader::Open(const std::string& filename) + \brief Opens a BAM file. + + If BamReader is already opened on another file, this function closes + that file, then attempts to open requested \a filename. + + \param filename name of BAM file to open + \returns \c true if BAM file was opened successfully + \sa Close(), IsOpen(), OpenIndex() +*/ +bool BamReader::Open(const std::string& filename) { + return d->Open(filename); +} + +/*! \fn bool BamReader::OpenIndex(const std::string& indexFilename) + \brief Opens a BAM index file. + + \param indexFilename name of BAM index file + + \returns \c true if BAM index file was opened & data loaded successfully + \sa LocateIndex(), Open(), SetIndex() +*/ +bool BamReader::OpenIndex(const std::string& indexFilename) { + return d->OpenIndex(indexFilename); +} + +/*! \fn bool BamReader::Rewind(void) + \brief Returns the internal file pointer to the first alignment record. + + Useful for performing multiple sequential passes through a BAM file. + Calling this function clears any prior region that may have been set. + + N.B. 
- Note that this function sets the file pointer to first alignment record + in the BAM file, NOT the beginning of the file. + + \returns \c true if rewind operation was successful + \sa Jump(), SetRegion() +*/ +bool BamReader::Rewind(void) { + return d->Rewind(); +} + +/*! \fn void BamReader::SetIndex(BamIndex* index) + \brief Sets a custom BamIndex on this reader. + + Only necessary for custom BamIndex subclasses. Most clients should + never have to use this function. + + Example: + \code + BamReader reader; + reader.SetIndex(new MyCustomBamIndex); + \endcode + + N.B. - BamReader takes ownership of \a index - i.e. BamReader will + take care of deleting the pointer when the reader is destructed, + when the current BAM file is closed, or when a new index is requested. + + \param index custom BamIndex subclass created by client + \sa CreateIndex(), LocateIndex(), OpenIndex() +*/ +void BamReader::SetIndex(BamIndex* index) { + d->SetIndex(index); +} + +/*! \fn void BamReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) + \brief Changes the caching behavior of the index data. + + Default mode is BamIndex::LimitedIndexCaching. + + \param mode desired cache mode for index, see BamIndex::IndexCacheMode for + description of the available cache modes + \sa HasIndex() +*/ +void BamReader::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + d->SetIndexCacheMode(mode); +} + +/*! \fn bool BamReader::SetRegion(const BamRegion& region) + \brief Sets a target region of interest + + Requires that index data be available. Attempts a random-access + jump in the BAM file, near \a region left boundary position. + + Subsequent calls to GetNextAlignment() or GetNextAlignmentCore() + will only return \c true when alignments can be found that overlap + this \a region. + + A \a region with no right boundary is considered open-ended, meaning + that all alignments that lie downstream of the left boundary are + considered valid, continuing to the end of the BAM file. 
+ + \param region desired region-of-interest to activate + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const BamRegion& region) { + return d->SetRegion(region); +} + +/*! \fn bool BamReader::SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition) + \brief Sets a target region of interest. + + This is an overloaded function. + + \param leftRefID reference ID of region's left boundary + \param leftPosition position of region's left boundary + \param rightRefID reference ID of region's right boundary + \param rightPosition position of region's right boundary + + \returns \c true if reader was able to jump successfully to the region's left boundary + \sa HasIndex(), Jump() +*/ +bool BamReader::SetRegion(const int& leftRefID, + const int& leftBound, + const int& rightRefID, + const int& rightBound) +{ + return d->SetRegion( BamRegion(leftRefID, leftBound, rightRefID, rightBound) ); +} diff --git a/src/utils/BamTools/src/api/BamReader.h b/src/utils/BamTools/src/api/BamReader.h new file mode 100644 index 0000000000000000000000000000000000000000..85b0c0d5713ed15f4a616ec4a6f7e1c2ab9d3c1b --- /dev/null +++ b/src/utils/BamTools/src/api/BamReader.h @@ -0,0 +1,118 @@ +// *************************************************************************** +// BamReader.h (c) 2009 Derek Barnett, Michael Strömberg +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 4 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides read access to BAM files.
+// *************************************************************************** + +#ifndef BAMREADER_H +#define BAMREADER_H + +#include <api/api_global.h> +#include <api/BamAlignment.h> +#include <api/BamIndex.h> +#include <api/SamHeader.h> +#include <string> + +namespace BamTools { + +namespace Internal { + class BamReaderPrivate; +} // namespace Internal + +class API_EXPORT BamReader { + + // constructor / destructor + public: + BamReader(void); + ~BamReader(void); + + // public interface + public: + + // ---------------------- + // BAM file operations + // ---------------------- + + // closes the current BAM file + void Close(void); + // returns filename of current BAM file + const std::string GetFilename(void) const; + // returns true if a BAM file is open for reading + bool IsOpen(void) const; + // performs random-access jump within BAM file + bool Jump(int refID, int position = 0); + // opens a BAM file + bool Open(const std::string& filename); + // moves the internal file pointer back to the first alignment record + bool Rewind(void); + // sets the target region of interest + bool SetRegion(const BamRegion& region); + // sets the target region of interest + bool SetRegion(const int& leftRefID, + const int& leftPosition, + const int& rightRefID, + const int& rightPosition); + + // ---------------------- + // access alignment data + // ---------------------- + + // retrieves next available alignment + bool GetNextAlignment(BamAlignment& alignment); + // retrieves next available alignment (without populating the alignment's string data fields) + bool GetNextAlignmentCore(BamAlignment& alignment); + + // ---------------------- + // access header data + // ---------------------- + + // returns SAM header data + SamHeader GetHeader(void) const; + // returns SAM header data, as SAM-formatted text + std::string GetHeaderText(void) const; + + // ---------------------- + // access reference data + // ---------------------- + + // returns the number of reference sequences +
int GetReferenceCount(void) const; + // returns all reference sequence entries + const RefVector& GetReferenceData(void) const; + // returns the ID of the reference with this name + int GetReferenceID(const std::string& refName) const; + + // ---------------------- + // BAM index operations + // ---------------------- + + // creates an index file for current BAM file, using the requested index type + bool CreateIndex(const BamIndex::IndexType& type = BamIndex::STANDARD); + // returns true if index data is available + bool HasIndex(void) const; + // looks in BAM file's directory for a matching index file + bool LocateIndex(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + // opens a BAM index file + bool OpenIndex(const std::string& indexFilename); + // sets a custom BamIndex on this reader + void SetIndex(BamIndex* index); + // changes the caching behavior of the index data + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // deprecated methods + public: + // returns true if index data is available (deprecated - use HasIndex() instead) + bool IsIndexLoaded(void) const; + + // private implementation + private: + Internal::BamReaderPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMREADER_H diff --git a/src/utils/BamTools/src/api/BamReader.o b/src/utils/BamTools/src/api/BamReader.o new file mode 100644 index 0000000000000000000000000000000000000000..1fd86c23022b6a281f4a1135690e3477e21e989e Binary files /dev/null and b/src/utils/BamTools/src/api/BamReader.o differ diff --git a/src/utils/BamTools/src/api/BamWriter.cpp b/src/utils/BamTools/src/api/BamWriter.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8582f34be92b30e5e0d09a0465d729cf880b52c0 --- /dev/null +++ b/src/utils/BamTools/src/api/BamWriter.cpp @@ -0,0 +1,143 @@ +// *************************************************************************** +// BamWriter.cpp (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved.
+// --------------------------------------------------------------------------- +// Last modified: 4 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include <api/BamAlignment.h> +#include <api/BamWriter.h> +#include <api/SamHeader.h> +#include <api/internal/BamWriter_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +using namespace std; + +/*! \class BamTools::BamWriter + \brief Provides write access for generating BAM files. +*/ +/*! \enum BamTools::BamWriter::CompressionMode + \brief This enum describes the compression behaviors for output BAM files. +*/ +/*! \var BamWriter::CompressionMode BamWriter::Compressed + \brief Use normal BAM compression +*/ +/*! \var BamWriter::CompressionMode BamWriter::Uncompressed + \brief Disable BAM compression + + Useful in situations where the BAM data is streamed (e.g. piping). + It would be wasteful to compress, and then immediately decompress + the data. +*/ + +/*! \fn BamWriter::BamWriter(void) + \brief constructor +*/ +BamWriter::BamWriter(void) + : d(new BamWriterPrivate) +{ } + +/*! \fn BamWriter::~BamWriter(void) + \brief destructor +*/ +BamWriter::~BamWriter(void) { + delete d; + d = 0; +} + +/*! \fn void BamWriter::Close(void) + \brief Closes the current BAM file. + \sa Open() +*/ +void BamWriter::Close(void) { + d->Close(); +} + +/*! \fn bool BamWriter::IsOpen(void) const + \brief Returns \c true if BAM file is open for writing. + \sa Open() +*/ +bool BamWriter::IsOpen(void) const { + return d->IsOpen(); +} + +/*! \fn bool BamWriter::Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + Will overwrite the BAM file if it already exists.
+ + \param filename name of output BAM file + \param samHeaderText header data, as SAM-formatted string + \param referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeaderText(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences) +{ + return d->Open(filename, samHeaderText, referenceSequences); +} + +/*! \fn bool BamWriter::Open(const std::string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences) + \brief Opens a BAM file for writing. + + This is an overloaded function. + + Will overwrite the BAM file if it already exists. + + \param filename name of output BAM file + \param samHeader header data, wrapped in SamHeader object + \param referenceSequences list of reference entries + + \return \c true if opened successfully + \sa Close(), IsOpen(), BamReader::GetHeader(), BamReader::GetReferenceData() +*/ +bool BamWriter::Open(const std::string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences) +{ + return d->Open(filename, samHeader.ToString(), referenceSequences); +} + +/*! \fn void BamWriter::SaveAlignment(const BamAlignment& alignment) + \brief Saves an alignment to the BAM file. + + \param alignment BamAlignment record to save + \sa BamReader::GetNextAlignment(), BamReader::GetNextAlignmentCore() +*/ +void BamWriter::SaveAlignment(const BamAlignment& alignment) { + d->SaveAlignment(alignment); +} + +/*! \fn void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) + \brief Sets the output compression mode. + + Default mode is BamWriter::Compressed. + + N.B. - Changing the compression mode is disabled on open files (i.e. the request will be ignored). + Be sure to call this function before opening the BAM file. + + \code + BamWriter writer; + writer.SetCompressionMode(BamWriter::Uncompressed); + writer.Open( ... 
); + // ... + \endcode + + \param compressionMode desired output compression behavior + \sa IsOpen(), Open() +*/ +void BamWriter::SetCompressionMode(const CompressionMode& compressionMode) { + d->SetWriteCompressed( compressionMode == BamWriter::Compressed ); +} diff --git a/src/utils/BamTools/src/api/BamWriter.h b/src/utils/BamTools/src/api/BamWriter.h new file mode 100644 index 0000000000000000000000000000000000000000..476dbecf271e35ea8db22883055faa69c62babf0 --- /dev/null +++ b/src/utils/BamTools/src/api/BamWriter.h @@ -0,0 +1,64 @@ +// *************************************************************************** +// BamWriter.h (c) 2009 Michael Strömberg, Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 4 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_H +#define BAMWRITER_H + +#include <api/api_global.h> +#include <api/BamAux.h> +#include <string> + +namespace BamTools { + +class BamAlignment; +class SamHeader; + +namespace Internal { + class BamWriterPrivate; +} // namespace Internal + +class API_EXPORT BamWriter { + + public: enum CompressionMode { Compressed = 0 + , Uncompressed + }; + + // ctor & dtor + public: + BamWriter(void); + ~BamWriter(void); + + // public interface + public: + // closes the current BAM file + void Close(void); + // returns true if BAM file is open for writing + bool IsOpen(void) const; + // opens a BAM file for writing (header given as SAM-formatted text) + bool Open(const std::string& filename, + const std::string& samHeaderText, + const RefVector& referenceSequences); + // opens a BAM file for writing (header given as a SamHeader object) + bool Open(const std::string& filename, + const SamHeader& samHeader, + const RefVector& referenceSequences); + // saves
the alignment to the alignment archive + void SaveAlignment(const BamAlignment& alignment); + // sets the output compression mode + void SetCompressionMode(const CompressionMode& compressionMode); + + // private implementation + private: + Internal::BamWriterPrivate* d; +}; + +} // namespace BamTools + +#endif // BAMWRITER_H diff --git a/src/utils/BamTools/src/api/BamWriter.o b/src/utils/BamTools/src/api/BamWriter.o new file mode 100644 index 0000000000000000000000000000000000000000..9f7e8a4a859cf95c05edeb768a381936c4c96eba Binary files /dev/null and b/src/utils/BamTools/src/api/BamWriter.o differ diff --git a/src/utils/BamTools/src/api/CMakeLists.txt b/src/utils/BamTools/src/api/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ba99632bd4594d189e6d024232969eda8019045 --- /dev/null +++ b/src/utils/BamTools/src/api/CMakeLists.txt @@ -0,0 +1,78 @@ +# ========================== +# BamTools CMakeLists.txt +# (c) 2010 Derek Barnett +# +# src/api/ +# ========================== + +# list include paths +include_directories( ${BamTools_SOURCE_DIR}/src ) + +# add compiler definitions +add_definitions( -DBAMTOOLS_API_LIBRARY ) # (for proper exporting of library symbols) +add_definitions( -fPIC ) # (attempt to force PIC compiling on some archs) + +# list of all BamTools API source (.cpp) files +set( BamToolsAPISources + BamAlignment.cpp + BamMultiReader.cpp + BamReader.cpp + BamWriter.cpp + SamHeader.cpp + SamProgram.cpp + SamProgramChain.cpp + SamReadGroup.cpp + SamReadGroupDictionary.cpp + SamSequence.cpp + SamSequenceDictionary.cpp + internal/BamHeader_p.cpp + internal/BamIndexFactory_p.cpp + internal/BamMultiReader_p.cpp + internal/BamRandomAccessController_p.cpp + internal/BamReader_p.cpp + internal/BamStandardIndex_p.cpp + internal/BamToolsIndex_p.cpp + internal/BamWriter_p.cpp + internal/BgzfStream_p.cpp + internal/SamFormatParser_p.cpp + internal/SamFormatPrinter_p.cpp + internal/SamHeaderValidator_p.cpp +) + +# create main BamTools 
API shared library +add_library( BamTools SHARED ${BamToolsAPISources} ) +set_target_properties( BamTools PROPERTIES SOVERSION "1.0.2" ) +set_target_properties( BamTools PROPERTIES OUTPUT_NAME "bamtools" ) + +# create main BamTools API static library +add_library( BamTools-static STATIC ${BamToolsAPISources} ) +set_target_properties( BamTools-static PROPERTIES OUTPUT_NAME "bamtools" ) +set_target_properties( BamTools-static PROPERTIES PREFIX "lib" ) + +# link libraries with zlib automatically +target_link_libraries( BamTools z ) +target_link_libraries( BamTools-static z ) + +# set library install destinations +install( TARGETS BamTools LIBRARY DESTINATION "lib/bamtools") +install( TARGETS BamTools-static ARCHIVE DESTINATION "lib/bamtools") + +# export API headers +include(../ExportHeader.cmake) +set(ApiIncludeDir "api") +ExportHeader(APIHeaders api_global.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAlignment.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamAux.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamIndex.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamMultiReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamReader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders BamWriter.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamConstants.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamHeader.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgram.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamProgramChain.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroup.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamReadGroupDictionary.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequence.h ${ApiIncludeDir}) +ExportHeader(APIHeaders SamSequenceDictionary.h ${ApiIncludeDir}) diff --git a/src/utils/BamTools/src/api/SamConstants.h b/src/utils/BamTools/src/api/SamConstants.h new file mode 100644 index 0000000000000000000000000000000000000000..d34592027891a8233e6e27b6b4f2cb8a84a8f9a6 --- /dev/null +++ 
b/src/utils/BamTools/src/api/SamConstants.h @@ -0,0 +1,96 @@ +// *************************************************************************** +// SamConstants.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides constants for SAM header +// *************************************************************************** + +#ifndef SAM_CONSTANTS_H +#define SAM_CONSTANTS_H + +#include <api/api_global.h> +#include <string> + +namespace BamTools { +namespace Constants { + +// basic char constants used in SAM format +const char SAM_COLON = ':'; +const char SAM_EQUAL = '='; +const char SAM_PERIOD = '.'; +const char SAM_STAR = '*'; +const char SAM_TAB = '\t'; +const std::string SAM_DIGITS = "0123456789"; + +// HD entries +const std::string SAM_HD_BEGIN_TOKEN = "@HD"; +const std::string SAM_HD_VERSION_TAG = "VN"; +const std::string SAM_HD_SORTORDER_TAG = "SO"; +const std::string SAM_HD_GROUPORDER_TAG = "GO"; + +// SQ entries +const std::string SAM_SQ_BEGIN_TOKEN = "@SQ"; +const std::string SAM_SQ_ASSEMBLYID_TAG = "AS"; +const std::string SAM_SQ_CHECKSUM_TAG = "M5"; +const std::string SAM_SQ_LENGTH_TAG = "LN"; +const std::string SAM_SQ_NAME_TAG = "SN"; +const std::string SAM_SQ_SPECIES_TAG = "SP"; +const std::string SAM_SQ_URI_TAG = "UR"; + +// RG entries +const std::string SAM_RG_BEGIN_TOKEN = "@RG"; +const std::string SAM_RG_DESCRIPTION_TAG = "DS"; +const std::string SAM_RG_FLOWORDER_TAG = "FO"; +const std::string SAM_RG_ID_TAG = "ID"; +const std::string SAM_RG_KEYSEQUENCE_TAG = "KS"; +const std::string SAM_RG_LIBRARY_TAG = "LB"; +const std::string SAM_RG_PLATFORMUNIT_TAG = "PU"; +const std::string SAM_RG_PREDICTEDINSERTSIZE_TAG = "PI"; +const std::string SAM_RG_PRODUCTIONDATE_TAG = "DT"; +const std::string 
SAM_RG_PROGRAM_TAG = "PG"; +const std::string SAM_RG_SAMPLE_TAG = "SM"; +const std::string SAM_RG_SEQCENTER_TAG = "CN"; +const std::string SAM_RG_SEQTECHNOLOGY_TAG = "PL"; + +// PG entries +const std::string SAM_PG_BEGIN_TOKEN = "@PG"; +const std::string SAM_PG_COMMANDLINE_TAG = "CL"; +const std::string SAM_PG_ID_TAG = "ID"; +const std::string SAM_PG_NAME_TAG = "PN"; +const std::string SAM_PG_PREVIOUSPROGRAM_TAG = "PP"; +const std::string SAM_PG_VERSION_TAG = "VN"; + +// CO entries +const std::string SAM_CO_BEGIN_TOKEN = "@CO"; + +// HD:SO values +const std::string SAM_HD_SORTORDER_COORDINATE = "coordinate"; +const std::string SAM_HD_SORTORDER_QUERYNAME = "queryname"; +const std::string SAM_HD_SORTORDER_UNKNOWN = "unknown"; +const std::string SAM_HD_SORTORDER_UNSORTED = "unsorted"; + +// HD:GO values +const std::string SAM_HD_GROUPORDER_NONE = "none"; +const std::string SAM_HD_GROUPORDER_QUERY = "query"; +const std::string SAM_HD_GROUPORDER_REFERENCE = "reference"; + +// SQ:LN values +const unsigned int SAM_SQ_LENGTH_MIN = 1; +const unsigned int SAM_SQ_LENGTH_MAX = 536870911; // 2^29 - 1 + +// RG:PL values +const std::string SAM_RG_SEQTECHNOLOGY_CAPILLARY = "CAPILLARY"; +const std::string SAM_RG_SEQTECHNOLOGY_HELICOS = "HELICOS"; +const std::string SAM_RG_SEQTECHNOLOGY_ILLUMINA = "ILLUMINA"; +const std::string SAM_RG_SEQTECHNOLOGY_IONTORRENT = "IONTORRENT"; +const std::string SAM_RG_SEQTECHNOLOGY_LS454 = "LS454"; +const std::string SAM_RG_SEQTECHNOLOGY_PACBIO = "PACBIO"; +const std::string SAM_RG_SEQTECHNOLOGY_SOLID = "SOLID"; + +} // namespace Constants +} // namespace BamTools + +#endif // SAM_CONSTANTS_H diff --git a/src/utils/BamTools/src/api/SamHeader.cpp b/src/utils/BamTools/src/api/SamHeader.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9104978ba7492c77db439b3c037984c5874051d1 --- /dev/null +++ b/src/utils/BamTools/src/api/SamHeader.cpp @@ -0,0 +1,185 @@ +// 
*************************************************************************** +// SamHeader.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header data fields. +// *************************************************************************** + +#include <api/SamConstants.h> +#include <api/SamHeader.h> +#include <api/internal/SamFormatParser_p.h> +#include <api/internal/SamFormatPrinter_p.h> +#include <api/internal/SamHeaderValidator_p.h> +using namespace BamTools; +using namespace BamTools::Internal; +using namespace std; + +/*! \struct BamTools::SamHeader + \brief Represents the SAM-formatted text header that is part of the BAM file header. + + Provides direct read/write access to the SAM header data fields. + + \sa \samSpecURL +*/ +/*! \var SamHeader::Version + \brief corresponds to \@HD VN:\<Version\> + + Required for valid SAM header, if \@HD record is present. +*/ +/*! \var SamHeader::SortOrder + \brief corresponds to \@HD SO:\<SortOrder\> +*/ +/*! \var SamHeader::GroupOrder + \brief corresponds to \@HD GO:\<GroupOrder\> +*/ +/*! \var SamHeader::Sequences + \brief corresponds to \@SQ entries + \sa SamSequence, SamSequenceDictionary +*/ +/*! \var SamHeader::ReadGroups + \brief corresponds to \@RG entries + \sa SamReadGroup, SamReadGroupDictionary +*/ +/*! \var SamHeader::Programs + \brief corresponds to \@PG entries + \sa SamProgram, SamProgramChain +*/ +/*! \var SamHeader::Comments + \brief corresponds to \@CO entries +*/ + +/*!
\fn SamHeader::SamHeader(const std::string& headerText = "") + \brief constructor + + Initializes SortOrder to Constants::SAM_HD_SORTORDER_UNKNOWN, then parses \a headerText into the data fields. + + \param headerText SAM-formatted header text to parse +*/ +SamHeader::SamHeader(const std::string& headerText) + : Version("") + , SortOrder(Constants::SAM_HD_SORTORDER_UNKNOWN) + , GroupOrder("") +{ + SamFormatParser parser(*this); + parser.Parse(headerText); +} + +/*! \fn SamHeader::SamHeader(const SamHeader& other) + \brief copy constructor + + Copies every data field from \a other, including the \@CO comment lines. + + \param other header to copy from +*/ +SamHeader::SamHeader(const SamHeader& other) + : Version(other.Version) + , SortOrder(other.SortOrder) + , GroupOrder(other.GroupOrder) + , Sequences(other.Sequences) + , ReadGroups(other.ReadGroups) + , Programs(other.Programs) + , Comments(other.Comments) +{ } + +/*! \fn SamHeader::~SamHeader(void) + \brief destructor +*/ +SamHeader::~SamHeader(void) { } + +/*! \fn void SamHeader::Clear(void) + \brief Clears all header contents. +*/ +void SamHeader::Clear(void) { + Version.clear(); + SortOrder.clear(); + GroupOrder.clear(); + Sequences.Clear(); + ReadGroups.Clear(); + Programs.Clear(); + Comments.clear(); +} + +/*! \fn bool SamHeader::HasVersion(void) const + \brief Returns \c true if header contains \@HD VN:\<Version\> +*/ +bool SamHeader::HasVersion(void) const { + return (!Version.empty()); +} + +/*! \fn bool SamHeader::HasSortOrder(void) const + \brief Returns \c true if header contains \@HD SO:\<SortOrder\> +*/ +bool SamHeader::HasSortOrder(void) const { + return (!SortOrder.empty()); +} + +/*! \fn bool SamHeader::HasGroupOrder(void) const + \brief Returns \c true if header contains \@HD GO:\<GroupOrder\> +*/ +bool SamHeader::HasGroupOrder(void) const { + return (!GroupOrder.empty()); +} + +/*! \fn bool SamHeader::HasSequences(void) const + \brief Returns \c true if header contains any \@SQ entries +*/ +bool SamHeader::HasSequences(void) const { + return (!Sequences.IsEmpty()); +} + +/*! \fn bool SamHeader::HasReadGroups(void) const + \brief Returns \c true if header contains any \@RG entries +*/ +bool SamHeader::HasReadGroups(void) const { + return (!ReadGroups.IsEmpty()); +} + +/*!
\fn bool SamHeader::HasPrograms(void) const + \brief Returns \c true if header contains any \@PG entries +*/ +bool SamHeader::HasPrograms(void) const { + return (!Programs.IsEmpty()); +} + +/*! \fn bool SamHeader::HasComments(void) const + \brief Returns \c true if header contains any \@CO entries +*/ +bool SamHeader::HasComments(void) const { + return (!Comments.empty()); +} + +/*! \fn bool SamHeader::IsValid(bool verbose = false) const + \brief Checks header contents for required data and proper formatting. + \param verbose If set to true, validation errors & warnings will be printed to stderr. + Otherwise, output is suppressed and only validation check occurs. + \return \c true if SAM header is well-formed +*/ +bool SamHeader::IsValid(bool verbose) const { + SamHeaderValidator validator(*this); + return validator.Validate(verbose); +} + +/*! \fn void SamHeader::SetHeaderText(const std::string& headerText) + \brief Replaces header contents with \a headerText. + \param headerText SAM formatted-text that will be parsed into data fields +*/ +void SamHeader::SetHeaderText(const std::string& headerText) { + + // clear prior data + Clear(); + + // parse header text into data + SamFormatParser parser(*this); + parser.Parse(headerText); +} + +/*! \fn std::string SamHeader::ToString(void) const + \brief Converts data fields to SAM-formatted text. + + Applies any local modifications made since creating this object or calling SetHeaderText(). 
+ + \return SAM-formatted header text +*/ +string SamHeader::ToString(void) const { + SamFormatPrinter printer(*this); + return printer.ToString(); +} diff --git a/src/utils/BamTools/src/api/SamHeader.h b/src/utils/BamTools/src/api/SamHeader.h new file mode 100644 index 0000000000000000000000000000000000000000..5c7a1019120f74981ad8ee3e73559d8247b569a9 --- /dev/null +++ b/src/utils/BamTools/src/api/SamHeader.h @@ -0,0 +1,69 @@ +// *************************************************************************** +// SamHeader.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header data fields. +// *************************************************************************** + +#ifndef SAM_HEADER_H +#define SAM_HEADER_H + +#include <api/api_global.h> +#include <api/SamProgramChain.h> +#include <api/SamReadGroupDictionary.h> +#include <api/SamSequenceDictionary.h> +#include <string> +#include <vector> + +namespace BamTools { + +struct API_EXPORT SamHeader { + + // ctor & dtor + SamHeader(const std::string& headerText = ""); + SamHeader(const SamHeader& other); + ~SamHeader(void); + + // query/modify entire SamHeader + void Clear(void); // clears all header contents + bool IsValid(bool verbose = false) const; // returns true if SAM header is well-formed + void SetHeaderText(const std::string& headerText); // replaces data fields with contents of SAM-formatted text + std::string ToString(void) const; // returns the printable, SAM-formatted header text + + // convenience query methods + bool HasVersion(void) const; // returns true if header contains format version entry + bool HasSortOrder(void) const; // returns true if header contains sort order entry + bool HasGroupOrder(void) 
const; // returns true if header contains group order entry + bool HasSequences(void) const; // returns true if header contains any sequence entries + bool HasReadGroups(void) const; // returns true if header contains any read group entries + bool HasPrograms(void) const; // returns true if header contains any program record entries + bool HasComments(void) const; // returns true if header contains comments + + // -------------- + // data members + // -------------- + + // header metadata (@HD line) + std::string Version; // VN:<Version> *Required for valid SAM header, if @HD record is present* + std::string SortOrder; // SO:<SortOrder> + std::string GroupOrder; // GO:<GroupOrder> + + // header sequences (@SQ entries) + SamSequenceDictionary Sequences; + + // header read groups (@RG entries) + SamReadGroupDictionary ReadGroups; + + // header program data (@PG entries) + SamProgramChain Programs; + + // header comments (@CO entries) + std::vector<std::string> Comments; +}; + +} // namespace BamTools + +#endif // SAM_HEADER_H diff --git a/src/utils/BamTools/src/api/SamHeader.o b/src/utils/BamTools/src/api/SamHeader.o new file mode 100644 index 0000000000000000000000000000000000000000..10e49204ad057a11f159aec86122566f75b9c651 Binary files /dev/null and b/src/utils/BamTools/src/api/SamHeader.o differ diff --git a/src/utils/BamTools/src/api/SamProgram.cpp b/src/utils/BamTools/src/api/SamProgram.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b89a10bf0e1a1cf31ae2d88c0424e79446523262 --- /dev/null +++ b/src/utils/BamTools/src/api/SamProgram.cpp @@ -0,0 +1,140 @@ +// *************************************************************************** +// SamProgram.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#include <api/SamProgram.h> +using namespace BamTools; +using namespace std; + +/*! \struct BamTools::SamProgram + \brief Represents a SAM program record. + + Provides direct read/write access to the SAM header program records. + + \sa \samSpecURL +*/ +/*! \var SamProgram::CommandLine + \brief corresponds to \@PG CL:\<CommandLine\> +*/ +/*! \var SamProgram::ID + \brief corresponds to \@PG ID:\<ID\> + + Required for valid SAM header. +*/ +/*! \var SamProgram::Name + \brief corresponds to \@PG PN:\<Name\> +*/ +/*! \var SamProgram::PreviousProgramID + \brief corresponds to \@PG PP:\<PreviousProgramID\> +*/ +/*! \var SamProgram::Version + \brief corresponds to \@PG VN:\<Version\> +*/ +/*! \var SamProgram::NextProgramID + \internal + Holds ID of the "next" program record in a SamProgramChain +*/ + +/*! \fn SamProgram::SamProgram(void) + \brief default constructor +*/ +SamProgram::SamProgram(void) + : CommandLine("") + , ID("") + , Name("") + , PreviousProgramID("") + , Version("") + , NextProgramID("") +{ } + +/*! \fn SamProgram::SamProgram(const std::string& id) + \brief constructs program record with \a id + + \param id desired program record ID +*/ +SamProgram::SamProgram(const std::string& id) + : CommandLine("") + , ID(id) + , Name("") + , PreviousProgramID("") + , Version("") + , NextProgramID("") +{ } + +/*! 
\fn SamProgram::SamProgram(const SamProgram& other) + \brief copy constructor +*/ +SamProgram::SamProgram(const SamProgram& other) + : CommandLine(other.CommandLine) + , ID(other.ID) + , Name(other.Name) + , PreviousProgramID(other.PreviousProgramID) + , Version(other.Version) + , NextProgramID(other.NextProgramID) +{ } + +/*! \fn SamProgram::~SamProgram(void) + \brief destructor +*/ +SamProgram::~SamProgram(void) { } + +/*! \fn void SamProgram::Clear(void) + \brief Clears all data fields. +*/ +void SamProgram::Clear(void) { + CommandLine.clear(); + ID.clear(); + Name.clear(); + PreviousProgramID.clear(); + Version.clear(); + NextProgramID.clear(); +} + +/*! \fn bool SamProgram::HasCommandLine(void) const + \brief Returns \c true if program record contains \@PG: CL:\<CommandLine\> +*/ +bool SamProgram::HasCommandLine(void) const { + return (!CommandLine.empty()); +} + +/*! \fn bool SamProgram::HasID(void) const + \brief Returns \c true if program record contains \@PG: ID:\<ID\> +*/ +bool SamProgram::HasID(void) const { + return (!ID.empty()); +} + +/*! \fn bool SamProgram::HasName(void) const + \brief Returns \c true if program record contains \@PG: PN:\<Name\> +*/ +bool SamProgram::HasName(void) const { + return (!Name.empty()); +} + +/*! \fn bool SamProgram::HasNextProgramID(void) const + \internal + \return true if program has a "next" record in a SamProgramChain +*/ +bool SamProgram::HasNextProgramID(void) const { + return (!NextProgramID.empty()); +} + +/*! \fn bool SamProgram::HasPreviousProgramID(void) const + \brief Returns \c true if program record contains \@PG: PP:\<PreviousProgramID\> +*/ +bool SamProgram::HasPreviousProgramID(void) const { + return (!PreviousProgramID.empty()); +} + +/*! 
\fn bool SamProgram::HasVersion(void) const + \brief Returns \c true if program record contains \@PG: VN:\<Version\> +*/ +bool SamProgram::HasVersion(void) const { + return (!Version.empty()); +} diff --git a/src/utils/BamTools/src/api/SamProgram.h b/src/utils/BamTools/src/api/SamProgram.h new file mode 100644 index 0000000000000000000000000000000000000000..3c89059bdaedeeaf0b287b9c4fd3a2aae794002e --- /dev/null +++ b/src/utils/BamTools/src/api/SamProgram.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// SamProgram.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM header program records. +// *************************************************************************** + +#ifndef SAM_PROGRAM_H +#define SAM_PROGRAM_H + +#include "api/api_global.h" +#include <string> + +namespace BamTools { + +class SamProgramChain; + +struct API_EXPORT SamProgram { + + // ctor & dtor + SamProgram(void); + SamProgram(const std::string& id); + SamProgram(const SamProgram& other); + ~SamProgram(void); + + // query/modify entire program record + void Clear(void); // clears all data fields + + // convenience query methods + bool HasCommandLine(void) const; // returns true if program record has a command line entry + bool HasID(void) const; // returns true if program record has an ID + bool HasName(void) const; // returns true if program record has a name + bool HasPreviousProgramID(void) const; // returns true if program record has a 'previous program ID' + bool HasVersion(void) const; // returns true if program record has a version + + // data members + std::string CommandLine; // CL:<CommandLine> + std::string ID; // ID:<ID> *Required for 
valid SAM header* + std::string Name; // PN:<Name> + std::string PreviousProgramID; // PP:<PreviousProgramID> + std::string Version; // VN:<Version> + + // internal (non-standard) methods & fields + private: + bool HasNextProgramID(void) const; + std::string NextProgramID; + friend class BamTools::SamProgramChain; +}; + +/*! \fn bool operator==(const SamProgram& lhs, const SamProgram& rhs) + \brief tests equality by comparing program IDs +*/ +API_EXPORT inline bool operator==(const SamProgram& lhs, const SamProgram& rhs) { + return lhs.ID == rhs.ID; +} + +} // namespace BamTools + +#endif // SAM_PROGRAM_H diff --git a/src/utils/BamTools/src/api/SamProgram.o b/src/utils/BamTools/src/api/SamProgram.o new file mode 100644 index 0000000000000000000000000000000000000000..b7d6c41bd895efdfc342560e2d97daca9cded631 Binary files /dev/null and b/src/utils/BamTools/src/api/SamProgram.o differ diff --git a/src/utils/BamTools/src/api/SamProgramChain.cpp b/src/utils/BamTools/src/api/SamProgramChain.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66b7f071abdacd8a99d4e8863666d8e346c9c2d2 --- /dev/null +++ b/src/utils/BamTools/src/api/SamProgramChain.cpp @@ -0,0 +1,352 @@ +// *************************************************************************** +// SamProgramChain.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a SamProgram record "chain" +// *************************************************************************** + +#include <api/SamProgramChain.h> +using namespace BamTools; + +#include <algorithm> +#include <iostream> +#include <cstdlib> +using namespace std; + +/*! \class BamTools::SamProgramChain + \brief Sorted container "chain" of SamProgram records. 
+ + Provides methods for operating on a collection of SamProgram records. + + N.B. - Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. +*/ + +/*! \fn SamProgramChain::SamProgramChain(void) + \brief constructor +*/ +SamProgramChain::SamProgramChain(void) { } + +/*! \fn SamProgramChain::SamProgramChain(const SamProgramChain& other) + \brief copy constructor +*/ +SamProgramChain::SamProgramChain(const SamProgramChain& other) + : m_data(other.m_data) +{ } + +/*! \fn SamProgramChain::~SamProgramChain(void) + \brief destructor +*/ +SamProgramChain::~SamProgramChain(void) { } + +/*! \fn void SamProgramChain::Add(SamProgram& program) + \brief Appends a program to program chain. + + Duplicate entries are silently discarded. + + N.B. - Underlying container is *NOT* ordered by linkage, but by order of + appearance in SamHeader and subsequent Add() calls. Using the current + iterators will not allow you to step through the header's program history. + Instead use First()/Last() to access oldest/newest records, respectively. + + \param program entry to be appended +*/ +void SamProgramChain::Add(SamProgram& program) { + + // ignore duplicated records + if ( Contains(program) ) + return; + + // if other programs already in chain, try to find the "next" record + // tries to match another record's PPID with @program's ID + if ( !IsEmpty() ) + program.NextProgramID = NextIdFor(program.ID); + + // store program record + m_data.push_back(program); +} + +/*! \fn void SamProgramChain::Add(const std::vector<SamProgram>& programs) + \brief Appends a batch of programs to the end of the chain. + + This is an overloaded function. 
+ + \param programs batch of program records to append + \sa Add() +*/ +void SamProgramChain::Add(std::vector<SamProgram>& programs) { + vector<SamProgram>::iterator pgIter = programs.begin(); + vector<SamProgram>::iterator pgEnd = programs.end(); + for ( ; pgIter != pgEnd; ++pgIter ) + Add(*pgIter); +} + +/*! \fn SamProgramIterator SamProgramChain::Begin(void) + \return an STL iterator pointing to the first (oldest) program record + \sa ConstBegin(), End(), First() +*/ +SamProgramIterator SamProgramChain::Begin(void) { + return m_data.begin(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::Begin(void) const + \return an STL const_iterator pointing to the first (oldest) program record + + This is an overloaded function. + + \sa ConstBegin(), End(), First() +*/ +SamProgramConstIterator SamProgramChain::Begin(void) const { + return m_data.begin(); +} + +/*! \fn void SamProgramChain::Clear(void) + \brief Clears all program records. +*/ +void SamProgramChain::Clear(void) { + m_data.clear(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstBegin(void) const + \return an STL const_iterator pointing to the first (oldest) program record + \sa Begin(), ConstEnd(), First() +*/ +SamProgramConstIterator SamProgramChain::ConstBegin(void) const { + return m_data.begin(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::ConstEnd(void) const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + \sa ConstBegin(), End(), Last() +*/ +SamProgramConstIterator SamProgramChain::ConstEnd(void) const { + return m_data.end(); +} + +/*! \fn bool SamProgramChain::Contains(const SamProgram& program) const + \brief Returns true if chains has this program record (matching on ID). + + This is an overloaded function. 
+ + \param program SamProgram to search for + \return \c true if chain contains program (matching on ID) +*/ +bool SamProgramChain::Contains(const SamProgram& program) const { + return Contains(program.ID); +} + +/*! \fn bool SamProgramChain::Contains(const std::string& programId) const + \brief Returns true if chains has a program record with this ID + \param programId search for program matching this ID + \return \c true if chain contains a program record with this ID +*/ +bool SamProgramChain::Contains(const std::string& programId) const { + return ( IndexOf(programId) != (int)m_data.size() ); +} + +/*! \fn SamProgramIterator SamProgramChain::End(void) + \return an STL iterator pointing to the imaginary entry after the last (newest) program record + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramIterator SamProgramChain::End(void) { + return m_data.end(); +} + +/*! \fn SamProgramConstIterator SamProgramChain::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last (newest) program record + + This is an overloaded function. + + \sa Begin(), ConstEnd(), Last() +*/ +SamProgramConstIterator SamProgramChain::End(void) const { + return m_data.end(); +} + +/*! \fn SamProgram& SamProgramChain::First(void) + \brief Fetches first (oldest) record in the chain. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. 
+ + \return a modifiable reference to the first (oldest) program entry + \sa Begin(), Last() +*/ +SamProgram& SamProgramChain::First(void) { + + // find first record in container that has no PreviousProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for ( ; iter != end; ++iter ) { + SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not find any record without a PP tag" << endl; + exit(1); +} + +/*! \fn const SamProgram& SamProgramChain::First(void) const + \brief Fetches first (oldest) record in the chain. + + This is an overloaded function. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a read-only reference to the first (oldest) program entry + \sa Begin(), ConstBegin(), Last() +*/ +const SamProgram& SamProgramChain::First(void) const { + + // find first record in container that has no PreviousProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not find any record without a PP tag" << endl; + exit(1); +} + +/*! \fn int SamProgramChain::IndexOf(const std::string& programId) const + \internal + \return index of program record if found. + Otherwise, returns vector::size() (invalid index). +*/ +int SamProgramChain::IndexOf(const std::string& programId) const { + SamProgramConstIterator begin = ConstBegin(); + SamProgramConstIterator iter = begin; + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( current.ID == programId ) + break; + } + return distance( begin, iter ); +} + +/*! 
\fn bool SamProgramChain::IsEmpty(void) const + \brief Returns \c true if chain contains no records + \sa Size() +*/ +bool SamProgramChain::IsEmpty(void) const { + return m_data.empty(); +} + +/*! \fn SamProgram& SamProgramChain::Last(void) + \brief Fetches last (newest) record in the chain. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a modifiable reference to the last (newest) program entry + \sa End(), First() +*/ +SamProgram& SamProgramChain::Last(void) { + // find first record in container that has no NextProgramID entry + SamProgramIterator iter = Begin(); + SamProgramIterator end = End(); + for ( ; iter != end; ++iter ) { + SamProgram& current = (*iter); + if ( !current.HasNextProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not determine last record" << endl; + exit(1); +} + +/*! \fn const SamProgram& SamProgramChain::Last(void) const + \brief Fetches last (newest) record in the chain. + + This is an overloaded function. + + N.B. - This function will fail if the chain is empty. If this is possible, + check the result of IsEmpty() before calling this function. + + \return a read-only reference to the last (newest) program entry + \sa End(), ConstEnd(), First() +*/ +const SamProgram& SamProgramChain::Last(void) const { + // find first record in container that has no NextProgramID entry + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasNextProgramID() ) + return current; + } + + // otherwise error + cerr << "SamProgramChain ERROR - could not determine last record" << endl; + exit(1); +} + +/*! \fn const std::string SamProgramChain::NextIdFor(const std::string& programId) const + \internal + \return ID of program record, whose PreviousProgramID matches \a programId. 
+ Otherwise, returns empty string if none found. +*/ +const std::string SamProgramChain::NextIdFor(const std::string& programId) const { + + // find first record in container whose PreviousProgramID matches @programId + SamProgramConstIterator iter = ConstBegin(); + SamProgramConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamProgram& current = (*iter); + if ( !current.HasPreviousProgramID() && + current.PreviousProgramID == programId + ) + { + return current.ID; + } + } + + // none found + return string(); +} + +/*! \fn int SamProgramChain::Size(void) const + \brief Returns number of program records in the chain. + \sa IsEmpty() +*/ +int SamProgramChain::Size(void) const { + return m_data.size(); +} + +/*! \fn SamProgram& SamProgramChain::operator[](const std::string& programId) + \brief Retrieves the modifiable SamProgram record that matches \a programId. + + NOTE - If the chain contains no read group matching this ID, this function will + print an error and terminate. 
+ + \param programId ID of program record to retrieve + \return a modifiable reference to the SamProgram associated with the ID +*/ +SamProgram& SamProgramChain::operator[](const std::string& programId) { + + // look up program record matching this ID + int index = IndexOf(programId); + + // if record not found + if ( index == (int)m_data.size() ) { + cerr << "SamProgramChain ERROR - unknown programId: " << programId << endl; + exit(1); + } + + // otherwise return program record at index + return m_data.at(index); +} diff --git a/src/utils/BamTools/src/api/SamProgramChain.h b/src/utils/BamTools/src/api/SamProgramChain.h new file mode 100644 index 0000000000000000000000000000000000000000..4cb16fc337fc2b834dded709804f8879f15968f2 --- /dev/null +++ b/src/utils/BamTools/src/api/SamProgramChain.h @@ -0,0 +1,86 @@ +// *************************************************************************** +// SamProgramChain.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a SamProgram record "chain" +// *************************************************************************** + +#ifndef SAM_PROGRAMCHAIN_H +#define SAM_PROGRAMCHAIN_H + +#include <api/api_global.h> +#include <api/SamProgram.h> +#include <string> +#include <vector> + +namespace BamTools { + +// chain is *NOT* sorted in any order +// use First()/Last() to retrieve oldest/newest programs, respectively +typedef std::vector<SamProgram> SamProgramContainer; +typedef SamProgramContainer::iterator SamProgramIterator; +typedef SamProgramContainer::const_iterator SamProgramConstIterator; + +class API_EXPORT SamProgramChain { + + // ctor & dtor + public: + SamProgramChain(void); + SamProgramChain(const SamProgramChain& other); + ~SamProgramChain(void); + + // query/modify program data + public: + // appends a program record to the chain + void Add(SamProgram& program); + void Add(std::vector<SamProgram>& programs); + + // clears all read group entries + void Clear(void); + + // returns true if chain contains this program record (matches on ID) + bool Contains(const SamProgram& program) const; + bool Contains(const std::string& programId) const; + + // returns the first (oldest) program in the chain + SamProgram& First(void); + const SamProgram& First(void) const; + + // returns true if chain is empty + bool IsEmpty(void) const; + + // returns last (most recent) program in the chain + SamProgram& Last(void); + const SamProgram& Last(void) const; + + // returns number of program records in the chain + int Size(void) const; + + // retrieves a modifiable reference to the SamProgram object associated with this ID + SamProgram& operator[](const std::string& programId); + + // retrieve STL-compatible iterators + public: + SamProgramIterator 
Begin(void); // returns iterator to begin() + SamProgramConstIterator Begin(void) const; // returns const_iterator to begin() + SamProgramConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamProgramIterator End(void); // returns iterator to end() + SamProgramConstIterator End(void) const; // returns const_iterator to end() + SamProgramConstIterator ConstEnd(void) const; // returns const_iterator to end() + + // internal methods + private: + int IndexOf(const std::string& programId) const; + const std::string NextIdFor(const std::string& programId) const; + + // data members + private: + SamProgramContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_PROGRAMCHAIN_H diff --git a/src/utils/BamTools/src/api/SamProgramChain.o b/src/utils/BamTools/src/api/SamProgramChain.o new file mode 100644 index 0000000000000000000000000000000000000000..ddda8067930d6db4f4bf7e095d82278b12ab1146 Binary files /dev/null and b/src/utils/BamTools/src/api/SamProgramChain.o differ diff --git a/src/utils/BamTools/src/api/SamReadGroup.cpp b/src/utils/BamTools/src/api/SamReadGroup.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2ba75f16b46b7e7a548581a3aa04144895675097 --- /dev/null +++ b/src/utils/BamTools/src/api/SamReadGroup.cpp @@ -0,0 +1,222 @@ +// *************************************************************************** +// SamReadGroup.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM read group data fields. +// *************************************************************************** + +#include <api/SamReadGroup.h> +using namespace BamTools; +using namespace std; + +/*! 
\struct BamTools::SamReadGroup + \brief Represents a SAM read group entry. + + Provides direct read/write access to the SAM read group data fields. + + \sa \samSpecURL +*/ +/*! \var SamReadGroup::Description + \brief corresponds to \@RG DS:\<Description\> +*/ +/*! \var SamReadGroup::FlowOrder + \brief corresponds to \@RG FO:\<FlowOrder\> +*/ +/*! \var SamReadGroup::ID + \brief corresponds to \@RG ID:\<ID\> + + Required for valid SAM header. +*/ +/*! \var SamReadGroup::KeySequence + \brief corresponds to \@RG KS:\<KeySequence\> +*/ +/*! \var SamReadGroup::Library + \brief corresponds to \@RG LB:\<Library\> +*/ +/*! \var SamReadGroup::PlatformUnit + \brief corresponds to \@RG PU:\<PlatformUnit\> +*/ +/*! \var SamReadGroup::PredictedInsertSize + \brief corresponds to \@RG PI:\<PredictedInsertSize\> +*/ +/*! \var SamReadGroup::ProductionDate + \brief corresponds to \@RG DT:\<ProductionDate\> +*/ +/*! \var SamReadGroup::Program + \brief corresponds to \@RG PG:\<Program\> +*/ +/*! \var SamReadGroup::Sample + \brief corresponds to \@RG SM:\<Sample\> +*/ +/*! \var SamReadGroup::SequencingCenter + \brief corresponds to \@RG CN:\<SequencingCenter\> +*/ +/*! \var SamReadGroup::SequencingTechnology + \brief corresponds to \@RG PL:\<SequencingTechnology\> +*/ + +/*! \fn SamReadGroup::SamReadGroup(void) + \brief default constructor +*/ +SamReadGroup::SamReadGroup(void) + : Description("") + , FlowOrder("") + , ID("") + , KeySequence("") + , Library("") + , PlatformUnit("") + , PredictedInsertSize("") + , ProductionDate("") + , Program("") + , Sample("") + , SequencingCenter("") + , SequencingTechnology("") +{ } + +/*! 
\fn SamReadGroup::SamReadGroup(const std::string& id) + \brief constructs read group with \a id + + \param id desired read group ID +*/ +SamReadGroup::SamReadGroup(const std::string& id) + : Description("") + , FlowOrder("") + , ID(id) + , KeySequence("") + , Library("") + , PlatformUnit("") + , PredictedInsertSize("") + , ProductionDate("") + , Program("") + , Sample("") + , SequencingCenter("") + , SequencingTechnology("") +{ } + +/*! \fn SamReadGroup::SamReadGroup(const SamReadGroup& other) + \brief copy constructor +*/ +SamReadGroup::SamReadGroup(const SamReadGroup& other) + : Description(other.Description) + , FlowOrder(other.FlowOrder) + , ID(other.ID) + , KeySequence(other.KeySequence) + , Library(other.Library) + , PlatformUnit(other.PlatformUnit) + , PredictedInsertSize(other.PredictedInsertSize) + , ProductionDate(other.ProductionDate) + , Program(other.Program) + , Sample(other.Sample) + , SequencingCenter(other.SequencingCenter) + , SequencingTechnology(other.SequencingTechnology) +{ } + +/*! \fn SamReadGroup::~SamReadGroup(void) + \brief destructor +*/ +SamReadGroup::~SamReadGroup(void) { } + +/*! \fn void SamReadGroup::Clear(void) + \brief Clears all data fields. +*/ +void SamReadGroup::Clear(void) { + Description.clear(); + FlowOrder.clear(); + ID.clear(); + KeySequence.clear(); + Library.clear(); + PlatformUnit.clear(); + PredictedInsertSize.clear(); + ProductionDate.clear(); + Program.clear(); + Sample.clear(); + SequencingCenter.clear(); + SequencingTechnology.clear(); +} + +/*! \fn bool SamReadGroup::HasDescription(void) const + \brief Returns \c true if read group contains \@RG DS:\<Description\> +*/ +bool SamReadGroup::HasDescription(void) const { + return (!Description.empty()); +} + +/*! \fn bool SamReadGroup::HasFlowOrder(void) const + \brief Returns \c true if read group contains \@RG FO:\<FlowOrder\> +*/ +bool SamReadGroup::HasFlowOrder(void) const { + return (!FlowOrder.empty()); +} + +/*! 
\fn bool SamReadGroup::HasID(void) const + \brief Returns \c true if read group contains \@RG: ID:\<ID\> +*/ +bool SamReadGroup::HasID(void) const { + return (!ID.empty()); +} + +/*! \fn bool SamReadGroup::HasKeySequence(void) const + \brief Returns \c true if read group contains \@RG KS:\<KeySequence\> +*/ +bool SamReadGroup::HasKeySequence(void) const { + return (!KeySequence.empty()); +} + +/*! \fn bool SamReadGroup::HasLibrary(void) const + \brief Returns \c true if read group contains \@RG LB:\<Library\> +*/ +bool SamReadGroup::HasLibrary(void) const { + return (!Library.empty()); +} + +/*! \fn bool SamReadGroup::HasPlatformUnit(void) const + \brief Returns \c true if read group contains \@RG PU:\<PlatformUnit\> +*/ +bool SamReadGroup::HasPlatformUnit(void) const { + return (!PlatformUnit.empty()); +} + +/*! \fn bool SamReadGroup::HasPredictedInsertSize(void) const + \brief Returns \c true if read group contains \@RG PI:\<PredictedInsertSize\> +*/ +bool SamReadGroup::HasPredictedInsertSize(void) const { + return (!PredictedInsertSize.empty()); +} + +/*! \fn bool SamReadGroup::HasProductionDate(void) const + \brief Returns \c true if read group contains \@RG DT:\<ProductionDate\> +*/ +bool SamReadGroup::HasProductionDate(void) const { + return (!ProductionDate.empty()); +} + +/*! \fn bool SamReadGroup::HasProgram(void) const + \brief Returns \c true if read group contains \@RG PG:\<Program\> +*/ +bool SamReadGroup::HasProgram(void) const { + return (!Program.empty()); +} + +/*! \fn bool SamReadGroup::HasSample(void) const + \brief Returns \c true if read group contains \@RG SM:\<Sample\> +*/ +bool SamReadGroup::HasSample(void) const { + return (!Sample.empty()); +} + +/*! \fn bool SamReadGroup::HasSequencingCenter(void) const + \brief Returns \c true if read group contains \@RG CN:\<SequencingCenter\> +*/ +bool SamReadGroup::HasSequencingCenter(void) const { + return (!SequencingCenter.empty()); +} + +/*! 
\fn bool SamReadGroup::HasSequencingTechnology(void) const + \brief Returns \c true if read group contains \@RG PL:\<SequencingTechnology\> +*/ +bool SamReadGroup::HasSequencingTechnology(void) const { + return (!SequencingTechnology.empty()); +} diff --git a/src/utils/BamTools/src/api/SamReadGroup.h b/src/utils/BamTools/src/api/SamReadGroup.h new file mode 100644 index 0000000000000000000000000000000000000000..b203d3cdb16927c5363272c9810b74b84bd2ecba --- /dev/null +++ b/src/utils/BamTools/src/api/SamReadGroup.h @@ -0,0 +1,69 @@ +// *************************************************************************** +// SamReadGroup.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM read group data fields. 
+// *************************************************************************** + +#ifndef SAM_READGROUP_H +#define SAM_READGROUP_H + +#include "api/api_global.h" +#include <string> + +namespace BamTools { + +struct API_EXPORT SamReadGroup { + + // ctor & dtor + SamReadGroup(void); + SamReadGroup(const std::string& id); + SamReadGroup(const SamReadGroup& other); + ~SamReadGroup(void); + + // query/modify entire read group + void Clear(void); // clears all data fields + + // convenience query methods + bool HasDescription(void) const; // returns true if read group has a description + bool HasFlowOrder(void) const; // returns true if read group has a flow order entry + bool HasID(void) const; // returns true if read group has a group ID + bool HasKeySequence(void) const; // returns true if read group has a key sequence + bool HasLibrary(void) const; // returns true if read group has a library name + bool HasPlatformUnit(void) const; // returns true if read group has a platform unit ID + bool HasPredictedInsertSize(void) const; // returns true if read group has a predicted insert size + bool HasProductionDate(void) const; // returns true if read group has a production date + bool HasProgram(void) const; // returns true if read group has a program entry + bool HasSample(void) const; // returns true if read group has a sample name + bool HasSequencingCenter(void) const; // returns true if read group has a sequencing center ID + bool HasSequencingTechnology(void) const; // returns true if read group has a sequencing technology ID + + + // data fields + std::string Description; // DS:<Description> + std::string FlowOrder; // FO:<FlowOrder> + std::string ID; // ID:<ID> *Required for valid SAM header* + std::string KeySequence; // KS:<KeySequence> + std::string Library; // LB:<Library> + std::string PlatformUnit; // PU:<PlatformUnit> + std::string PredictedInsertSize; // PI:<PredictedInsertSize> + std::string ProductionDate; // DT:<ProductionDate> + std::string Program; // 
PG:<Program> + std::string Sample; // SM:<Sample> + std::string SequencingCenter; // CN:<SequencingCenter> + std::string SequencingTechnology; // PL:<SequencingTechnology> +}; + +/*! \fn bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) + \brief tests equality by comparing read group IDs +*/ +API_EXPORT inline bool operator==(const SamReadGroup& lhs, const SamReadGroup& rhs) { + return lhs.ID == rhs.ID; +} + +} // namespace BamTools + +#endif // SAM_READGROUP_H diff --git a/src/utils/BamTools/src/api/SamReadGroup.o b/src/utils/BamTools/src/api/SamReadGroup.o new file mode 100644 index 0000000000000000000000000000000000000000..644aff69b9462791a36b3276be9da5f3eb5ad553 Binary files /dev/null and b/src/utils/BamTools/src/api/SamReadGroup.o differ diff --git a/src/utils/BamTools/src/api/SamReadGroupDictionary.cpp b/src/utils/BamTools/src/api/SamReadGroupDictionary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..69903ff8051f2f83da5c3a23d4e8725f2694939b --- /dev/null +++ b/src/utils/BamTools/src/api/SamReadGroupDictionary.cpp @@ -0,0 +1,290 @@ +// *************************************************************************** +// SamReadGroupDictionary.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamReadGroup entries. +// *************************************************************************** + +#include <api/SamReadGroupDictionary.h> +using namespace BamTools; + +#include <algorithm> +#include <iostream> +using namespace std; + +/*! \class BamTools::SamReadGroupDictionary + \brief Container of SamReadGroup entries. + + Provides methods for operating on a collection of SamReadGroup entries. +*/ + +/*! 
\fn SamReadGroupDictionary::SamReadGroupDictionary(void) + \brief constructor +*/ +SamReadGroupDictionary::SamReadGroupDictionary(void) { } + +/*! \fn SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other) + \brief copy constructor +*/ +SamReadGroupDictionary::SamReadGroupDictionary(const SamReadGroupDictionary& other) + : m_data(other.m_data) +{ } + +/*! \fn SamReadGroupDictionary::~SamReadGroupDictionary(void) + \brief destructor +*/ +SamReadGroupDictionary::~SamReadGroupDictionary(void) { } + +/*! \fn void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) + \brief Adds a read group to the dictionary. + + Duplicate entries are silently discarded. + + \param readGroup entry to be added +*/ +void SamReadGroupDictionary::Add(const SamReadGroup& readGroup) { + + // TODO: report error on attempted duplicate? + + if ( IsEmpty() || !Contains(readGroup) ) + m_data.push_back(readGroup); +} + +/*! \fn void SamReadGroupDictionary::Add(const std::string& readGroupId) + \brief Adds a read group to the dictionary. + + This is an overloaded function. + + \param readGroupId ID of read group to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::string& readGroupId) { + Add( SamReadGroup(readGroupId) ); +} + +/*! \fn void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups) + \brief Adds multiple read groups to the dictionary. + + This is an overloaded function. + + \param readGroups entries to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector<SamReadGroup>& readGroups) { + vector<SamReadGroup>::const_iterator rgIter = readGroups.begin(); + vector<SamReadGroup>::const_iterator rgEnd = readGroups.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Add(*rgIter); +} + +/*! \fn void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds) + \brief Adds multiple read groups to the dictionary. + + This is an overloaded function. 
+ + \param readGroupIds IDs of read groups to be added + \sa Add() +*/ +void SamReadGroupDictionary::Add(const std::vector<std::string>& readGroupIds) { + vector<string>::const_iterator rgIter = readGroupIds.begin(); + vector<string>::const_iterator rgEnd = readGroupIds.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Add(*rgIter); +} + +/*! \fn SamReadGroupIterator SamReadGroupDictionary::Begin(void) + \return an STL iterator pointing to the first read group + \sa ConstBegin(), End() +*/ +SamReadGroupIterator SamReadGroupDictionary::Begin(void) { + return m_data.begin(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const + \return an STL const_iterator pointing to the first read group + + This is an overloaded function. + + \sa ConstBegin(), End() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::Begin(void) const { + return m_data.begin(); +} + +/*! \fn void SamReadGroupDictionary::Clear(void) + \brief Clears all read group entries. +*/ +void SamReadGroupDictionary::Clear(void) { + m_data.clear(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const + \return an STL const_iterator pointing to the first read group + \sa Begin(), ConstEnd() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::ConstBegin(void) const { + return m_data.begin(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const + \return an STL const_iterator pointing to the imaginary entry after the last read group + \sa ConstBegin(), End() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::ConstEnd(void) const { + return m_data.end(); +} + +/*! \fn bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const + \brief Returns true if dictionary contains read group. 
+ \param readGroupId search for read group matching this ID + \return \c true if dictionary contains a read group with this ID +*/ +bool SamReadGroupDictionary::Contains(const std::string& readGroupId) const { + return ( IndexOf(readGroupId) != (int)m_data.size() ); +} + +/*! \fn bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const + \brief Returns true if dictionary contains read group (matching on ID). + + This is an overloaded function. + + \param readGroup search for this read group + \return \c true if dictionary contains read group (matching on ID). +*/ +bool SamReadGroupDictionary::Contains(const SamReadGroup& readGroup) const { + return Contains( readGroup.ID ); +} + +/*! \fn SamReadGroupIterator SamReadGroupDictionary::End(void) + \return an STL iterator pointing to the imaginary entry after the last read group + \sa Begin(), ConstEnd() +*/ +SamReadGroupIterator SamReadGroupDictionary::End(void) { + return m_data.end(); +} + +/*! \fn SamReadGroupConstIterator SamReadGroupDictionary::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last read group + + This is an overloaded function. + + \sa Begin(), ConstEnd() +*/ +SamReadGroupConstIterator SamReadGroupDictionary::End(void) const { + return m_data.end(); +} + +/*! \fn int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const + \internal + \return index of read group if found. Otherwise, returns vector::size() (invalid index). +*/ +int SamReadGroupDictionary::IndexOf(const std::string& readGroupId) const { + SamReadGroupConstIterator begin = ConstBegin(); + SamReadGroupConstIterator iter = begin; + SamReadGroupConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamReadGroup& current = (*iter); + if ( current.ID == readGroupId ) + break; + } + return distance( begin, iter ); +} + +/*! 
\fn bool SamReadGroupDictionary::IsEmpty(void) const + \brief Returns \c true if dictionary contains no read groups + \sa Size() +*/ +bool SamReadGroupDictionary::IsEmpty(void) const { + return m_data.empty(); +} + +/*! \fn void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) + \brief Removes read group from dictionary, if found (matching on ID). + + This is an overloaded function. + + \param readGroup read group to remove (matches on ID) +*/ +void SamReadGroupDictionary::Remove(const SamReadGroup& readGroup) { + Remove( readGroup.ID ); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::string& readGroupId) + \brief Removes read group from dictionary, if found. + \param readGroupId ID of read group to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::string& readGroupId) { + if ( Contains(readGroupId) ) + m_data.erase( m_data.begin() + IndexOf(readGroupId) ); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups) + \brief Removes multiple read groups from dictionary (matching on ID). + + This is an overloaded function. + + \param readGroups read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector<SamReadGroup>& readGroups) { + vector<SamReadGroup>::const_iterator rgIter = readGroups.begin(); + vector<SamReadGroup>::const_iterator rgEnd = readGroups.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +/*! \fn void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds) + \brief Removes multiple read groups from dictionary. + + This is an overloaded function. + + \param readGroupIds IDs of the read groups to remove + \sa Remove() +*/ +void SamReadGroupDictionary::Remove(const std::vector<std::string>& readGroupIds) { + vector<string>::const_iterator rgIter = readGroupIds.begin(); + vector<string>::const_iterator rgEnd = readGroupIds.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +/*! 
\fn int SamReadGroupDictionary::Size(void) const + \brief Returns number of read groups in dictionary. + \sa IsEmpty() +*/ +int SamReadGroupDictionary::Size(void) const { + return m_data.size(); +} + +/*! \fn SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) + \brief Retrieves the modifiable SamReadGroup that matches \a readGroupId. + + NOTE - If the dictionary contains no read group matching this ID, this function inserts + a new one with this ID, and returns a reference to it. + + If you want to avoid this insertion behavior, check the result of Contains() before + using this operator. + + \param readGroupId ID of read group to retrieve + \return a modifiable reference to the SamReadGroup associated with the ID +*/ +SamReadGroup& SamReadGroupDictionary::operator[](const std::string& readGroupId) { + + // look up read group ID + int index = IndexOf(readGroupId); + + // if found, return read group at index + if ( index != (int)m_data.size() ) + return m_data[index]; + + // otherwise, append new read group and return reference + else { + SamReadGroup rg(readGroupId); + m_data.push_back(rg); + return m_data.back(); + } +} diff --git a/src/utils/BamTools/src/api/SamReadGroupDictionary.h b/src/utils/BamTools/src/api/SamReadGroupDictionary.h new file mode 100644 index 0000000000000000000000000000000000000000..8ec40e227ba5f020699cd1a0021a2785587e447c --- /dev/null +++ b/src/utils/BamTools/src/api/SamReadGroupDictionary.h @@ -0,0 +1,87 @@ +// *************************************************************************** +// SamReadGroupDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamReadGroup entries. 
+// *************************************************************************** + +#ifndef SAM_READGROUP_DICTIONARY_H +#define SAM_READGROUP_DICTIONARY_H + +#include <api/api_global.h> +#include <api/SamReadGroup.h> +#include <string> +#include <vector> + +namespace BamTools { + +typedef std::vector<SamReadGroup> SamReadGroupContainer; +typedef SamReadGroupContainer::iterator SamReadGroupIterator; +typedef SamReadGroupContainer::const_iterator SamReadGroupConstIterator; + +class API_EXPORT SamReadGroupDictionary { + + // ctor & dtor + public: + SamReadGroupDictionary(void); + SamReadGroupDictionary(const SamReadGroupDictionary& other); + ~SamReadGroupDictionary(void); + + // query/modify read group data + public: + // adds a read group + void Add(const SamReadGroup& readGroup); + void Add(const std::string& readGroupId); + + // adds multiple read groups + void Add(const std::vector<SamReadGroup>& readGroups); + void Add(const std::vector<std::string>& readGroupIds); + + // clears all read group entries + void Clear(void); + + // returns true if dictionary contains this read group + bool Contains(const SamReadGroup& readGroup) const; + bool Contains(const std::string& readGroupId) const; + + // returns true if dictionary is empty + bool IsEmpty(void) const; + + // removes read group, if found + void Remove(const SamReadGroup& readGroup); + void Remove(const std::string& readGroupId); + + // removes multiple read groups + void Remove(const std::vector<SamReadGroup>& readGroups); + void Remove(const std::vector<std::string>& readGroupIds); + + // returns number of read groups in dictionary + int Size(void) const; + + // retrieves a modifiable reference to the SamReadGroup object associated with this ID + SamReadGroup& operator[](const std::string& readGroupId); + + // retrieve STL-compatible iterators + public: + SamReadGroupIterator Begin(void); // returns iterator to begin() + SamReadGroupConstIterator Begin(void) const; // returns const_iterator to begin() + 
SamReadGroupConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamReadGroupIterator End(void); // returns iterator to end() + SamReadGroupConstIterator End(void) const; // returns const_iterator to end() + SamReadGroupConstIterator ConstEnd(void) const; // returns const_iterator to end() + + // internal methods + private: + int IndexOf(const std::string& readGroupId) const; + + // data members + private: + SamReadGroupContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_READGROUP_DICTIONARY_H diff --git a/src/utils/BamTools/src/api/SamReadGroupDictionary.o b/src/utils/BamTools/src/api/SamReadGroupDictionary.o new file mode 100644 index 0000000000000000000000000000000000000000..d30ca9b094b895cb8bbf27e8565b3ab58d1d2e48 Binary files /dev/null and b/src/utils/BamTools/src/api/SamReadGroupDictionary.o differ diff --git a/src/utils/BamTools/src/api/SamSequence.cpp b/src/utils/BamTools/src/api/SamSequence.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0231988dd901300c1937f7abb20775f19607b599 --- /dev/null +++ b/src/utils/BamTools/src/api/SamSequence.cpp @@ -0,0 +1,162 @@ +// *************************************************************************** +// SamSequence.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM sequence data fields. +// *************************************************************************** + +#include <api/SamSequence.h> +#include <sstream> +using namespace BamTools; +using namespace std; + +/*! \struct BamTools::SamSequence + \brief Represents a SAM sequence entry. + + Provides direct read/write access to the SAM sequence data fields. + + \sa \samSpecURL +*/ +/*! 
\var SamSequence::AssemblyID + \brief corresponds to \@SQ AS:\<AssemblyID\> +*/ +/*! \var SamSequence::Checksum + \brief corresponds to \@SQ M5:\<Checksum\> +*/ +/*! \var SamSequence::Length + \brief corresponds to \@SQ LN:\<Length\> + + Required for valid SAM header. +*/ +/*! \var SamSequence::Name + \brief corresponds to \@SQ SN:\<Name\> + + Required for valid SAM header. +*/ +/*! \var SamSequence::Species + \brief corresponds to \@SQ SP:\<Species\> +*/ +/*! \var SamSequence::URI + \brief corresponds to \@SQ UR:\<URI\> +*/ + +/*! \fn SamSequence::SamSequence(void) + \brief default constructor +*/ +SamSequence::SamSequence(void) + : AssemblyID("") + , Checksum("") + , Length("") + , Name("") + , Species("") + , URI("") +{ } + +/*! \fn SamSequence::SamSequence(const std::string& name, const int& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (numeric value) +*/ +SamSequence::SamSequence(const std::string& name, + const int& length) + : AssemblyID("") + , Checksum("") + , Name(name) + , Species("") + , URI("") +{ + stringstream s(""); + s << length; + Length = s.str(); +} + +/*! \fn SamSequence::SamSequence(const std::string& name, const std::string& length) + \brief constructs sequence with \a name and \a length + + \param name desired sequence name + \param length desired sequence length (string value) +*/ +SamSequence::SamSequence(const std::string& name, + const std::string& length) + : AssemblyID("") + , Checksum("") + , Length(length) + , Name(name) + , Species("") + , URI("") +{ } + +/*! \fn SamSequence::SamSequence(const SamSequence& other) + \brief copy constructor +*/ +SamSequence::SamSequence(const SamSequence& other) + : AssemblyID(other.AssemblyID) + , Checksum(other.Checksum) + , Length(other.Length) + , Name(other.Name) + , Species(other.Species) + , URI(other.URI) +{ } + +/*! 
\fn SamSequence::~SamSequence(void) + \brief destructor +*/ +SamSequence::~SamSequence(void) { } + +/*! \fn void SamSequence::Clear(void) + \brief Clears all data fields. +*/ +void SamSequence::Clear(void) { + AssemblyID.clear(); + Checksum.clear(); + Length.clear(); + Name.clear(); + Species.clear(); + URI.clear(); +} + +/*! \fn bool SamSequence::HasAssemblyID(void) const + \brief Returns \c true if sequence contains \@SQ AS:\<AssemblyID\> +*/ +bool SamSequence::HasAssemblyID(void) const { + return (!AssemblyID.empty()); +} + +/*! \fn bool SamSequence::HasChecksum(void) const + \brief Returns \c true if sequence contains \@SQ M5:\<Checksum\> +*/ +bool SamSequence::HasChecksum(void) const { + return (!Checksum.empty()); +} + +/*! \fn bool SamSequence::HasLength(void) const + \brief Returns \c true if sequence contains \@SQ LN:\<Length\> +*/ +bool SamSequence::HasLength(void) const { + return (!Length.empty()); +} + +/*! \fn bool SamSequence::HasName(void) const + \brief Returns \c true if sequence contains \@SQ SN:\<Name\> +*/ +bool SamSequence::HasName(void) const { + return (!Name.empty()); +} + +/*! \fn bool SamSequence::HasSpecies(void) const + \brief Returns \c true if sequence contains \@SQ SP:\<Species\> +*/ +bool SamSequence::HasSpecies(void) const { + return (!Species.empty()); +} + +/*! \fn bool SamSequence::HasURI(void) const + \brief Returns \c true if sequence contains \@SQ UR:\<URI\> +*/ +bool SamSequence::HasURI(void) const { + return (!URI.empty()); +} diff --git a/src/utils/BamTools/src/api/SamSequence.h b/src/utils/BamTools/src/api/SamSequence.h new file mode 100644 index 0000000000000000000000000000000000000000..054e58f985ec40c74820f37a8318157202dfd2ed --- /dev/null +++ b/src/utils/BamTools/src/api/SamSequence.h @@ -0,0 +1,61 @@ +// *************************************************************************** +// SamSequence.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides direct read/write access to the SAM sequence data fields. +// *************************************************************************** + +#ifndef SAM_SEQUENCE_H +#define SAM_SEQUENCE_H + +#include <api/api_global.h> +#include <string> + +namespace BamTools { + +struct API_EXPORT SamSequence { + + // ctor & dtor + SamSequence(void); + SamSequence(const std::string& name, const int& length); + SamSequence(const std::string& name, const std::string& length); + SamSequence(const SamSequence& other); + ~SamSequence(void); + + // query/modify entire sequence + void Clear(void); // clears all contents + + // convenience query methods + bool HasAssemblyID(void) const; // returns true if sequence has an assembly ID + bool HasChecksum(void) const; // returns true if sequence has an MD5 checksum + bool HasLength(void) const; // returns true if sequence has a length + bool HasName(void) const; // returns true if sequence has a name + bool HasSpecies(void) const; // returns true if sequence has a species ID + bool HasURI(void) const; // returns true if sequence has a URI + + // data members + std::string AssemblyID; // AS:<AssemblyID> + std::string Checksum; // M5:<Checksum> + std::string Length; // LN:<Length> *Required for valid SAM header* + std::string Name; // SN:<Name> *Required for valid SAM header* + std::string Species; // SP:<Species> + std::string URI; // UR:<URI> +}; + +/*! 
\fn bool operator==(const SamSequence& lhs, const SamSequence& rhs) + \brief tests equality by comparing sequence names, lengths, & checksums (if available) +*/ +API_EXPORT inline bool operator==(const SamSequence& lhs, const SamSequence& rhs) { + if ( lhs.Name != rhs.Name ) return false; + if ( lhs.Length != rhs.Length ) return false; + if ( lhs.HasChecksum() && rhs.HasChecksum() ) + return (lhs.Checksum == rhs.Checksum); + else return true; +} + +} // namespace BamTools + +#endif // SAM_SEQUENCE_H diff --git a/src/utils/BamTools/src/api/SamSequence.o b/src/utils/BamTools/src/api/SamSequence.o new file mode 100644 index 0000000000000000000000000000000000000000..2ecce255d2c6d056bd1a8ff79ecb5b29b3f64106 Binary files /dev/null and b/src/utils/BamTools/src/api/SamSequence.o differ diff --git a/src/utils/BamTools/src/api/SamSequenceDictionary.cpp b/src/utils/BamTools/src/api/SamSequenceDictionary.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e5240df386099cc5199fe0d3d173d5374b22303 --- /dev/null +++ b/src/utils/BamTools/src/api/SamSequenceDictionary.cpp @@ -0,0 +1,293 @@ +// *************************************************************************** +// SamSequenceDictionary.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamSequence entries. +// ************************************************************************* + +#include <api/SamSequenceDictionary.h> +using namespace BamTools; + +#include <iostream> +using namespace std; + +/*! \class BamTools::SamSequenceDictionary + \brief Container of SamSequence entries. + + Provides methods for operating on a collection of SamSequence entries. +*/ + +/*! 
\fn SamSequenceDictionary::SamSequenceDictionary(void) + \brief constructor +*/ +SamSequenceDictionary::SamSequenceDictionary(void) { } + +/*! \fn SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) + \brief copy constructor +*/ +SamSequenceDictionary::SamSequenceDictionary(const SamSequenceDictionary& other) + : m_data(other.m_data) +{ } + +/*! \fn SamSequenceDictionary::~SamSequenceDictionary(void) + \brief destructor +*/ +SamSequenceDictionary::~SamSequenceDictionary(void) { } + +/*! \fn void SamSequenceDictionary::Add(const SamSequence& sequence) + \brief Adds a sequence to the dictionary. + + Duplicate entries are silently discarded. + + \param sequence entry to be added +*/ +void SamSequenceDictionary::Add(const SamSequence& sequence) { + + // TODO: report error on attempted duplicate? + + if ( IsEmpty() || !Contains(sequence) ) + m_data.push_back(sequence); +} + +/*! \fn void SamSequenceDictionary::Add(const std::string& name, const int& length) + \brief Adds a sequence to the dictionary. + + This is an overloaded function. + + \param name name of sequence entry to be added + \param length length of sequence entry to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::string& name, const int& length) { + Add( SamSequence(name, length) ); +} + +/*! \fn void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences) + \brief Adds multiple sequences to the dictionary. + + This is an overloaded function. + + \param sequences entries to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::vector<SamSequence>& sequences) { + vector<SamSequence>::const_iterator seqIter = sequences.begin(); + vector<SamSequence>::const_iterator seqEnd = sequences.end(); + for ( ; seqIter!= seqEnd; ++seqIter ) + Add(*seqIter); +} + +/*! \fn void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap) + \brief Adds multiple sequences to the dictionary. + + This is an overloaded function. 
+ + \param sequenceMap map of sequence entries (name => length) to be added + \sa Add() +*/ +void SamSequenceDictionary::Add(const std::map<std::string, int>& sequenceMap) { + map<string, int>::const_iterator seqIter = sequenceMap.begin(); + map<string, int>::const_iterator seqEnd = sequenceMap.end(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const string& name = (*seqIter).first; + const int& length = (*seqIter).second; + Add( SamSequence(name, length) ); + } +} + +/*! \fn SamSequenceIterator SamSequenceDictionary::Begin(void) + \return an STL iterator pointing to the first sequence + \sa ConstBegin(), End() +*/ +SamSequenceIterator SamSequenceDictionary::Begin(void) { + return m_data.begin(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::Begin(void) const + \return an STL const_iterator pointing to the first sequence + + This is an overloaded function. + + \sa ConstBegin(), End() +*/ +SamSequenceConstIterator SamSequenceDictionary::Begin(void) const { + return m_data.begin(); +} + +/*! \fn void SamSequenceDictionary::Clear(void) + \brief Clears all sequence entries. +*/ +void SamSequenceDictionary::Clear(void) { + m_data.clear(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const + \return an STL const_iterator pointing to the first sequence + \sa Begin(), ConstEnd() +*/ +SamSequenceConstIterator SamSequenceDictionary::ConstBegin(void) const { + return m_data.begin(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const + \return an STL const_iterator pointing to the imaginary entry after the last sequence + \sa End(), ConstBegin() +*/ +SamSequenceConstIterator SamSequenceDictionary::ConstEnd(void) const { + return m_data.end(); +} + +/*! \fn bool SamSequenceDictionary::Contains(const std::string& sequenceName) const + \brief Returns true if dictionary contains sequence. 
+ \param sequenceName search for sequence matching this name + \return \c true if dictionary contains a sequence with this name +*/ +bool SamSequenceDictionary::Contains(const std::string& sequenceName) const { + return ( IndexOf(sequenceName) != (int)m_data.size() ); +} + +/*! \fn bool SamSequenceDictionary::Contains(const SamSequence& sequence) const + \brief Returns true if dictionary contains sequence (matches on name). + + This is an overloaded function. + + \param sequence search for this sequence + \return \c true if dictionary contains sequence (matching on name) +*/ +bool SamSequenceDictionary::Contains(const SamSequence& sequence) const { + return ( IndexOf(sequence.Name) != (int)m_data.size() ); +} + +/*! \fn SamSequenceIterator SamSequenceDictionary::End(void) + \return an STL iterator pointing to the imaginary entry after the last sequence + \sa Begin(), ConstEnd() +*/ +SamSequenceIterator SamSequenceDictionary::End(void) { + return m_data.end(); +} + +/*! \fn SamSequenceConstIterator SamSequenceDictionary::End(void) const + \return an STL const_iterator pointing to the imaginary entry after the last sequence + + This is an overloaded function. + + \sa Begin(), ConstEnd() +*/ +SamSequenceConstIterator SamSequenceDictionary::End(void) const { + return m_data.end(); +} + +/*! \fn int SamSequenceDictionary::IndexOf(const std::string& name) const + \internal + \return index of sequence if found (matching on name). Otherwise, returns vector::size() (invalid index). +*/ +int SamSequenceDictionary::IndexOf(const std::string& name) const { + SamSequenceConstIterator begin = ConstBegin(); + SamSequenceConstIterator iter = begin; + SamSequenceConstIterator end = ConstEnd(); + for ( ; iter != end; ++iter ) { + const SamSequence& currentSeq = (*iter); + if ( currentSeq.Name == name ) + break; + } + return distance( begin, iter ); +} + +/*! 
\fn bool SamSequenceDictionary::IsEmpty(void) const + \brief Returns \c true if dictionary contains no sequences + \sa Size() +*/ +bool SamSequenceDictionary::IsEmpty(void) const { + return m_data.empty(); +} + +/*! \fn void SamSequenceDictionary::Remove(const SamSequence& sequence) + \brief Removes sequence from dictionary, if found (matches on name). + + This is an overloaded function. + + \param sequence SamSequence to remove (matching on name) +*/ +void SamSequenceDictionary::Remove(const SamSequence& sequence) { + Remove( sequence.Name ); +} + +/*! \fn void SamSequenceDictionary::Remove(const std::string& sequenceName) + \brief Removes sequence from dictionary, if found. + + \param sequenceName name of sequence to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::string& sequenceName) { + if ( Contains(sequenceName) ) + m_data.erase( m_data.begin() + IndexOf(sequenceName) ); +} + +/*! \fn void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param sequences sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector<SamSequence>& sequences) { + vector<SamSequence>::const_iterator rgIter = sequences.begin(); + vector<SamSequence>::const_iterator rgEnd = sequences.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +/*! \fn void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames) + \brief Removes multiple sequences from dictionary. + + This is an overloaded function. + + \param sequenceNames names of the sequences to remove + \sa Remove() +*/ +void SamSequenceDictionary::Remove(const std::vector<std::string>& sequenceNames) { + vector<string>::const_iterator rgIter = sequenceNames.begin(); + vector<string>::const_iterator rgEnd = sequenceNames.end(); + for ( ; rgIter!= rgEnd; ++rgIter ) + Remove(*rgIter); +} + +/*! 
\fn int SamSequenceDictionary::Size(void) const + \brief Returns number of sequences in dictionary. + \sa IsEmpty() +*/ +int SamSequenceDictionary::Size(void) const { + return m_data.size(); +} + +/*! \fn SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) + \brief Retrieves the modifiable SamSequence that matches \a sequenceName. + + NOTE - If the dictionary contains no sequence matching this name, this function inserts + a new one with this name (length:0), and returns a reference to it. + + If you want to avoid this insertion behavior, check the result of Contains() before + using this operator. + + \param sequenceName name of sequence to retrieve + \return a modifiable reference to the SamSequence associated with the name +*/ +SamSequence& SamSequenceDictionary::operator[](const std::string& sequenceName) { + + // look up sequence ID + int index = IndexOf(sequenceName); + + // if found, return sequence at index + if ( index != (int)m_data.size() ) + return m_data[index]; + + // otherwise, append new sequence and return reference + else { + m_data.push_back( SamSequence(sequenceName, 0) ); + return m_data.back(); + } +} diff --git a/src/utils/BamTools/src/api/SamSequenceDictionary.h b/src/utils/BamTools/src/api/SamSequenceDictionary.h new file mode 100644 index 0000000000000000000000000000000000000000..1ac73261fef989f3f38adf9d6a9d1db1c16701b7 --- /dev/null +++ b/src/utils/BamTools/src/api/SamSequenceDictionary.h @@ -0,0 +1,89 @@ +// *************************************************************************** +// SamSequenceDictionary.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 April 2011 +// --------------------------------------------------------------------------- +// Provides methods for operating on a collection of SamSequence entries. 
+// *************************************************************************** + +#ifndef SAM_SEQUENCE_DICTIONARY_H +#define SAM_SEQUENCE_DICTIONARY_H + +#include <api/api_global.h> +#include <api/SamSequence.h> +#include <string> +#include <map> +#include <vector> + +namespace BamTools { + +typedef std::vector<SamSequence> SamSequenceContainer; +typedef SamSequenceContainer::iterator SamSequenceIterator; +typedef SamSequenceContainer::const_iterator SamSequenceConstIterator; + +class API_EXPORT SamSequenceDictionary { + + // ctor & dtor + public: + SamSequenceDictionary(void); + SamSequenceDictionary(const SamSequenceDictionary& other); + ~SamSequenceDictionary(void); + + // query/modify sequence data + public: + // adds a sequence + void Add(const SamSequence& sequence); + void Add(const std::string& name, const int& length); + + // adds multiple sequences + void Add(const std::vector<SamSequence>& sequences); + void Add(const std::map<std::string, int>& sequenceMap); + + // clears all sequence entries + void Clear(void); + + // returns true if dictionary contains this sequence + bool Contains(const SamSequence& sequence) const; + bool Contains(const std::string& sequenceName) const; + + // returns true if dictionary is empty + bool IsEmpty(void) const; + + // removes sequence, if found + void Remove(const SamSequence& sequence); + void Remove(const std::string& sequenceName); + + // removes multiple sequences + void Remove(const std::vector<SamSequence>& sequences); + void Remove(const std::vector<std::string>& sequenceNames); + + // returns number of sequences in dictionary + int Size(void) const; + + // retrieves a modifiable reference to the SamSequence object associated with this name + SamSequence& operator[](const std::string& sequenceName); + + // retrieve STL-compatible iterators + public: + SamSequenceIterator Begin(void); // returns iterator to begin() + SamSequenceConstIterator Begin(void) const; // returns const_iterator to begin() + 
SamSequenceConstIterator ConstBegin(void) const; // returns const_iterator to begin() + SamSequenceIterator End(void); // returns iterator to end() + SamSequenceConstIterator End(void) const; // returns const_iterator to end() + SamSequenceConstIterator ConstEnd(void) const; // returns const_iterator to end() + + // internal methods + private: + int IndexOf(const std::string& name) const; + + // data members + private: + SamSequenceContainer m_data; +}; + +} // namespace BamTools + +#endif // SAM_SEQUENCE_DICTIONARY_H + diff --git a/src/utils/BamTools/src/api/SamSequenceDictionary.o b/src/utils/BamTools/src/api/SamSequenceDictionary.o new file mode 100644 index 0000000000000000000000000000000000000000..972c52becc0169c84f19efa0ba3b09057c1f1e72 Binary files /dev/null and b/src/utils/BamTools/src/api/SamSequenceDictionary.o differ diff --git a/src/utils/BamTools/src/api/api_global.h b/src/utils/BamTools/src/api/api_global.h new file mode 100644 index 0000000000000000000000000000000000000000..84fcad214c1e141d8f227dba3961e6570f03aec8 --- /dev/null +++ b/src/utils/BamTools/src/api/api_global.h @@ -0,0 +1,22 @@ +// *************************************************************************** +// api_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// ---------------------------------------------------------------------------
+// Last modified: 19 November 2010 (DB)
+// ---------------------------------------------------------------------------
+// Provides macros for exporting & importing BamTools API library symbols
+// ***************************************************************************
+
+#ifndef API_GLOBAL_H
+#define API_GLOBAL_H
+
+#include "shared/bamtools_global.h"
+
+// API_EXPORT expands to BAMTOOLS_LIBRARY_EXPORT when BAMTOOLS_API_LIBRARY is
+// defined, and to BAMTOOLS_LIBRARY_IMPORT otherwise.
+// (both target macros are presumably provided by shared/bamtools_global.h - TODO confirm)
+#ifdef BAMTOOLS_API_LIBRARY
+# define API_EXPORT BAMTOOLS_LIBRARY_EXPORT
+#else
+# define API_EXPORT BAMTOOLS_LIBRARY_IMPORT
+#endif // BAMTOOLS_API_LIBRARY
+
+#endif // API_GLOBAL_H
diff --git a/src/utils/BamTools/src/api/internal/BamHeader_p.cpp b/src/utils/BamTools/src/api/internal/BamHeader_p.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0eaf7bc0f6ed2a7d7b4a1c0431021734962c024f
--- /dev/null
+++ b/src/utils/BamTools/src/api/internal/BamHeader_p.cpp
@@ -0,0 +1,133 @@
+// ***************************************************************************
+// BamHeader_p.cpp (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 21 March 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#include <api/BamAux.h>
+#include <api/BamConstants.h>
+#include <api/internal/BamHeader_p.h>
+#include <api/internal/BgzfStream_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+using namespace std;
+
+// constructor - nothing to set up, m_header is default-constructed
+BamHeader::BamHeader(void)
+{ }
+
+// destructor - nothing to release
+BamHeader::~BamHeader(void)
+{ }
+
+// pulls the magic number off the BGZF stream and compares it to the expected value
+// returns true only if the full magic number was read and it matches
+bool BamHeader::CheckMagicNumber(BgzfStream* stream) {
+
+    // read exactly BAM_HEADER_MAGIC_LENGTH bytes
+    char magic[Constants::BAM_HEADER_MAGIC_LENGTH];
+    const bool readOk = ( stream->Read(magic, Constants::BAM_HEADER_MAGIC_LENGTH) == (int)Constants::BAM_HEADER_MAGIC_LENGTH );
+    if ( !readOk ) {
+        fprintf(stderr, "BamHeader ERROR: could not read magic number\n");
+        return false;
+    }
+
+    // compare against the expected magic number, report mismatch
+    const bool magicOk = ( strncmp(magic, Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH) == 0 );
+    if ( !magicOk )
+        fprintf(stderr, "BamHeader ERROR: invalid magic number\n");
+    return magicOk;
+}
+
+// resets the stored SamHeader data
+void BamHeader::Clear(void) {
+    m_header.Clear();
+}
+
+// reports whether the stored SamHeader data is valid
+bool BamHeader::IsValid(void) const {
+    return m_header.IsValid();
+}
+
+// loads BAM header ('magic number' and SAM header text) from BGZF stream
+// returns true if every step succeeded
+bool BamHeader::Load(BgzfStream* stream) {
+
+    // each step runs only if all previous steps succeeded (short-circuit):
+    // valid stream -> valid magic number -> header length -> header text
+    uint32_t length(0);
+    return ( stream != 0 )
+        && CheckMagicNumber(stream)
+        && ReadHeaderLength(stream, length)
+        && ReadHeaderText(stream, length);
+}
+
+// reads SAM header text length from BGZF stream, stores it
in @length
+// returns read success/fail status
+bool BamHeader::ReadHeaderLength(BgzfStream* stream, uint32_t& length) {
+
+    // attempt to read BAM header text length
+    char buffer[sizeof(uint32_t)];
+    if ( stream->Read(buffer, sizeof(uint32_t)) != sizeof(uint32_t) ) {
+        fprintf(stderr, "BamHeader ERROR: could not read header length\n");
+        return false;
+    }
+
+    // convert char buffer to length (byte-swapped on big-endian systems), return success
+    length = BamTools::UnpackUnsignedInt(buffer);
+    if ( BamTools::SystemIsBigEndian() )
+        BamTools::SwapEndian_32(length);
+    return true;
+}
+
+// reads SAM header text from BGZF stream, stores in SamHeader object
+// @stream - BGZF stream positioned at the start of the header text
+// @length - number of header text bytes to read
+// returns read success/fail status (also fails if the temp buffer cannot be allocated)
+bool BamHeader::ReadHeaderText(BgzfStream* stream, const uint32_t& length) {
+
+    // buffer needs room for the text plus a terminating null
+    // do the '+ 1' in size_t so it cannot wrap in 32-bit arithmetic;
+    // where size_t itself is 32 bits wide, a wrapped size shows up as 0
+    const size_t bufferSize = static_cast<size_t>(length) + 1;
+    if ( bufferSize == 0 ) {
+        fprintf(stderr, "BamHeader ERROR: header text length is too large\n");
+        return false;
+    }
+
+    // set up zero-filled destination buffer, bail out if allocation fails
+    // (the length comes straight from the file, so it may be arbitrarily large)
+    char* headerText = (char*)calloc(bufferSize, 1);
+    if ( headerText == 0 ) {
+        fprintf(stderr, "BamHeader ERROR: could not allocate memory for header text\n");
+        return false;
+    }
+
+    // attempt to read header text
+    const unsigned bytesRead = stream->Read(headerText, length);
+    const bool readOk = ( bytesRead == length );
+    if ( readOk )
+        m_header.SetHeaderText( string(headerText) );
+    else
+        fprintf(stderr, "BamHeader ERROR: could not read header text\n");
+
+    // clean up calloc-ed temp variable (on success or fail)
+    free(headerText);
+
+    // return read success
+    return readOk;
+}
+
+// returns *copy* of SamHeader data object
+SamHeader BamHeader::ToSamHeader(void) const {
+    return m_header;
+}
+
+// returns SAM-formatted string of header data
+string BamHeader::ToString(void) const {
+    return m_header.ToString();
+}
diff --git a/src/utils/BamTools/src/api/internal/BamHeader_p.h b/src/utils/BamTools/src/api/internal/BamHeader_p.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f1a31c355dc41ac2f400e51f1421aeb0f907f0c
--- /dev/null
+++ b/src/utils/BamTools/src/api/internal/BamHeader_p.h
@@ -0,0 +1,72 @@
+// ***************************************************************************
+// BamHeader_p.h (c) 2010 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights
reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 26 January 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides the basic functionality for handling BAM headers.
+// ***************************************************************************
+
+#ifndef BAMHEADER_P_H
+#define BAMHEADER_P_H
+
+// -------------
+// W A R N I N G
+// -------------
+//
+// This file is not part of the BamTools API. It exists purely as an
+// implementation detail. This header file may change from version to version
+// without notice, or even be removed.
+//
+// We mean it.
+
+// NOTE(review): uint32_t is used below but no header that defines it is
+// included directly; presumably it arrives via <api/SamHeader.h> - confirm
+#include <api/SamHeader.h>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+class BgzfStream;
+
+// Reads the header section of a BAM file (magic number, header text length,
+// SAM header text) from a BGZF stream and keeps the result as a SamHeader.
+class BamHeader {
+
+    // ctor & dtor
+    public:
+        BamHeader(void);
+        ~BamHeader(void);
+
+    // BamHeader interface
+    public:
+        // clear SamHeader data
+        void Clear(void);
+        // return true if SamHeader data is valid
+        bool IsValid(void) const;
+        // load BAM header ('magic number' and SAM header text) from BGZF stream
+        // returns true if all OK (false for a null stream or any failed read)
+        bool Load(BgzfStream* stream);
+        // returns copy of SamHeader data object
+        // (returned by value, so edits to it do not affect this BamHeader)
+        SamHeader ToSamHeader(void) const;
+        // returns SAM-formatted string of header data
+        std::string ToString(void) const;
+
+    // internal methods
+    private:
+        // reads magic number from BGZF stream, returns true if valid
+        bool CheckMagicNumber(BgzfStream* stream);
+        // reads SAM header length from BGZF stream, stores it in @length
+        // returns read success/fail status
+        bool ReadHeaderLength(BgzfStream* stream, uint32_t& length);
+        // reads SAM header text (@length bytes) from BGZF stream, stores in SamHeader object
+        // returns read success/fail status
+        bool ReadHeaderText(BgzfStream* stream, const uint32_t& length);
+
+    // data members
+    private:
+        SamHeader m_header; // parsed header data, filled by ReadHeaderText()
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMHEADER_P_H
diff --git
a/src/utils/BamTools/src/api/internal/BamIndexFactory_p.cpp b/src/utils/BamTools/src/api/internal/BamIndexFactory_p.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69b372bb02a770cbaa3d6c1449c1a73d23ffc1e8
--- /dev/null
+++ b/src/utils/BamTools/src/api/internal/BamIndexFactory_p.cpp
@@ -0,0 +1,113 @@
+// ***************************************************************************
+// BamIndexFactory_p.cpp (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#include <api/BamAux.h>
+#include <api/internal/BamIndexFactory_p.h>
+#include <api/internal/BamStandardIndex_p.h>
+#include <api/internal/BamToolsIndex_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <cstdio>
+#include <iostream>
+using namespace std;
+
+// generates index filename from BAM filename (depending on requested type)
+// @bamFilename - name of the BAM file the index belongs to
+// @type        - requested index type
+// returns @bamFilename with the type's extension appended
+// if type is unknown, prints an error and returns empty string
+const string BamIndexFactory::CreateIndexFilename(const string& bamFilename,
+                                                  const BamIndex::IndexType& type)
+{
+    switch ( type ) {
+        case ( BamIndex::STANDARD ) : return ( bamFilename + BamStandardIndex::Extension() );
+        case ( BamIndex::BAMTOOLS ) : return ( bamFilename + BamToolsIndex::Extension() );
+        default :
+            // message kept consistent with CreateIndexOfType() (space before the value)
+            cerr << "BamIndexFactory ERROR: unknown index type " << type << endl;
+            return string();
+    }
+}
+
+// creates a new BamIndex object, depending on extension of @indexFilename
+// returns 0 if the file does not exist, has no usable extension,
+// or the extension does not match a supported index type
+// the returned object is allocated with 'new'
+BamIndex* BamIndexFactory::CreateIndexFromFilename(const string& indexFilename, BamReaderPrivate* reader) {
+
+    // if file doesn't exist, return null index
+    if ( !BamTools::FileExists(indexFilename) )
+        return 0;
+
+    // get file extension from index filename, including dot (".EXT")
+    // if can't get file extension, return null index
+    const string extension = FileExtension(indexFilename);
+    if ( extension.empty() )
+        return 0;
+
+    // create index based on extension
+    if ( extension == BamStandardIndex::Extension() ) return new BamStandardIndex(reader);
+    if ( extension == BamToolsIndex::Extension() )    return new BamToolsIndex(reader);
+    return 0;
+}
+
+// creates a new BamIndex, object of requested @type
+// if type is unknown, prints an error and returns 0
+// the returned object is allocated with 'new'
+BamIndex* BamIndexFactory::CreateIndexOfType(const BamIndex::IndexType& type,
+                                             BamReaderPrivate* reader)
+{
+    switch ( type ) {
+        case ( BamIndex::STANDARD ) : return new BamStandardIndex(reader);
+        case ( BamIndex::BAMTOOLS ) : return new BamToolsIndex(reader);
+        default :
+            cerr << "BamIndexFactory ERROR: unknown index type " << type << endl;
+            return 0;
+    }
+}
+
+// retrieves file extension (including '.')
+// returns empty string if @filename is too short to hold a name + extension,
+// contains no dot, or its last dot belongs to a directory name
+const string BamIndexFactory::FileExtension(const string& filename) {
+
+    // if filename cannot contain valid path + extension, return empty string
+    // (the length test also covers the empty filename)
+    if ( filename.length() <= 4 )
+        return string();
+
+    // look for last dot in filename
+    const size_t lastDotPosition = filename.find_last_of('.');
+
+    // if none found, return empty string
+    if ( lastDotPosition == string::npos )
+        return string();
+
+    // a path separator after the last dot means the dot is part of a
+    // directory name (e.g. "/data.v2/reads"), so there is no extension
+    if ( filename.find_first_of("/\\", lastDotPosition) != string::npos )
+        return string();
+
+    // return substring from last dot position
+    return filename.substr(lastDotPosition);
+}
+
+// returns name of existing index file that corresponds to @bamFilename
+// will defer to @preferredType if possible, if not will attempt to load any supported type
+// returns empty string if not found
+const string BamIndexFactory::FindIndexFilename(const string& bamFilename,
+                                                const BamIndex::IndexType& preferredType)
+{
+    // try to find index of preferred type first
+    // return index filename if found
+    string indexFilename = CreateIndexFilename(bamFilename, preferredType);
+    if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) )
+        return indexFilename;
+
+    // couldn't find preferred type, try the other supported types
+    // (skipping the one already tried above), return index filename if found
+    if ( preferredType != BamIndex::STANDARD ) {
+        indexFilename = CreateIndexFilename(bamFilename, BamIndex::STANDARD);
+        if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) )
+            return indexFilename;
+    }
+    if ( preferredType != BamIndex::BAMTOOLS ) {
+        indexFilename = CreateIndexFilename(bamFilename, BamIndex::BAMTOOLS);
+        if ( !indexFilename.empty() && BamTools::FileExists(indexFilename) )
+            return indexFilename;
+    }
+
+    // otherwise couldn't find any index matching this filename
+    return string();
+}
diff --git a/src/utils/BamTools/src/api/internal/BamIndexFactory_p.h b/src/utils/BamTools/src/api/internal/BamIndexFactory_p.h
new file mode 100644
index 0000000000000000000000000000000000000000..f060d2cd4e766179cf81162897727ec06a0236a5
--- /dev/null
+++ b/src/utils/BamTools/src/api/internal/BamIndexFactory_p.h
@@ -0,0 +1,50 @@
+// ***************************************************************************
+// BamIndexFactory_p.h (c) 2011 Derek Barnett
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Provides interface for generating BamIndex implementations
+// ***************************************************************************
+
+#ifndef BAMINDEX_FACTORY_P_H
+#define BAMINDEX_FACTORY_P_H
+
+#include <api/BamIndex.h>
+#include <string>
+
+namespace BamTools {
+namespace Internal {
+
+// Collection of static helpers that map between BAM filenames, index
+// filenames, index types and concrete BamIndex objects. Holds no state.
+// NOTE(review): BamReaderPrivate is not declared in this header; presumably
+// it is made visible by <api/BamIndex.h> - confirm
+class BamIndexFactory {
+
+    // static interface methods
+    public:
+        // creates a new BamIndex object, depending on extension of @indexFilename
+        // returns 0 if the file is missing or the extension is not recognized
+        static BamIndex* CreateIndexFromFilename(const std::string& indexFilename,
+                                                 BamReaderPrivate* reader);
+        // creates a new BamIndex object, of requested @type
+        // returns 0 if @type is unknown
+        static BamIndex* CreateIndexOfType(const BamIndex::IndexType& type,
+                                           BamReaderPrivate* reader);
+        // returns name of existing index file that corresponds to @bamFilename
+        // will defer to @preferredType if possible
+        // if @preferredType not found, will attempt to load any supported index type
+        // returns empty string if no index file (of any type) is found
+        static const std::string FindIndexFilename(const std::string& bamFilename,
+                                                   const BamIndex::IndexType& preferredType);
+
+    // internal methods
+    // (labelled internal, but declared public - they are callable from outside the class)
+    public:
+        // generates index filename from BAM filename (depending on requested type)
+        // if type is unknown, returns empty string
+        static const std::string CreateIndexFilename(const std::string& bamFilename,
+                                                     const BamIndex::IndexType& type);
+        // retrieves file extension (including '.')
+        // returns empty string if no extension can be determined
+        static const std::string FileExtension(const std::string& filename);
+};
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMINDEX_FACTORY_P_H
diff --git a/src/utils/BamTools/src/api/internal/BamMultiMerger_p.h b/src/utils/BamTools/src/api/internal/BamMultiMerger_p.h
new file mode 100644
index
0000000000000000000000000000000000000000..ae67eea238df983b4e8968b41414ef1c5abd5fcc --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamMultiMerger_p.h @@ -0,0 +1,295 @@ +// *************************************************************************** +// BamMultiMerger_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 18 March 2011 (DB) +// --------------------------------------------------------------------------- +// Provides merging functionality for BamMultiReader. At this point, supports +// sorting results by (refId, position) or by read name. +// *************************************************************************** + +#ifndef BAMMULTIMERGER_P_H +#define BAMMULTIMERGER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+
+#include <api/BamAlignment.h>
+#include <api/BamReader.h>
+#include <map>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace BamTools {
+namespace Internal {
+
+// a reader paired with the alignment most recently associated with it
+typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment;
+
+// generic MultiMerger interface
+// a merger caches ReaderAlignment entries and hands them back in the
+// order defined by the concrete implementation
+class IBamMultiMerger {
+
+    public:
+        IBamMultiMerger(void) { }
+        virtual ~IBamMultiMerger(void) { }
+
+    public:
+        // stores an entry
+        virtual void Add(const ReaderAlignment& value) =0;
+        // removes all entries (pointed-to readers/alignments are not deleted)
+        virtual void Clear(void) =0;
+        // returns reference to the first entry - merger must not be empty
+        virtual const ReaderAlignment& First(void) const =0;
+        // returns true if no entries are stored
+        virtual bool IsEmpty(void) const =0;
+        // removes the first entry whose reader has the same filename as @reader
+        virtual void Remove(BamReader* reader) =0;
+        // returns number of stored entries
+        virtual int Size(void) const =0;
+        // removes and returns the first entry - merger must not be empty
+        virtual ReaderAlignment TakeFirst(void) =0;
+};
+
+// IBamMultiMerger implementation - sorted on BamAlignment: (RefId, Position)
+class PositionMultiMerger : public IBamMultiMerger {
+
+    public:
+        PositionMultiMerger(void) : IBamMultiMerger() { }
+        ~PositionMultiMerger(void) { }
+
+    public:
+        void Add(const ReaderAlignment& value);
+        void Clear(void);
+        const ReaderAlignment& First(void) const;
+        bool IsEmpty(void) const;
+        void Remove(BamReader* reader);
+        int Size(void) const;
+        ReaderAlignment TakeFirst(void);
+
+    private:
+        typedef std::pair<int, int> KeyType;
+        typedef ReaderAlignment ValueType;
+        typedef std::pair<KeyType, ValueType> ElementType;
+
+        typedef std::multimap<KeyType, ValueType> ContainerType;
+        typedef ContainerType::iterator DataIterator;
+        typedef ContainerType::const_iterator DataConstIterator;
+
+        ContainerType m_data;
+};
+
+// IBamMultiMerger implementation - sorted on BamAlignment: Name
+class ReadNameMultiMerger : public IBamMultiMerger {
+
+    public:
+        ReadNameMultiMerger(void) : IBamMultiMerger() { }
+        ~ReadNameMultiMerger(void) { }
+
+    public:
+        void Add(const ReaderAlignment& value);
+        void Clear(void);
+        const ReaderAlignment& First(void) const;
+        bool IsEmpty(void) const;
+        void Remove(BamReader* reader);
+        int Size(void) const;
+        ReaderAlignment TakeFirst(void);
+
+    private:
+        typedef std::string KeyType;
+        typedef ReaderAlignment ValueType;
+        typedef std::pair<KeyType, ValueType> ElementType;
+
+        typedef std::multimap<KeyType, ValueType> ContainerType;
+        typedef ContainerType::iterator DataIterator;
+        typedef ContainerType::const_iterator DataConstIterator;
+
+        ContainerType m_data;
+};
+
+// IBamMultiMerger implementation - unsorted BAM file(s)
+// entries are kept in insertion order
+class UnsortedMultiMerger : public IBamMultiMerger {
+
+    public:
+        UnsortedMultiMerger(void) : IBamMultiMerger() { }
+        ~UnsortedMultiMerger(void) { }
+
+    public:
+        void Add(const ReaderAlignment& value);
+        void Clear(void);
+        const ReaderAlignment& First(void) const;
+        bool IsEmpty(void) const;
+        void Remove(BamReader* reader);
+        int Size(void) const;
+        ReaderAlignment TakeFirst(void);
+
+    private:
+        typedef ReaderAlignment ElementType;
+        typedef std::vector<ReaderAlignment> ContainerType;
+        typedef ContainerType::iterator DataIterator;
+        typedef ContainerType::const_iterator DataConstIterator;
+
+        ContainerType m_data;
+};
+
+// ------------------------------------------
+// PositionMultiMerger implementation
+
+// stores entry under the key (RefID, Position) of its alignment
+inline void PositionMultiMerger::Add(const ReaderAlignment& value) {
+    const KeyType key( value.second->RefID, value.second->Position );
+    m_data.insert( ElementType(key, value) );
+}
+
+inline void PositionMultiMerger::Clear(void) {
+    m_data.clear();
+}
+
+// returns reference to the entry with the lowest key
+inline const ReaderAlignment& PositionMultiMerger::First(void) const {
+    // return the stored value directly: the multimap's element type is
+    // pair<const KeyType, ValueType>, so binding it to 'const ElementType&'
+    // would create a temporary copy and leave the returned reference dangling
+    return m_data.begin()->second;
+}
+
+inline bool PositionMultiMerger::IsEmpty(void) const {
+    return m_data.empty();
+}
+
+// removes the first entry whose reader has the same filename as @reader
+inline void PositionMultiMerger::Remove(BamReader* reader) {
+
+    if ( reader == 0 ) return;
+    const std::string filenameToRemove = reader->GetFilename();
+
+    // iterate over readers in cache
+    DataIterator dataIter = m_data.begin();
+    DataIterator dataEnd = m_data.end();
+    for ( ; dataIter != dataEnd; ++dataIter ) {
+        const ValueType& entry = (*dataIter).second;
+        const BamReader* entryReader = entry.first;
+        if ( entryReader == 0 ) continue;
+
+        // remove iterator on match
+        // (return immediately - the iterator is invalid after erase)
+        if ( entryReader->GetFilename() == filenameToRemove ) {
+            m_data.erase(dataIter);
+            return;
+        }
+    }
+}
+
+inline int PositionMultiMerger::Size(void) const {
+    return m_data.size();
+}
+
+// removes and returns the entry with the lowest key
+inline ReaderAlignment PositionMultiMerger::TakeFirst(void) {
+    DataIterator first = m_data.begin();
+    ReaderAlignment next = (*first).second;
+    m_data.erase(first);
+    return next;
+}
+
+// ------------------------------------------
+// ReadNameMultiMerger implementation
+
+// stores entry under the Name of its alignment
+inline void ReadNameMultiMerger::Add(const ReaderAlignment& value) {
+    const KeyType key(value.second->Name);
+    m_data.insert( ElementType(key, value) );
+}
+
+inline void ReadNameMultiMerger::Clear(void) {
+    m_data.clear();
+}
+
+// returns reference to the entry with the lowest key
+inline const ReaderAlignment& ReadNameMultiMerger::First(void) const {
+    // return the stored value directly: the multimap's element type is
+    // pair<const KeyType, ValueType>, so binding it to 'const ElementType&'
+    // would create a temporary copy and leave the returned reference dangling
+    return m_data.begin()->second;
+}
+
+inline bool ReadNameMultiMerger::IsEmpty(void) const {
+    return m_data.empty();
+}
+
+// removes the first entry whose reader has the same filename as @reader
+inline void ReadNameMultiMerger::Remove(BamReader* reader) {
+
+    if ( reader == 0 ) return;
+    const std::string filenameToRemove = reader->GetFilename();
+
+    // iterate over readers in cache
+    DataIterator dataIter = m_data.begin();
+    DataIterator dataEnd = m_data.end();
+    for ( ; dataIter != dataEnd; ++dataIter ) {
+        const ValueType& entry = (*dataIter).second;
+        const BamReader* entryReader = entry.first;
+        if ( entryReader == 0 ) continue;
+
+        // remove iterator on match
+        // (return immediately - the iterator is invalid after erase)
+        if ( entryReader->GetFilename() == filenameToRemove ) {
+            m_data.erase(dataIter);
+            return;
+        }
+    }
+
+}
+
+inline int ReadNameMultiMerger::Size(void) const {
+    return m_data.size();
+}
+
+// removes and returns the entry with the lowest key
+inline ReaderAlignment ReadNameMultiMerger::TakeFirst(void) {
+    DataIterator first = m_data.begin();
+    ReaderAlignment next = (*first).second;
+    m_data.erase(first);
+    return next;
+}
+
+// ------------------------------------------
+// UnsortedMultiMerger implementation
+
+inline void UnsortedMultiMerger::Add(const ReaderAlignment& value) {
+    m_data.push_back(value);
+}
+
+inline void UnsortedMultiMerger::Clear(void) {
+    // clear() removes every element; a pop_back() loop that counts up
+    // against the shrinking size() stops after about half of them
+    m_data.clear();
+}
+
+inline const ReaderAlignment& UnsortedMultiMerger::First(void) const {
+    return m_data.front();
+}
+
+inline bool UnsortedMultiMerger::IsEmpty(void) const {
+    return m_data.empty();
+}
+
+// removes the first entry whose reader has the same filename as @reader
+inline void UnsortedMultiMerger::Remove(BamReader* reader) {
+
+    if ( reader == 0 ) return;
+    const std::string filenameToRemove = reader->GetFilename();
+
+    // iterate over readers in cache
+    DataIterator dataIter = m_data.begin();
+    DataIterator dataEnd = m_data.end();
+    for ( ; dataIter != dataEnd; ++dataIter ) {
+        const BamReader* entryReader = (*dataIter).first;
+        if ( entryReader == 0 ) continue;
+
+        // remove iterator on match
+        // (return immediately - the iterator is invalid after erase)
+        if ( entryReader->GetFilename() == filenameToRemove ) {
+            m_data.erase(dataIter);
+            return;
+        }
+    }
+}
+
+inline int UnsortedMultiMerger::Size(void) const {
+    return m_data.size();
+}
+
+// removes and returns the oldest entry
+inline ReaderAlignment UnsortedMultiMerger::TakeFirst(void) {
+    ReaderAlignment first = m_data.front();
+    m_data.erase( m_data.begin() );
+    return first;
+}
+
+} // namespace Internal
+} // namespace BamTools
+
+#endif // BAMMULTIMERGER_P_H
diff --git a/src/utils/BamTools/src/api/internal/BamMultiReader_p.cpp b/src/utils/BamTools/src/api/internal/BamMultiReader_p.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e789d06ee35c93d3945427ec99a4fe544af49930
--- /dev/null
+++ b/src/utils/BamTools/src/api/internal/BamMultiReader_p.cpp
@@ -0,0 +1,802 @@
+// ***************************************************************************
+// BamMultiReader_p.cpp (c) 2010 Derek Barnett, Erik Garrison
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 April 2011 (DB)
+// ---------------------------------------------------------------------------
+// Functionality for simultaneously reading multiple BAM files
+// *************************************************************************
+
+#include <api/BamAlignment.h>
+#include <api/BamMultiReader.h>
+#include <api/internal/BamMultiMerger_p.h>
+#include <api/internal/BamMultiReader_p.h>
+using namespace BamTools;
+using namespace BamTools::Internal;
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <sstream>
+using namespace std;
+
+// ctor
+// starts with no alignment cache (m_alignments == 0), core mode off,
+// and sort order set to SortedByPosition
+BamMultiReaderPrivate::BamMultiReaderPrivate(void)
+    : m_alignments(0)
+    , m_isCoreMode(false)
+    , m_sortOrder(BamMultiReader::SortedByPosition)
+{ }
+
+// dtor
+BamMultiReaderPrivate::~BamMultiReaderPrivate(void) {
+
+    // close all open BAM readers
+    Close();
+
+    // clean up alignment cache
+    delete m_alignments;
+    m_alignments = 0;
+}
+
+// close all BAM files
+void BamMultiReaderPrivate::Close(void) {
+    CloseFiles( Filenames() );
+}
+
+// close requested BAM file
+void BamMultiReaderPrivate::CloseFile(const string& filename) {
+    vector<string> filenames(1, filename);
+    CloseFiles(filenames);
+}
+
+// close requested BAM files
+// for each non-empty name in @filenames, the first reader with that filename is
+// removed from the alignment cache, closed and deleted along with its alignment
+void BamMultiReaderPrivate::CloseFiles(const vector<string>& filenames) {
+
+    // iterate over filenames
+    vector<string>::const_iterator filesIter = filenames.begin();
+    vector<string>::const_iterator filesEnd = filenames.end();
+    for ( ; filesIter != filesEnd; ++filesIter ) {
+        const string& filename = (*filesIter);
+        if ( filename.empty() ) continue;
+
+        // iterate over readers
+        vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
+        vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+        for ( ; readerIter != readerEnd; ++readerIter ) {
+            BamReader* reader = (*readerIter).first;
+            if ( reader == 0 ) continue;
+
+            // if reader matches requested filename
+            if ( reader->GetFilename() == filename ) {
+
+                // remove reader/alignment from alignment cache
+                // (cache pointer starts out as 0 in the ctor, so guard it
+                //  the same way the Clear() call below does)
+                if ( m_alignments != 0 )
+                    m_alignments->Remove(reader);
+
+                // close & delete reader
+                reader->Close();
+                delete reader;
+
+                // delete reader's alignment entry
+                delete (*readerIter).second;
+
+                // remove reader from container
+                m_readers.erase(readerIter);
+
+                // on match, just go on to next filename
+                // (no need to keep looking and iterator is invalid now anyway)
+                break;
+            }
+        }
+    }
+
+    // make sure alignment cache is cleared if all readers are now closed
+    if ( m_readers.empty() && m_alignments != 0 )
+        m_alignments->Clear();
+}
+
+// creates index files for BAM files that don't have them
+// returns false if any index creation failed, true otherwise
+bool BamMultiReaderPrivate::CreateIndexes(const BamIndex::IndexType& type) {
+
+    bool result = true;
+
+    // iterate over readers
+    vector<ReaderAlignment>::iterator readerIter = m_readers.begin();
+    vector<ReaderAlignment>::iterator readerEnd = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        BamReader* reader = (*readerIter).first;
+        if ( reader == 0 ) continue;
+
+        // if reader doesn't have an index, create one
+        // (keeps going after a failure so every reader gets its attempt)
+        if ( !reader->HasIndex() )
+            result &= reader->CreateIndex(type);
+    }
+
+    return result;
+}
+
+// returns a new merger object matching the current sort order
+// prints an error and returns 0 if the sort order is unknown
+IBamMultiMerger* BamMultiReaderPrivate::CreateMergerForCurrentSortOrder(void) const {
+    switch ( m_sortOrder ) {
+        case ( BamMultiReader::SortedByPosition ) : return new PositionMultiMerger;
+        case ( BamMultiReader::SortedByReadName ) : return new ReadNameMultiMerger;
+        case ( BamMultiReader::Unsorted ) : return new UnsortedMultiMerger;
+        default :
+            cerr << "BamMultiReader ERROR: requested sort order is unknown" << endl;
+            return 0;
+    }
+}
+
+// extracts the read group ID (value of the ID: tag) from an @RG header line
+// returns empty string if the line has no ID: tag
+const string BamMultiReaderPrivate::ExtractReadGroup(const string& headerLine) const {
+
+    string readGroup("");
+    stringstream headerLineSs(headerLine);
+    string part;
+
+    // parse @RG header line, looking for the ID: tag
+    while( getline(headerLineSs, part, '\t') ) {
+        stringstream partSs(part);
+        string subtag;
+        getline(partSs, subtag, ':');
+        if ( subtag == "ID" ) {
+            // take everything after "ID:" - splitting on ':' again would
+            // truncate an ID that itself contains a colon
+            getline(partSs, readGroup);
+            break;
+        }
+    }
+    return readGroup;
+}
+
+// returns the (non-empty) filenames of all current readers
+const vector<string> BamMultiReaderPrivate::Filenames(void) const {
+
+    // init filename container
+    vector<string> filenames;
+    filenames.reserve( m_readers.size() );
+
+    // iterate over readers
+    vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
+    vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        const BamReader* reader = (*readerIter).first;
+        if ( reader == 0 ) continue;
+
+        // store filename if not empty
+        const string filename = reader->GetFilename();
+        if ( !filename.empty() )
+            filenames.push_back(filename);
+    }
+
+    // return result
+    return filenames;
+}
+
+// returns SamHeader object built from the merged header text
+SamHeader BamMultiReaderPrivate::GetHeader(void) const {
+    string text = GetHeaderText();
+    return SamHeader(text);
+}
+
+// makes a virtual, unified header for all the bam files in the multireader
+// - with exactly one reader, its header text is returned unchanged
+// - otherwise @HD & @SQ lines are taken from the first non-empty header,
+//   and @RG lines from all headers (each read group ID only once)
+string BamMultiReaderPrivate::GetHeaderText(void) const {
+
+    // TODO: merge SamHeader objects instead of parsing string data (again)
+
+    // if only one reader is open
+    if ( m_readers.size() == 1 ) {
+
+        // just return reader's header text
+        const ReaderAlignment& ra = m_readers.front();
+        const BamReader* reader = ra.first;
+        if ( reader ) return reader->GetHeaderText();
+
+        // invalid reader
+        return string();
+    }
+
+    string mergedHeader("");
+    map<string, bool> readGroups;
+
+    // true until the first non-empty header has been processed
+    // (a null reader or empty header in the first slot must not cause
+    //  the @HD & @SQ lines to be dropped from the merged header)
+    bool isFirstHeader = true;
+
+    // foreach extraction entry (each BAM file)
+    vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin();
+    vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end();
+    for ( ; readerIter != readerEnd; ++readerIter ) {
+        const BamReader* reader = (*readerIter).first;
+        if ( reader == 0 ) continue;
+
+        // get header from reader
+        string headerText = reader->GetHeaderText();
+        if ( headerText.empty() ) continue;
+
+        // store header text in lines
+        map<string, bool> currentFileReadGroups;
+        const vector<string> lines = SplitHeaderText(headerText);
+
+        // iterate over header lines
+        vector<string>::const_iterator linesIter = lines.begin();
+        vector<string>::const_iterator linesEnd = lines.end();
+        for ( ; linesIter != linesEnd; ++linesIter ) {
+
+            // get next line from header, skip if empty
+            const string& headerLine = (*linesIter);
+            if ( headerLine.empty() ) continue;
+
+            // if first (non-empty) header, save HD & SQ entries
+            if ( isFirstHeader ) {
+                if ( headerLine.find("@HD") == 0 || headerLine.find("@SQ") == 0) {
+                    mergedHeader.append(headerLine.c_str());
+                    mergedHeader.append(1, '\n');
+                }
+            }
+
+            // (for all files) append RG entries if they are unique
+            if ( headerLine.find("@RG") == 0 ) {
+
+                // extract read group name from line
+                const string readGroup = ExtractReadGroup(headerLine);
+
+                // make sure not to duplicate @RG entries
+                if ( readGroups.find(readGroup) == readGroups.end() ) {
+                    mergedHeader.append(headerLine.c_str() );
+                    mergedHeader.append(1, '\n');
+                    readGroups[readGroup] = true;
+                    currentFileReadGroups[readGroup] = true;
+                } else {
+                    // warn iff we are reading one file and discover duplicated @RG tags in the header
+                    // otherwise, we emit no warning, as we might be merging multiple BAM files with identical @RG tags
+                    if ( currentFileReadGroups.find(readGroup) != currentFileReadGroups.end() ) {
+                        cerr << "BamMultiReader WARNING: duplicate @RG tag " << readGroup
+                             << " entry in header of " << reader->GetFilename() << endl;
+                    }
+                }
+            }
+        }
+
+        // HD & SQ entries are only taken from one header
+        isFirstHeader = false;
+    }
+
+    // return merged header text
+    return mergedHeader;
+}
+
+// get next alignment among all files
+// turns core mode off before loading
+bool BamMultiReaderPrivate::GetNextAlignment(BamAlignment& al) {
+    m_isCoreMode = false;
+    return LoadNextAlignment(al);
+}
+ +// get next alignment among all files without parsing character data from alignments +bool BamMultiReaderPrivate::GetNextAlignmentCore(BamAlignment& al) { + m_isCoreMode = true; + return LoadNextAlignment(al); +} + +// --------------------------------------------------------------------------------------- +// +// NB: The following GetReferenceX() functions assume that we have identical +// references for all BAM files. We enforce this by invoking the +// ValidateReaders() method to verify that our reference data is the same +// across all files on Open - so we will not encounter a situation in which +// there is a mismatch and we are still live. +// +// --------------------------------------------------------------------------------------- + +// returns the number of reference sequences +int BamMultiReaderPrivate::GetReferenceCount(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return 0; + + // return reference count from first reader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; + if ( reader ) return reader->GetReferenceCount(); + + // invalid reader + return 0; +} + +// returns vector of reference objects +const RefVector BamMultiReaderPrivate::GetReferenceData(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return RefVector(); + + // return reference data from first BamReader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; + if ( reader ) return reader->GetReferenceData(); + + // invalid reader + return RefVector(); +} + +// returns refID from reference name +int BamMultiReaderPrivate::GetReferenceID(const string& refName) const { + + // handle empty multireader + if ( m_readers.empty() ) + return -1; + + // return reference ID from first BamReader + const ReaderAlignment& ra = m_readers.front(); + const BamReader* reader = ra.first; + if ( reader ) return reader->GetReferenceID(refName); + + // invalid reader + return -1; +} 
+// --------------------------------------------------------------------------------------- + +// checks if any readers still have alignments +bool BamMultiReaderPrivate::HasAlignmentData(void) const { + if ( m_alignments == 0 ) + return false; + return !m_alignments->IsEmpty(); +} + +// returns true if all readers have index data available +// this is useful to indicate whether Jump() or SetRegion() are possible +bool BamMultiReaderPrivate::HasIndexes(void) const { + + // handle empty multireader + if ( m_readers.empty() ) + return false; + + bool result = true; + + // iterate over readers + vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // see if current reader has index data + result &= reader->HasIndex(); + } + + return result; +} + +// returns true if multireader has open readers +bool BamMultiReaderPrivate::HasOpenReaders(void) { + + // iterate over readers + vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + const BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // return true whenever an open reader is found + if ( reader->IsOpen() ) return true; + } + + // no readers open + return false; +} + +// performs random-access jump using (refID, position) as a left-bound +bool BamMultiReaderPrivate::Jump(int refID, int position) { + + // NB: While it may make sense to track readers in which we can + // successfully Jump, in practice a failure of Jump means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. 
+ + // iterate over readers + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // attempt jump() on each + if ( !reader->Jump(refID, position) ) { + cerr << "BamMultiReader ERROR: could not jump " << reader->GetFilename() + << " to " << refID << ":" << position << endl; + } + } + + // update alignment cache & return success + UpdateAlignmentCache(); + return true; +} + +bool BamMultiReaderPrivate::LoadNextAlignment(BamAlignment& al) { + + // bail out if no more data to process + if ( !HasAlignmentData() ) + return false; + + // "pop" next alignment and reader + ReaderAlignment nextReaderAlignment = m_alignments->TakeFirst(); + BamReader* reader = nextReaderAlignment.first; + BamAlignment* alignment = nextReaderAlignment.second; + + // store cached alignment into destination parameter (by copy) + al = *alignment; + + // peek to next alignment & store in cache + SaveNextAlignment(reader, alignment); + + // return success + return true; +} + +// locate (& load) index files for BAM readers that don't already have one loaded +bool BamMultiReaderPrivate::LocateIndexes(const BamIndex::IndexType& preferredType) { + + bool result = true; + + // iterate over readers + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // if reader has no index, try to locate one + if ( !reader->HasIndex() ) + result &= reader->LocateIndex(preferredType); + } + + return result; +} + +// opens BAM files +bool BamMultiReaderPrivate::Open(const vector<string>& filenames) { + + // create alignment cache if neccessary + if ( m_alignments == 0 ) { + m_alignments = 
CreateMergerForCurrentSortOrder(); + if ( m_alignments == 0 ) return false; + } + + // iterate over filenames + vector<string>::const_iterator filenameIter = filenames.begin(); + vector<string>::const_iterator filenameEnd = filenames.end(); + for ( ; filenameIter != filenameEnd; ++filenameIter ) { + const string& filename = (*filenameIter); + if ( filename.empty() ) continue; + + // attempt to open BamReader on filename + BamReader* reader = OpenReader(filename); + if ( reader == 0 ) continue; + + // store reader with new alignment + m_readers.push_back( make_pair(reader, new BamAlignment) ); + } + + // validate & rewind any opened readers, also refreshes alignment cache + if ( !m_readers.empty() ) { + ValidateReaders(); + Rewind(); + } + + // return success + return true; +} + +bool BamMultiReaderPrivate::OpenFile(const std::string& filename) { + vector<string> filenames(1, filename); + return Open(filenames); +} + +bool BamMultiReaderPrivate::OpenIndexes(const vector<string>& indexFilenames) { + + // TODO: This needs to be cleaner - should not assume same order. + // And either way, shouldn't start at first reader. Should start at + // first reader without an index? 
+ + // make sure same number of index filenames as readers + if ( m_readers.size() != indexFilenames.size() || !indexFilenames.empty() ) + return false; + + // init result flag + bool result = true; + + // iterate over BamReaders + vector<string>::const_iterator indexFilenameIter = indexFilenames.begin(); + vector<string>::const_iterator indexFilenameEnd = indexFilenames.end(); + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + + // open index filename on reader + if ( reader ) { + const string& indexFilename = (*indexFilenameIter); + result &= reader->OpenIndex(indexFilename); + } + + // increment filename iterator, skip if no more index files to open + if ( ++indexFilenameIter == indexFilenameEnd ) + break; + } + + // TODO: validation ?? + + // return success/fail + return result; +} + +BamReader* BamMultiReaderPrivate::OpenReader(const std::string& filename) { + + // create new BamReader + BamReader* reader = new BamReader; + + // if reader opens OK + if ( reader->Open(filename) ) { + + // attempt to read first alignment (sanity check) + // if ok, then return BamReader pointer + BamAlignment al; + if ( reader->GetNextAlignmentCore(al) ) + return reader; + + // could not read alignment + else { + cerr << "BamMultiReader WARNING: Could not read first alignment from " + << filename << ", ignoring file" << endl; + } + } + + // reader could not open + else { + cerr << "BamMultiReader WARNING: Could not open " + << filename << ", ignoring file" << endl; + } + + // if we get here, there was a problem with this BAM file (opening or reading) + // clean up memory allocation & return null pointer + delete reader; + return 0; +} + +// print associated filenames to stdout +void BamMultiReaderPrivate::PrintFilenames(void) const { + const vector<string>& filenames = Filenames(); + 
vector<string>::const_iterator filenameIter = filenames.begin(); + vector<string>::const_iterator filenameEnd = filenames.end(); + for ( ; filenameIter != filenameEnd; ++filenameIter ) + cout << (*filenameIter) << endl; +} + +// returns BAM file pointers to beginning of alignment data & resets alignment cache +bool BamMultiReaderPrivate::Rewind(void) { + + // clear out alignment cache + m_alignments->Clear(); + + // attempt to rewind files + if ( !RewindReaders() ) { + cerr << "BamMultiReader ERROR: could not rewind file(s) successfully"; + return false; + } + + // reset cache & return success + UpdateAlignmentCache(); + return true; +} + +// returns BAM file pointers to beginning of alignment data +bool BamMultiReaderPrivate::RewindReaders(void) { + + bool result = true; + + // iterate over readers + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // attempt rewind on BamReader + result &= reader->Rewind(); + } + + return result; +} + +void BamMultiReaderPrivate::SaveNextAlignment(BamReader* reader, BamAlignment* alignment) { + + // must be in core mode && NOT sorting by read name to call GNACore() + if ( m_isCoreMode && m_sortOrder != BamMultiReader::SortedByReadName ) { + if ( reader->GetNextAlignmentCore(*alignment) ) + m_alignments->Add( make_pair(reader, alignment) ); + } + + // not in core mode and/or sorting by readname, must call GNA() + else { + if ( reader->GetNextAlignment(*alignment) ) + m_alignments->Add( make_pair(reader, alignment) ); + } +} + +// sets the index caching mode on the readers +void BamMultiReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode mode) { + + // iterate over readers + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( 
; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // set reader's index cache mode + reader->SetIndexCacheMode(mode); + } +} + +bool BamMultiReaderPrivate::SetRegion(const BamRegion& region) { + + // NB: While it may make sense to track readers in which we can + // successfully SetRegion, In practice a failure of SetRegion means "no + // alignments here." It makes sense to simply accept the failure, + // UpdateAlignments(), and continue. + + // iterate over alignments + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + + // attempt to set BamReader's region of interest + if ( !reader->SetRegion(region) ) { + cerr << "BamMultiReader WARNING: could not jump " << reader->GetFilename() << " to " + << region.LeftRefID << ":" << region.LeftPosition << ".." + << region.RightRefID << ":" << region.RightPosition << endl; + } + } + + // update alignment cache & return success + UpdateAlignmentCache(); + return true; +} + +void BamMultiReaderPrivate::SetSortOrder(const BamMultiReader::SortOrder& order) { + + // skip if no change needed + if ( m_sortOrder == order ) return; + + // set new sort order + m_sortOrder = order; + + // create new alignment cache based on sort order + IBamMultiMerger* newAlignmentCache = CreateMergerForCurrentSortOrder(); + if ( newAlignmentCache == 0 ) return; // print error? 
+ + // copy old cache contents to new cache + while ( m_alignments->Size() > 0 ) { + ReaderAlignment value = m_alignments->TakeFirst(); // retrieves & 'pops' + newAlignmentCache->Add(value); + } + + // remove old cache structure & point to new cache + delete m_alignments; + m_alignments = newAlignmentCache; +} + +// splits the entire header into a list of strings +const vector<string> BamMultiReaderPrivate::SplitHeaderText(const string& headerText) const { + + stringstream header(headerText); + string item; + + vector<string> lines; + while ( getline(header, item) ) + lines.push_back(item); + return lines; +} + +// updates our alignment cache +void BamMultiReaderPrivate::UpdateAlignmentCache(void) { + + // skip if invalid alignment cache + if ( m_alignments == 0 ) return; + + // clear the cache + m_alignments->Clear(); + + // seed cache with fully-populated alignments + // further updates will fill with full/core-only as requested + m_isCoreMode = false; + + // iterate over readers + vector<ReaderAlignment>::iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + BamReader* reader = (*readerIter).first; + BamAlignment* alignment = (*readerIter).second; + if ( reader == 0 || alignment == 0 ) continue; + + // save next alignment from each reader in cache + SaveNextAlignment(reader, alignment); + } +} + +// ValidateReaders checks that all the readers point to BAM files representing +// alignments against the same set of reference sequences, and that the +// sequences are identically ordered. If these checks fail the operation of +// the multireader is undefined, so we force program exit. 
+void BamMultiReaderPrivate::ValidateReaders(void) const { + + // retrieve first reader data + const BamReader* firstReader = m_readers.front().first; + if ( firstReader == 0 ) return; + const RefVector firstReaderRefData = firstReader->GetReferenceData(); + const int firstReaderRefCount = firstReader->GetReferenceCount(); + const int firstReaderRefSize = firstReaderRefData.size(); + + // iterate over all readers + vector<ReaderAlignment>::const_iterator readerIter = m_readers.begin(); + vector<ReaderAlignment>::const_iterator readerEnd = m_readers.end(); + for ( ; readerIter != readerEnd; ++readerIter ) { + + // get current reader data + BamReader* reader = (*readerIter).first; + if ( reader == 0 ) continue; + const RefVector currentReaderRefData = reader->GetReferenceData(); + const int currentReaderRefCount = reader->GetReferenceCount(); + const int currentReaderRefSize = currentReaderRefData.size(); + + // init container iterators + RefVector::const_iterator firstRefIter = firstReaderRefData.begin(); + RefVector::const_iterator firstRefEnd = firstReaderRefData.end(); + RefVector::const_iterator currentRefIter = currentReaderRefData.begin(); + + // compare reference counts from BamReader ( & container size, in case of BR error) + if ( (currentReaderRefCount != firstReaderRefCount) || + (firstReaderRefSize != currentReaderRefSize) ) + { + cerr << "BamMultiReader ERROR: mismatched number of references in " << reader->GetFilename() + << " expected " << firstReaderRefCount + << " reference sequences but only found " << currentReaderRefCount << endl; + exit(1); + } + + // this will be ok; we just checked above that we have identically-sized sets of references + // here we simply check if they are all, in fact, equal in content + while ( firstRefIter != firstRefEnd ) { + const RefData& firstRef = (*firstRefIter); + const RefData& currentRef = (*currentRefIter); + + // compare reference name & length + if ( (firstRef.RefName != currentRef.RefName) || + 
(firstRef.RefLength != currentRef.RefLength) ) + { + cerr << "BamMultiReader ERROR: mismatched references found in " << reader->GetFilename() + << " expected: " << endl; + + // print first reader's reference data + RefVector::const_iterator refIter = firstReaderRefData.begin(); + RefVector::const_iterator refEnd = firstReaderRefData.end(); + for ( ; refIter != refEnd; ++refIter ) { + const RefData& entry = (*refIter); + cerr << entry.RefName << " " << entry.RefLength << endl; + } + + cerr << "but found: " << endl; + + // print current reader's reference data + refIter = currentReaderRefData.begin(); + refEnd = currentReaderRefData.end(); + for ( ; refIter != refEnd; ++refIter ) { + const RefData& entry = (*refIter); + cerr << entry.RefName << " " << entry.RefLength << endl; + } + + exit(1); + } + + // update iterators + ++firstRefIter; + ++currentRefIter; + } + } +} diff --git a/src/utils/BamTools/src/api/internal/BamMultiReader_p.h b/src/utils/BamTools/src/api/internal/BamMultiReader_p.h new file mode 100644 index 0000000000000000000000000000000000000000..b34fb0c583eb016167846d2cf937e14cab669c01 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamMultiReader_p.h @@ -0,0 +1,102 @@ +// *************************************************************************** +// BamMultiReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 13 March 2011 (DB) +// --------------------------------------------------------------------------- +// Functionality for simultaneously reading multiple BAM files +// ************************************************************************* + +#ifndef BAMMULTIREADER_P_H +#define BAMMULTIREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. 
This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <api/SamHeader.h> +#include <api/BamMultiReader.h> +#include <string> +#include <vector> + +namespace BamTools { +namespace Internal { + +class IBamMultiMerger; + +class BamMultiReaderPrivate { + + // constructor / destructor + public: + BamMultiReaderPrivate(void); + ~BamMultiReaderPrivate(void); + + // public interface + public: + + // file operations + void Close(void); + void CloseFile(const std::string& filename); + void CloseFiles(const std::vector<std::string>& filenames); + const std::vector<std::string> Filenames(void) const; + bool Jump(int refID, int position = 0); + bool Open(const std::vector<std::string>& filenames); + bool OpenFile(const std::string& filename); + void PrintFilenames(void) const; + bool Rewind(void); + bool SetRegion(const BamRegion& region); + + // access alignment data + bool GetNextAlignment(BamAlignment& al); + bool GetNextAlignmentCore(BamAlignment& al); + bool HasOpenReaders(void); + void SetSortOrder(const BamMultiReader::SortOrder& order); + + // access auxiliary data + SamHeader GetHeader(void) const; + std::string GetHeaderText(void) const; + int GetReferenceCount(void) const; + const BamTools::RefVector GetReferenceData(void) const; + int GetReferenceID(const std::string& refName) const; + + // BAM index operations + bool CreateIndexes(const BamIndex::IndexType& type = BamIndex::STANDARD); + bool HasIndexes(void) const; + bool LocateIndexes(const BamIndex::IndexType& preferredType = BamIndex::STANDARD); + bool OpenIndexes(const std::vector<std::string>& indexFilenames); + void SetIndexCacheMode(const BamIndex::IndexCacheMode mode); + + // 'internal' methods + public: + IBamMultiMerger* CreateMergerForCurrentSortOrder(void) const; + const std::string ExtractReadGroup(const std::string& headerLine) const; + bool HasAlignmentData(void) const; + bool LoadNextAlignment(BamAlignment& al); + 
BamTools::BamReader* OpenReader(const std::string& filename); + bool RewindReaders(void); + void SaveNextAlignment(BamTools::BamReader* reader, BamTools::BamAlignment* alignment); + const std::vector<std::string> SplitHeaderText(const std::string& headerText) const; + void UpdateAlignmentCache(void); + void ValidateReaders(void) const; + + // data members + public: + typedef std::pair<BamReader*, BamAlignment*> ReaderAlignment; + std::vector<ReaderAlignment> m_readers; + + IBamMultiMerger* m_alignments; + bool m_isCoreMode; + BamMultiReader::SortOrder m_sortOrder; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMMULTIREADER_P_H diff --git a/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.cpp b/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..89636bbf0ecf46a76b4ec5a63cff085e6b899a04 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.cpp @@ -0,0 +1,273 @@ +// *************************************************************************** +// BamRandomAccessController_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 5 April 2011(DB) +// --------------------------------------------------------------------------- +// Manages random access operations in a BAM file +// ************************************************************************** + +#include <api/BamIndex.h> +#include <api/internal/BamRandomAccessController_p.h> +#include <api/internal/BamReader_p.h> +#include <api/internal/BamIndexFactory_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +using namespace std; + +BamRandomAccessController::BamRandomAccessController(void) + : m_index(0) + , m_indexCacheMode(BamIndex::LimitedIndexCaching) + , m_hasAlignmentsInRegion(true) +{ } + +BamRandomAccessController::~BamRandomAccessController(void) { + Close(); +} + +void BamRandomAccessController::AdjustRegion(const int& referenceCount) { + + // skip if no index available + if ( m_index == 0 ) + return; + + // see if any references in region have alignments + m_hasAlignmentsInRegion = false; + int currentId = m_region.LeftRefID; + const int rightBoundRefId = ( m_region.isRightBoundSpecified() ? 
m_region.RightRefID : referenceCount - 1 ); + while ( currentId <= rightBoundRefId ) { + m_hasAlignmentsInRegion = m_index->HasAlignments(currentId); + if ( m_hasAlignmentsInRegion ) break; + ++currentId; + } + + // if no data found on any reference in region + if ( !m_hasAlignmentsInRegion ) + return; + + // if left bound of desired region had no data, use first reference that had data + // otherwise, leave requested region as-is + if ( currentId != m_region.LeftRefID ) { + m_region.LeftRefID = currentId; + m_region.LeftPosition = 0; + } +} + +// returns alignments' "RegionState": { Before|Overlaps|After } current region +BamRandomAccessController::RegionState +BamRandomAccessController::AlignmentState(const BamAlignment& alignment) const { + + // if region has no left bound at all + if ( !m_region.isLeftBoundSpecified() ) + return OverlapsRegion; + + // handle unmapped reads - return AFTER region to halt processing + if ( alignment.RefID == -1 ) + return AfterRegion; + + // if alignment is on any reference before left bound reference + if ( alignment.RefID < m_region.LeftRefID ) + return BeforeRegion; + + // if alignment is on left bound reference + else if ( alignment.RefID == m_region.LeftRefID ) { + + // if alignment starts at or after left bound position + if ( alignment.Position >= m_region.LeftPosition) { + + if ( m_region.isRightBoundSpecified() && // right bound is specified AND + m_region.LeftRefID == m_region.RightRefID && // left & right bounds on same reference AND + alignment.Position > m_region.RightPosition ) // alignment starts after right bound position + return AfterRegion; + + // otherwise, alignment overlaps region + else return OverlapsRegion; + } + + // alignment starts before left bound position + else { + + // if alignment overlaps left bound position + if ( alignment.GetEndPosition() >= m_region.LeftPosition ) + return OverlapsRegion; + else + return BeforeRegion; + } + } + + // otherwise alignment is on a reference after left bound 
reference + else { + + // if region has a right bound + if ( m_region.isRightBoundSpecified() ) { + + // alignment is on any reference between boundaries + if ( alignment.RefID < m_region.RightRefID ) + return OverlapsRegion; + + // alignment is on any reference after right boundary + else if ( alignment.RefID > m_region.RightRefID ) + return AfterRegion; + + // alignment is on right bound reference + else { + + // if alignment starts on or before right bound position + if ( alignment.Position <= m_region.RightPosition ) + return OverlapsRegion; + else + return AfterRegion; + } + } + + // otherwise, alignment starts after left bound and there is no right bound + else return OverlapsRegion; + } +} + +void BamRandomAccessController::Close(void) { + ClearIndex(); + ClearRegion(); +} + +void BamRandomAccessController::ClearIndex(void) { + delete m_index; + m_index = 0; +} + +void BamRandomAccessController::ClearRegion(void) { + m_region.clear(); + m_hasAlignmentsInRegion = true; +} + +bool BamRandomAccessController::CreateIndex(BamReaderPrivate* reader, + const BamIndex::IndexType& type) { + + // skip if reader is invalid + if ( reader == 0 ) + return false; + + // create new index of requested type + BamIndex* newIndex = BamIndexFactory::CreateIndexOfType(type, reader); + if ( newIndex == 0 ) { + cerr << "BamRandomAccessController ERROR: could not create index of type " << type << endl; + return false; + } + + // attempt to build index from current BamReader file + if ( !newIndex->Create() ) { + cerr << "BamRandomAccessController ERROR: could not create index for BAM file: " + << reader->Filename() << endl; + return false; + } + + // save new index + SetIndex(newIndex); + + // set new index's cache mode & return success + newIndex->SetCacheMode(m_indexCacheMode); + return true; +} + +bool BamRandomAccessController::HasIndex(void) const { + return ( m_index != 0 ); +} + +bool BamRandomAccessController::HasRegion(void) const { + return ( !m_region.isNull() ); +} + +bool 
BamRandomAccessController::IndexHasAlignmentsForReference(const int& refId) { + return m_index->HasAlignments(refId); +} + +bool BamRandomAccessController::LocateIndex(BamReaderPrivate* reader, + const BamIndex::IndexType& preferredType) +{ + // look up index filename, deferring to preferredType if possible + const string& indexFilename = BamIndexFactory::FindIndexFilename(reader->Filename(), preferredType); + + // if no index file found (of any type) + if ( indexFilename.empty() ) { + cerr << "BamRandomAccessController WARNING: " + << "could not find index file for BAM: " + << reader->Filename() << endl; + return false; + } + + // otherwise open & use index file that was found + return OpenIndex(indexFilename, reader); +} + +bool BamRandomAccessController::OpenIndex(const string& indexFilename, BamReaderPrivate* reader) { + + // attempt create new index of type based on filename + BamIndex* index = BamIndexFactory::CreateIndexFromFilename(indexFilename, reader); + if ( index == 0 ) { + cerr << "BamRandomAccessController ERROR: could not create index for file: " << indexFilename << endl; + return false; + } + + // set cache mode + index->SetCacheMode(m_indexCacheMode); + + // attempt to load data from index file + if ( !index->Load(indexFilename) ) { + cerr << "BamRandomAccessController ERROR: could not load index data from file: " << indexFilename << endl; + return false; + } + + // save new index & return success + SetIndex(index); + return true; +} + +bool BamRandomAccessController::RegionHasAlignments(void) const { + return m_hasAlignmentsInRegion; +} + +void BamRandomAccessController::SetIndex(BamIndex* index) { + if ( m_index ) + ClearIndex(); + m_index = index; +} + +void BamRandomAccessController::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + m_indexCacheMode = mode; + if ( m_index ) + m_index->SetCacheMode(mode); +} + +bool BamRandomAccessController::SetRegion(BamReaderPrivate* reader, + const BamRegion& region, + const int& referenceCount) 
+{ + // store region + m_region = region; + + // cannot jump when no index is available + if ( !HasIndex() ) + return false; + + // adjust region as necessary to reflect where data actually begins + AdjustRegion(referenceCount); + + // if no data present, return true + // * Not an error, but future attempts to access alignments in this region will not return data + // Returning true is useful in a BamMultiReader setting where some BAM files may + // lack alignments in regions where other BAMs do have data. + if ( !m_hasAlignmentsInRegion ) + return true; + + // return success/failure of jump to specified region, + // + // * Index::Jump() is allowed to modify the m_hasAlignmentsInRegion flag + // This covers 'corner case' where a region is requested that lies beyond the last + // alignment on a reference. If this occurs, any subsequent calls to GetNextAlignment[Core] + // will not return data. BamMultiReader will still be able to successfully pull alignments + // from a region from multiple files even if one or more have no data. + return m_index->Jump(m_region, &m_hasAlignmentsInRegion); +} diff --git a/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.h b/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.h new file mode 100644 index 0000000000000000000000000000000000000000..372ea4a5e9a300dfc24ff2e2f4ea7ccdac92f21a --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamRandomAccessController_p.h @@ -0,0 +1,94 @@ +// *************************************************************************** +// BamRandomAccessController_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 24 February 2011 (DB) +// --------------------------------------------------------------------------- +// Manages random access operations in a BAM file +// *************************************************************************** + +#ifndef BAMRACONTROLLER_P_H +#define BAMRACONTROLLER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <api/BamAux.h> +#include <api/BamIndex.h> + +namespace BamTools { + +class BamAlignment; + +namespace Internal { + +class BamReaderPrivate; + +class BamRandomAccessController { + + // enums + // RegionState: where an alignment lies relative to the current region (result of AlignmentState()) + public: enum RegionState { BeforeRegion = 0 + , OverlapsRegion + , AfterRegion + }; + + // ctor & dtor + public: + BamRandomAccessController(void); + ~BamRandomAccessController(void); + + // general interface + public: + void Close(void); + + // index operations + public: + // + void ClearIndex(void); + bool CreateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& type); + bool HasIndex(void) const; + bool IndexHasAlignmentsForReference(const int& refId); + bool LocateIndex(BamReaderPrivate* reader, const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename, BamReaderPrivate* reader); + void SetIndex(BamIndex* index); + void SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // region operations + public: + void ClearRegion(void); + bool HasRegion(void) const; + RegionState AlignmentState(const BamAlignment& alignment) const; + bool RegionHasAlignments(void) const; + bool SetRegion(BamReaderPrivate* reader, + const BamRegion& region, + const int& referenceCount); + + // 'internal' methods + public: + // adjusts requested region if necessary (depending on where data
actually begins) + void AdjustRegion(const int& referenceCount); + + // data members + // NOTE(review): m_index is an owning raw pointer and this class declares no copy constructor + // or assignment operator, so copying a controller would leave two objects holding the same pointer + private: + + // index data + BamIndex* m_index; // owns index, not a copy - responsible for deleting + BamIndex::IndexCacheMode m_indexCacheMode; + + // region data + // (m_hasAlignmentsInRegion is handed to BamIndex::Jump() by SetRegion(), which may change it) + BamRegion m_region; + bool m_hasAlignmentsInRegion; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMRACONTROLLER_P_H diff --git a/src/utils/BamTools/src/api/internal/BamReader_p.cpp b/src/utils/BamTools/src/api/internal/BamReader_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5daa1bf2950c6c7da9af4cc6c5f9bd4b40ca41c3 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamReader_p.cpp @@ -0,0 +1,381 @@ +// *************************************************************************** +// BamReader_p.cpp (c) 2009 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 10 May 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#include <api/BamConstants.h> +#include <api/BamReader.h> +#include <api/internal/BamHeader_p.h> +#include <api/internal/BamRandomAccessController_p.h> +#include <api/internal/BamReader_p.h> +#include <api/internal/BamStandardIndex_p.h> +#include <api/internal/BamToolsIndex_p.h> +#include <api/internal/BgzfStream_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <algorithm> +#include <iostream> +#include <iterator> +#include <vector> +using namespace std; + +// constructor +// (records the parent reader and the host byte order; no file is opened here) +BamReaderPrivate::BamReaderPrivate(BamReader* parent) + : m_alignmentsBeginOffset(0) + , m_parent(parent) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// destructor (delegates all cleanup to Close()) +BamReaderPrivate::~BamReaderPrivate(void) { + Close(); +}
+ +// closes the BAM file +// (also invoked by the destructor, and by Open() before opening a new file or when loading fails) +void BamReaderPrivate::Close(void) { + + // clear header & reference data + m_references.clear(); + m_header.Clear(); + + // close internal components: random-access controller first, then the BGZF stream + m_randomAccessController.Close(); + m_stream.Close(); + + // clear filename + m_filename.clear(); +} + +// creates an index file of requested type on current BAM file +// (returns false immediately if no BAM file is open) +bool BamReaderPrivate::CreateIndex(const BamIndex::IndexType& type) { + if ( !IsOpen() ) return false; + return m_randomAccessController.CreateIndex(this, type); +} + +// return path & filename of current BAM file +const string BamReaderPrivate::Filename(void) const { + return m_filename; +} + +// return header data as std::string +string BamReaderPrivate::GetHeaderText(void) const { + return m_header.ToString(); +} + +// return header data as SamHeader object +SamHeader BamReaderPrivate::GetSamHeader(void) const { + return m_header.ToSamHeader(); +} + +// get next alignment (with character data fully parsed) +// returns false when no further alignment is available or when BuildCharData() fails +bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { + + // if valid alignment found + if ( GetNextAlignmentCore(alignment) ) { + + // store alignment's "source" filename + alignment.Filename = m_filename; + + // return success/failure of parsing char data + return alignment.BuildCharData(); + } + + // no valid alignment found + return false; +} + +// retrieves next available alignment core data (returns success/fail) +// ** DOES NOT populate any character data fields (read name, bases, qualities, tag data, filename) +// these can be accessed, if necessary, from the supportData +// useful for operations requiring ONLY positional or other alignment-related information +bool BamReaderPrivate::GetNextAlignmentCore(BamAlignment& alignment) { + + // skip if region is set but has no alignments + if ( m_randomAccessController.HasRegion() && + !m_randomAccessController.RegionHasAlignments() ) + { + return false; + } + + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; + + // check
alignment's region-overlap state + BamRandomAccessController::RegionState state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; + + // read until overlap is found + while ( state != BamRandomAccessController::OverlapsRegion ) { + + // if can't read next alignment + if ( !LoadNextAlignment(alignment) ) + return false; + + // check alignment's region-overlap state + state = m_randomAccessController.AlignmentState(alignment); + + // if alignment starts after region, no need to keep reading + if ( state == BamRandomAccessController::AfterRegion ) + return false; + } + + // if we get here, we found the next 'valid' alignment + // (e.g. overlaps current region if one was set, simply the next alignment if not) + alignment.SupportData.HasCoreOnly = true; + return true; +} + +int BamReaderPrivate::GetReferenceCount(void) const { + return m_references.size(); +} + +const RefVector& BamReaderPrivate::GetReferenceData(void) const { + return m_references; +} + +// returns RefID (zero-based position in the reference list) for given RefName, +// or -1 if no reference has that name; the first match wins if names repeat +int BamReaderPrivate::GetReferenceID(const string& refName) const { + + // scan the reference data directly (no temporary copy of all names is needed) + RefVector::const_iterator refBegin = m_references.begin(); + RefVector::const_iterator refEnd = m_references.end(); + for ( RefVector::const_iterator refIter = refBegin; refIter != refEnd; ++refIter ) { + + // return 'index-of' the first reference whose name matches + if ( (*refIter).RefName == refName ) + return (int)distance(refBegin, refIter); + } + + // not found + return -1; +} + +bool BamReaderPrivate::HasIndex(void) const { + return m_randomAccessController.HasIndex(); +} + +bool BamReaderPrivate::IsOpen(void) const { + return m_stream.IsOpen; +} + +// load BAM header data +bool BamReaderPrivate::LoadHeaderData(void) { +
return m_header.Load(&m_stream); +} + +// populates BamAlignment with alignment data under file pointer, returns success/fail +// (fills core fields, the raw character data and the CIGAR operations; returns false when +// no further record can be read or when the record is too short to be consistent) +bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment) { + + // read in the 'block length' value, make sure it's not zero + // (buffer is zero-filled first, so a read that stores nothing leaves a block length of 0 + // instead of whatever happened to be on the stack) + char buffer[sizeof(uint32_t)] = { 0 }; + m_stream.Read(buffer, sizeof(uint32_t)); + alignment.SupportData.BlockLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(alignment.SupportData.BlockLength); + if ( alignment.SupportData.BlockLength == 0 ) return false; + + // a block shorter than the fixed-size core cannot be valid; rejecting it here also keeps + // 'dataLength' (computed below) from wrapping around to a huge unsigned value + if ( alignment.SupportData.BlockLength < (unsigned int)Constants::BAM_CORE_SIZE ) return false; + + // read in core alignment data, make sure the right size of data was read + char x[Constants::BAM_CORE_SIZE]; + if ( m_stream.Read(x, Constants::BAM_CORE_SIZE) != Constants::BAM_CORE_SIZE ) + return false; + + // swap core endian-ness if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < Constants::BAM_CORE_SIZE; i+=sizeof(uint32_t) ) + BamTools::SwapEndian_32p(&x[i]); + } + + // set BamAlignment 'core' and 'support' data + alignment.RefID = BamTools::UnpackSignedInt(&x[0]); + alignment.Position = BamTools::UnpackSignedInt(&x[4]); + + unsigned int tempValue = BamTools::UnpackUnsignedInt(&x[8]); + alignment.Bin = tempValue >> 16; + alignment.MapQuality = tempValue >> 8 & 0xff; + alignment.SupportData.QueryNameLength = tempValue & 0xff; + + tempValue = BamTools::UnpackUnsignedInt(&x[12]); + alignment.AlignmentFlag = tempValue >> 16; + alignment.SupportData.NumCigarOperations = tempValue & 0xffff; + + alignment.SupportData.QuerySequenceLength = BamTools::UnpackUnsignedInt(&x[16]); + alignment.MateRefID = BamTools::UnpackSignedInt(&x[20]); + alignment.MatePosition = BamTools::UnpackSignedInt(&x[24]); + alignment.InsertSize = BamTools::UnpackSignedInt(&x[28]); + + // set BamAlignment length + alignment.Length = alignment.SupportData.QuerySequenceLength; + + // read in character data - make sure proper data size was read + bool readCharDataOK = false; + const unsigned int dataLength =
alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE; + char* allCharData = (char*)calloc(sizeof(char), dataLength); + + // bail out if the buffer could not be allocated + // (calloc may legitimately return 0 for a zero-byte request, so only a non-empty request counts) + if ( allCharData == 0 && dataLength != 0 ) + return false; + + if ( m_stream.Read(allCharData, dataLength) == (signed int)dataLength ) { + + // store 'allCharData' in supportData structure + alignment.SupportData.AllCharData.assign((const char*)allCharData, dataLength); + + // save CIGAR ops + // need to calculate this here so that BamAlignment::GetEndPosition() performs correctly, + // even when GetNextAlignmentCore() is called + const unsigned int cigarDataOffset = alignment.SupportData.QueryNameLength; + const unsigned int cigarDataLength = (unsigned int)alignment.SupportData.NumCigarOperations * (unsigned int)sizeof(uint32_t); + uint32_t* cigarData = (uint32_t*)(allCharData + cigarDataOffset); + CigarOp op; + alignment.CigarData.clear(); + + // only parse the CIGAR ops if they lie entirely inside the data that was just read; + // a corrupt record would otherwise make the loop below read past the end of 'allCharData' + // (both operands are small: name length <= 0xff and op count <= 0xffff, so the sum cannot overflow) + if ( cigarDataOffset + cigarDataLength <= dataLength ) { + + // set success flag + readCharDataOK = true; + + alignment.CigarData.reserve(alignment.SupportData.NumCigarOperations); + for ( unsigned int i = 0; i < alignment.SupportData.NumCigarOperations; ++i ) { + + // swap endian-ness if necessary + if ( m_isBigEndian ) BamTools::SwapEndian_32(cigarData[i]); + + // build CigarOp structure + op.Length = (cigarData[i] >> Constants::BAM_CIGAR_SHIFT); + op.Type = Constants::BAM_CIGAR_LOOKUP[ (cigarData[i] & Constants::BAM_CIGAR_MASK) ]; + + // save CigarOp + alignment.CigarData.push_back(op); + } + } + } + + // clean up & return parsing success/failure + free(allCharData); + return readCharDataOK; +} + +// loads reference data from BAM file +bool BamReaderPrivate::LoadReferenceData(void) { + + // get number of reference sequences + // (buffer is zero-filled first, so a read that stores nothing yields a count of 0) + char buffer[sizeof(uint32_t)] = { 0 }; + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t numberRefSeqs = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numberRefSeqs); + m_references.reserve((int)numberRefSeqs); + + // iterate over all references in header + for ( unsigned int i = 0; i != numberRefSeqs; ++i ) { + + // get length of reference name + m_stream.Read(buffer, sizeof(uint32_t)); + uint32_t refNameLength = BamTools::UnpackUnsignedInt(buffer); + if ( m_isBigEndian )
BamTools::SwapEndian_32(refNameLength); + char* refName = (char*)calloc(refNameLength, 1); + + // get reference name and reference sequence length + m_stream.Read(refName, refNameLength); + m_stream.Read(buffer, sizeof(int32_t)); + int32_t refLength = BamTools::UnpackSignedInt(buffer); + if ( m_isBigEndian ) BamTools::SwapEndian_32(refLength); + + // store data for reference + RefData aReference; + aReference.RefName = (string)((const char*)refName); + aReference.RefLength = refLength; + m_references.push_back(aReference); + + // clean up calloc-ed temp variable + free(refName); + } + + // return success + return true; +} + +bool BamReaderPrivate::LocateIndex(const BamIndex::IndexType& preferredType) { + return m_randomAccessController.LocateIndex(this, preferredType); +} + +// opens BAM file (and index) +bool BamReaderPrivate::Open(const string& filename) { + + // close current BAM file if open + if ( m_stream.IsOpen ) + Close(); + + // attempt to open BgzfStream for reading + if ( !m_stream.Open(filename, "rb") ) { + cerr << "BamReader ERROR: Could not open BGZF stream for " << filename << endl; + return false; + } + + // attempt to load header data + if ( !LoadHeaderData() ) { + cerr << "BamReader ERROR: Could not load header data for " << filename << endl; + Close(); + return false; + } + + // attempt to load reference data + if ( !LoadReferenceData() ) { + cerr << "BamReader ERROR: Could not load reference data for " << filename << endl; + Close(); + return false; + } + + // if all OK, store filename & offset of first alignment + m_filename = filename; + m_alignmentsBeginOffset = m_stream.Tell(); + + // return success + return true; +} + +bool BamReaderPrivate::OpenIndex(const std::string& indexFilename) { + return m_randomAccessController.OpenIndex(indexFilename, this); +} + +// returns BAM file pointer to beginning of alignment data +bool BamReaderPrivate::Rewind(void) { + + // attempt rewind to first alignment + if ( !m_stream.Seek(m_alignmentsBeginOffset) 
) + return false; + + // verify that we can read first alignment + // NOTE(review): because of this check, Rewind() returns false for a file in which no alignment + // can be loaded at the recorded offset, and the region is left untouched in that case + BamAlignment al; + if ( !LoadNextAlignment(al) ) + return false; + + // reset region + m_randomAccessController.ClearRegion(); + + // rewind back to beginning of first alignment + // return success/fail of seek + return m_stream.Seek(m_alignmentsBeginOffset); +} + +// moves the underlying stream to 'position'; returns the stream's success/failure +bool BamReaderPrivate::Seek(const int64_t& position) { + return m_stream.Seek(position); +} + +// hands 'index' over to the random-access controller +void BamReaderPrivate::SetIndex(BamIndex* index) { + m_randomAccessController.SetIndex(index); +} + +// change the index caching behavior +void BamReaderPrivate::SetIndexCacheMode(const BamIndex::IndexCacheMode& mode) { + m_randomAccessController.SetIndexCacheMode(mode); +} + +// sets current region & attempts to jump to it +// returns success/failure +// (the number of loaded references is passed along as the controller's 'referenceCount') +bool BamReaderPrivate::SetRegion(const BamRegion& region) { + return m_randomAccessController.SetRegion(this, region, m_references.size()); +} + +// returns the underlying stream's current position +int64_t BamReaderPrivate::Tell(void) const { + return m_stream.Tell(); +} diff --git a/src/utils/BamTools/src/api/internal/BamReader_p.h b/src/utils/BamTools/src/api/internal/BamReader_p.h new file mode 100644 index 0000000000000000000000000000000000000000..c0d07d88a863d84efa81c96328e70778de892cf9 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamReader_p.h @@ -0,0 +1,113 @@ +// *************************************************************************** +// BamReader_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved.
+// --------------------------------------------------------------------------- +// Last modified: 5 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for reading BAM files +// *************************************************************************** + +#ifndef BAMREADER_P_H +#define BAMREADER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <api/BamAlignment.h> +#include <api/BamIndex.h> +#include <api/BamReader.h> +#include <api/SamHeader.h> +#include <api/internal/BamHeader_p.h> +#include <api/internal/BamRandomAccessController_p.h> +#include <api/internal/BgzfStream_p.h> +#include <string> + +namespace BamTools { +namespace Internal { + +class BamReaderPrivate { + + // ctor & dtor + public: + BamReaderPrivate(BamReader* parent); + ~BamReaderPrivate(void); + + // BamReader interface + public: + + // file operations + void Close(void); + const std::string Filename(void) const; + bool IsOpen(void) const; + bool Open(const std::string& filename); + bool Rewind(void); + bool SetRegion(const BamRegion& region); + + // access alignment data + // (GetNextAlignment also parses character data; GetNextAlignmentCore leaves it unparsed) + bool GetNextAlignment(BamAlignment& alignment); + bool GetNextAlignmentCore(BamAlignment& alignment); + + // access auxiliary data + // (GetReferenceID returns -1 when no reference has the given name) + std::string GetHeaderText(void) const; + SamHeader GetSamHeader(void) const; + int GetReferenceCount(void) const; + const RefVector& GetReferenceData(void) const; + int GetReferenceID(const std::string& refName) const; + + // index operations + bool CreateIndex(const BamIndex::IndexType& type); + bool HasIndex(void) const; + bool LocateIndex(const BamIndex::IndexType& preferredType); + bool OpenIndex(const std::string& indexFilename); + void SetIndex(BamIndex* index); + void
SetIndexCacheMode(const BamIndex::IndexCacheMode& mode); + + // internal methods, but available as a BamReaderPrivate 'interface' + // + // these methods should only be used by BamTools::Internal classes + // (currently only used by the BamIndex subclasses) + public: + // retrieves header text from BAM file + bool LoadHeaderData(void); + // retrieves BAM alignment under file pointer + // (does no overlap checking or character data parsing) + bool LoadNextAlignment(BamAlignment& alignment); + // builds reference data structure from BAM file + bool LoadReferenceData(void); + // seek reader to file position + bool Seek(const int64_t& position); + // return reader's file position + int64_t Tell(void) const; + + // data members + public: + + // general BAM file data + // (m_alignmentsBeginOffset is the stream position recorded by Open() once header and + // reference data have been read; Rewind() seeks back to it) + int64_t m_alignmentsBeginOffset; + std::string m_filename; + RefVector m_references; + + // system data (set from BamTools::SystemIsBigEndian() in the constructor) + bool m_isBigEndian; + + // parent BamReader + BamReader* m_parent; + + // BamReaderPrivate components + BamHeader m_header; + BamRandomAccessController m_randomAccessController; + BgzfStream m_stream; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMREADER_P_H diff --git a/src/utils/BamTools/src/api/internal/BamStandardIndex_p.cpp b/src/utils/BamTools/src/api/internal/BamStandardIndex_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6cb7b0a7ea6a7db1534addeeca3f309955add454 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamStandardIndex_p.cpp @@ -0,0 +1,974 @@ +// *************************************************************************** +// BamStandardIndex_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved.
+// --------------------------------------------------------------------------- +// Last modified: 16 June 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#include <api/BamAlignment.h> +#include <api/internal/BamReader_p.h> +#include <api/internal/BamStandardIndex_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <algorithm> +#include <iostream> +using namespace std; + +// static BamStandardIndex constants +// BAM_LIDX_SHIFT: a start position is shifted right by this amount to pick its linear-offset slot (see CalculateMinOffset) +// SIZEOF_ALIGNMENTCHUNK: one chunk is stored as two 64-bit values (start, stop) +const int BamStandardIndex::MAX_BIN = 37450; // =(8^6-1)/7+1 +const int BamStandardIndex::BAM_LIDX_SHIFT = 14; +const string BamStandardIndex::BAI_EXTENSION = ".bai"; +const char* const BamStandardIndex::BAI_MAGIC = "BAI\1"; +const int BamStandardIndex::SIZEOF_ALIGNMENTCHUNK = sizeof(uint64_t)*2; +const int BamStandardIndex::SIZEOF_BINCORE = sizeof(uint32_t) + sizeof(int32_t); +const int BamStandardIndex::SIZEOF_LINEAROFFSET = sizeof(uint64_t); + +// ctor +// (starts with no index file open and no I/O buffer allocated) +BamStandardIndex::BamStandardIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_indexStream(0) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_buffer(0) + , m_bufferLength(0) +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor (delegates cleanup to CloseFile()) +BamStandardIndex::~BamStandardIndex(void) { + CloseFile(); +} + +// translates 'region' into begin/end coordinates on its left reference +// returns false if the left position lies beyond the reference length +bool BamStandardIndex::AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end) { + + // retrieve references from reader + const RefVector& references = m_reader->GetReferenceData(); + + // make sure left-bound position is valid + // NOTE(review): if LeftPosition is a signed type, a negative value passes this check and + // becomes a very large unsigned 'begin' below - confirm callers never pass one + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + return false; + + // set region 'begin' + begin = (unsigned int)region.LeftPosition; + + // if right bound specified AND left&right bounds are on same reference + // OK to use right bound
position as region 'end' + if ( region.isRightBoundSpecified() && ( region.LeftRefID == region.RightRefID ) ) + end = (unsigned int)region.RightPosition; + + // otherwise, set region 'end' to last reference base + else end = (unsigned int)references.at(region.LeftRefID).RefLength - 1; + + // return success + return true; +} + +// collects into 'candidateBins' the ID of every bin that can hold alignments overlapping [begin, end] +void BamStandardIndex::CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + set<uint16_t>& candidateBins) +{ + // initialize list, bin '0' is always a valid bin + candidateBins.insert(0); + + // get rest of bins that contain this region + // (one loop per bin level: bins cover 2^26, 2^23, 2^20, 2^17 and 2^14 bases respectively, + // and each level's IDs start at the constant added in that loop) + unsigned int k; + for (k = 1 + (begin>>26); k <= 1 + (end>>26); ++k) { candidateBins.insert(k); } + for (k = 9 + (begin>>23); k <= 9 + (end>>23); ++k) { candidateBins.insert(k); } + for (k = 73 + (begin>>20); k <= 73 + (end>>20); ++k) { candidateBins.insert(k); } + for (k = 585 + (begin>>17); k <= 585 + (end>>17); ++k) { candidateBins.insert(k); } + for (k = 4681 + (begin>>14); k <= 4681 + (end>>14); ++k) { candidateBins.insert(k); } +} + +// walks the bins stored for one reference and appends to 'offsets' the start offset of every +// chunk that belongs to a candidate bin and whose stop offset is at least 'minOffset' +// (bins that are found are removed from 'candidateBins'); returns false on a seek/read failure +bool BamStandardIndex::CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + set<uint16_t>& candidateBins, + vector<int64_t>& offsets) +{ + // attempt seek to first bin + if ( !Seek(refSummary.FirstBinFilePosition, SEEK_SET) ) + return false; + + // iterate over reference bins + uint32_t binId; + int32_t numAlignmentChunks; + set<uint16_t>::iterator candidateBinIter; + for ( int i = 0; i < refSummary.NumBins; ++i ) { + + // read bin contents (if successful, alignment chunks are now in m_buffer) + if ( !ReadBinIntoBuffer(binId, numAlignmentChunks) ) + return false; + + // see if bin is a 'candidate bin' + candidateBinIter = candidateBins.find(binId); + + // if not, move on to next bin + if ( candidateBinIter == candidateBins.end() ) + continue; + + // otherwise, check bin's contents for overlap + else { + + unsigned int offset = 0; + uint64_t chunkStart; + uint64_t chunkStop; + + // iterate over alignment
chunks + for (int j = 0; j < numAlignmentChunks; ++j ) { + + // read chunk start & stop from buffer + memcpy((char*)&chunkStart, m_buffer+offset, sizeof(uint64_t)); + offset += sizeof(uint64_t); + memcpy((char*)&chunkStop, m_buffer, sizeof(uint64_t)); + offset += sizeof(uint64_t); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(chunkStart); + SwapEndian_64(chunkStop); + } + + // store alignment chunk's start offset + // if its stop offset is larger than our 'minOffset' + if ( chunkStop >= minOffset ) + offsets.push_back(chunkStart); + } + + // 'pop' bin ID from candidate bins set + candidateBins.erase(candidateBinIter); + + // quit if no more candidates + if ( candidateBins.empty() ) + break; + } + } + + // return success + return true; +} + +uint64_t BamStandardIndex::CalculateMinOffset(const BaiReferenceSummary& refSummary, + const uint32_t& begin) +{ + // if no linear offsets exist, return 0 + if ( refSummary.NumLinearOffsets == 0 ) + return 0; + + // if 'begin' starts beyond last linear offset, use the last linear offset as minimum + // else use the offset corresponding to the requested start position + const int shiftedBegin = begin>>BamStandardIndex::BAM_LIDX_SHIFT; + if ( shiftedBegin >= refSummary.NumLinearOffsets ) + return LookupLinearOffset( refSummary, refSummary.NumLinearOffsets-1 ); + else + return LookupLinearOffset( refSummary, shiftedBegin ); +} + +void BamStandardIndex::CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new char[bufferLength]; + } + } catch ( std::bad_alloc ) { + cerr << "BamStandardIndex ERROR: out of memory when allocating " + << requestedBytes << " byes" << endl; + exit(1); + } +} + +void BamStandardIndex::CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes) +{ + try { + if ( 
requestedBytes > bufferLength ) { + bufferLength = requestedBytes + 10; + delete[] buffer; + buffer = new unsigned char[bufferLength]; + } + } catch ( std::bad_alloc ) { + cerr << "BamStandardIndex ERROR: out of memory when allocating " + << requestedBytes << " byes" << endl; + exit(1); + } +} + +bool BamStandardIndex::CheckMagicNumber(void) { + + // check 'magic number' to see if file is BAI index + char magic[4]; + size_t elementsRead = fread(magic, sizeof(char), 4, m_indexStream); + if ( elementsRead != 4 ) { + cerr << "BamStandardIndex ERROR: could not read format 'magic number'" << endl; + return false; + } + + // compare to expected value + if ( strncmp(magic, BamStandardIndex::BAI_MAGIC, 4) != 0 ) { + cerr << "BamStandardIndex ERROR: invalid format" << endl; + return false; + } + + // otherwise OK + return true; +} + +void BamStandardIndex::ClearReferenceEntry(BaiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Bins.clear(); + refEntry.LinearOffsets.clear(); +} + +void BamStandardIndex::CloseFile(void) { + + // close file stream + if ( IsFileOpen() ) + fclose(m_indexStream); + + // clear index file summary data + m_indexFileSummary.clear(); + + // clean up I/O buffer + delete[] m_buffer; + m_buffer = 0; + m_bufferLength = 0; +} + +// builds index from associated BAM file & writes out to index file +bool BamStandardIndex::Create(void) { + + // return false if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + cerr << "BamStandardIndex ERROR: BamReader is not open" + << ", aborting index creation" << endl; + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + cerr << "BamStandardIndex ERROR: could not rewind BamReader to create index" + << ", aborting index creation" << endl; + return false; + } + + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + if ( !OpenFile(indexFilename, "w+b") ) { + cerr << "BamStandardIndex ERROR: could not open ouput 
index file: " << indexFilename + << ", aborting index creation" << endl; + return false; + } + + // initialize BaiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + ReserveForSummary(numReferences); + + // initialize output file + // (createdOk accumulates the result of every write below; a failed write does not stop the loop) + bool createdOk = true; + createdOk &= WriteHeader(); + + // set up bin, ID, offset, & coordinate markers + // (0xffffffff serves as the 'not set yet' marker; stored in the int32_t markers it is + // presumably read back as -1 - this relies on the usual two's-complement conversion) + const uint32_t defaultValue = 0xffffffffu; + uint32_t currentBin = defaultValue; + uint32_t lastBin = defaultValue; + int32_t currentRefID = defaultValue; + int32_t lastRefID = defaultValue; + uint64_t currentOffset = (uint64_t)m_reader->Tell(); + uint64_t lastOffset = currentOffset; + int32_t lastPosition = defaultValue; + + // iterate through alignments in BAM file + BamAlignment al; + BaiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // changed to new reference + if ( lastRefID != al.RefID ) { + + // if not first reference, save previous reference data + if ( lastRefID != (int32_t)defaultValue ) { + + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but *NOT* including) lastRefID & al.RefID + for ( int i = lastRefID+1; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + + // update bin markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + } + + // first pass + // write any empty references up to (but *NOT* including) al.RefID + else { + for ( int i = 0; i < al.RefID; ++i ) { + BaiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + } + + // update reference markers + refEntry.ID = al.RefID; + lastRefID = al.RefID; + lastBin = defaultValue; + } + + // if lastPosition greater than current alignment position - file not sorted properly + else if ( lastPosition >
al.Position ) { + cerr << "BamStandardIndex ERROR: BAM file is not properly sorted by coordinate" + << ", aborting index creation" + << endl + << "At alignment: " << al.Name + << " : previous position " << lastPosition + << " > this alignment position " << al.Position + << " on reference id: " << al.RefID << endl; + return false; + } + + // NOTE(review): BamReaderPrivate::LoadNextAlignment() does not assign al.Name, so the name + // printed in the message above is presumably empty - confirm + + // if alignment's ref ID is valid & its bin is not a 'leaf' + if ( (al.RefID >= 0) && (al.Bin < 4681) ) + SaveLinearOffsetEntry(refEntry.LinearOffsets, al.Position, al.GetEndPosition(), lastOffset); + + // changed to new BAI bin + if ( al.Bin != lastBin ) { + + // if not first bin on reference, save previous bin data + if ( currentBin != defaultValue ) + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + + // update markers + currentOffset = lastOffset; + currentBin = al.Bin; + lastBin = al.Bin; + currentRefID = al.RefID; + + // if invalid RefID, break out + if ( currentRefID < 0 ) + break; + } + + // make sure that current file pointer is beyond lastOffset + if ( m_reader->Tell() <= (int64_t)lastOffset ) { + cerr << "BamStandardIndex ERROR: calculating offsets failed" + << ", aborting index creation" << endl; + return false; + } + + // update lastOffset & lastPosition + lastOffset = m_reader->Tell(); + lastPosition = al.Position; + } + + // after finishing alignments, if any data was read, check: + // NOTE(review): when the loop above is left through the 'invalid RefID' break, currentRefID is + // negative, so this block (including the trailing empty references) is skipped; the same holds + // for a file without alignments - confirm this is the intended index layout for those inputs + if ( currentRefID >= 0 ) { + + // store last alignment chunk to its bin, then write last reference entry with data + SaveAlignmentChunkToBin(refEntry.Bins, currentBin, currentOffset, lastOffset); + createdOk &= WriteReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for ( int i = currentRefID+1; i < numReferences; ++i ) { + BaiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + } + + // rewind reader now that we're done building + createdOk &= m_reader->Rewind(); + + // return result + return createdOk; +} + +// returns format's file
extension +const string BamStandardIndex::Extension(void) { + return BamStandardIndex::BAI_EXTENSION; +} + +bool BamStandardIndex::GetOffsets(const BamRegion& region, vector<int64_t>& offsets) { + + // cannot calculate offsets if unknown/invalid reference ID requested + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + return false; + + // retrieve index summary for left bound reference + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(region.LeftRefID); + + // set up region boundaries based on actual BamReader data + uint32_t begin; + uint32_t end; + if ( !AdjustRegion(region, begin, end) ) { + cerr << "BamStandardIndex ERROR: cannot calculate offsets on invalid region" << endl; + return false; + } + + // retrieve all candidate bin IDs for region + set<uint16_t> candidateBins; + CalculateCandidateBins(begin, end, candidateBins); + + // use reference's linear offsets to calculate the minimum offset + // that must be considered to find overlap + const uint64_t& minOffset = CalculateMinOffset(refSummary, begin); + + // attempt to use reference summary, minOffset, & candidateBins to calculate offsets + // (finding no data is not an error; only a seek/read failure is) + if ( !CalculateCandidateOffsets(refSummary, minOffset, candidateBins, offsets) ) { + cerr << "BamStandardIndex ERROR: could not calculate candidate offsets for requested region" << endl; + return false; + } + + // ensure that offsets are sorted before returning + sort( offsets.begin(), offsets.end() ); + + // return success + return true; +} + +// returns whether reference has alignments or not +// (judged by the number of bins recorded for it; false for an out-of-range reference ID) +bool BamStandardIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BaiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBins > 0 ); +} + +// an index file counts as open while m_indexStream is non-null +bool BamStandardIndex::IsFileOpen(void) const { + return ( m_indexStream != 0 ); +} + +// attempts to use index data to jump
to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available after the jump position +bool BamStandardIndex::Jump(const BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear out flag + *hasAlignmentsInRegion = false; + + // skip if reader is not valid or is not open + if ( m_reader == 0 || !m_reader->IsOpen() ) + return false; + + // calculate offsets for this region + vector<int64_t> offsets; + if ( !GetOffsets(region, offsets) ) { + cerr << "BamStandardIndex ERROR: could not jump" + << ", unable to retrieve offsets for region" << endl; + return false; + } + + // iterate through candidate offsets + BamAlignment al; + vector<int64_t>::const_iterator offsetIter = offsets.begin(); + vector<int64_t>::const_iterator offsetEnd = offsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter) { + + // attempt seek + if ( !m_reader->Seek(*offsetIter) ) { + cerr << "BamStandardIndex ERROR: could not jump" + << ", there was a problem seeking in BAM file" << endl; + return false; + } + + // load first available alignment, setting flag to true if data exists + *hasAlignmentsInRegion = m_reader->LoadNextAlignment(al); + + // if this alignment corresponds to desired position + // return success of seeking back to the offset before the 'current offset' (to cover overlaps) + if ( ((al.RefID == region.LeftRefID) && + ((al.Position + al.Length) > region.LeftPosition)) || + (al.RefID > region.LeftRefID) ) + { + if ( offsetIter != offsets.begin() ) + --offsetIter; + return m_reader->Seek(*offsetIter); + } + } + + // return success (no offset data is not an error, + // but hasAlignments flag will be marked accordingly) + return true; +} + +// loads existing data from file into memory +bool BamStandardIndex::Load(const std::string& filename) { + + // attempt open index file (read-only) + if ( !OpenFile(filename, "rb") ) { + cerr << 
"BamStandardIndex ERROR: could not open input index file: " << filename + << ", aborting index load" << endl; + return false; + } + + // if invalid format 'magic number', close & return failure + if ( !CheckMagicNumber() ) { + cerr << "BamStandardIndex ERROR: unexpected format for index file: " << filename + << ", aborting index load" << endl; + CloseFile(); + return false; + } + + // attempt to load index file summary, return success/failure + if ( !SummarizeIndexFile() ) { + cerr << "BamStandardIndex ERROR: could not generate a summary of index file " << filename + << ", aborting index load" << endl; + CloseFile(); + return false; + } + + // if we get here, index summary is loaded OK + return true; +} + +uint64_t BamStandardIndex::LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index) { + + // attempt seek to proper index file position + const int64_t linearOffsetFilePosition = (int64_t)refSummary.FirstLinearOffsetFilePosition + + index*BamStandardIndex::SIZEOF_LINEAROFFSET; + if ( !Seek(linearOffsetFilePosition, SEEK_SET) ) + return 0; + + // read linear offset from BAI file + uint64_t linearOffset(0); + if ( !ReadLinearOffset(linearOffset) ) + return 0; + return linearOffset; +} + +void BamStandardIndex::MergeAlignmentChunks(BaiAlignmentChunkVector& chunks) { + + // skip if chunks are empty, nothing to merge + if ( chunks.empty() ) + return; + + // set up merged alignment chunk container + BaiAlignmentChunkVector mergedChunks; + mergedChunks.push_back( chunks[0] ); + + // iterate over chunks + int i = 0; + BaiAlignmentChunkVector::iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::iterator chunkEnd = chunks.end(); + for ( ++chunkIter; chunkIter != chunkEnd; ++chunkIter) { + + // get 'currentMergeChunk' based on numeric index + BaiAlignmentChunk& currentMergeChunk = mergedChunks[i]; + + // get sourceChunk based on source vector iterator + BaiAlignmentChunk& sourceChunk = (*chunkIter); + + // if currentMergeChunk ends where 
sourceChunk starts, then merge the two + if ( currentMergeChunk.Stop>>16 == sourceChunk.Start>>16 ) + currentMergeChunk.Stop = sourceChunk.Stop; + + // otherwise + else { + // append sourceChunk after currentMergeChunk + mergedChunks.push_back(sourceChunk); + + // update i, so the next iteration will consider the + // recently-appended sourceChunk as new mergeChunk candidate + ++i; + } + } + + // saved newly-merged chunks into (parameter) chunks + chunks = mergedChunks; +} + +bool BamStandardIndex::OpenFile(const std::string& filename, const char* mode) { + + // make sure any previous index file is closed + CloseFile(); + + // attempt to open file + m_indexStream = fopen(filename.c_str(), mode); + return IsFileOpen(); +} + +bool BamStandardIndex::ReadBinID(uint32_t& binId) { + size_t elementsRead = 0; + elementsRead += fread(&binId, sizeof(binId), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(binId); + return ( elementsRead == 1 ); +} + +bool BamStandardIndex::ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks) { + + bool readOk = true; + + // read bin header + readOk &= ReadBinID(binId); + readOk &= ReadNumAlignmentChunks(numAlignmentChunks); + + // read bin contents + const unsigned int bytesRequested = numAlignmentChunks*BamStandardIndex::SIZEOF_ALIGNMENTCHUNK; + readOk &= ReadIntoBuffer(bytesRequested); + + // return success/failure + return readOk; +} + +bool BamStandardIndex::ReadIntoBuffer(const unsigned int& bytesRequested) { + + // ensure that our buffer is big enough for request + BamStandardIndex::CheckBufferSize(m_buffer, m_bufferLength, bytesRequested); + + // read from BAI file stream + size_t bytesRead = fread( m_buffer, sizeof(char), bytesRequested, m_indexStream ); + return ( bytesRead == (size_t)bytesRequested ); +} + +bool BamStandardIndex::ReadLinearOffset(uint64_t& linearOffset) { + size_t elementsRead = 0; + elementsRead += fread(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + if ( m_isBigEndian ) 
SwapEndian_64(linearOffset); + return ( elementsRead == 1 ); +} + +bool BamStandardIndex::ReadNumAlignmentChunks(int& numAlignmentChunks) { + size_t elementsRead = 0; + elementsRead += fread(&numAlignmentChunks, sizeof(numAlignmentChunks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numAlignmentChunks); + return ( elementsRead == 1 ); +} + +bool BamStandardIndex::ReadNumBins(int& numBins) { + size_t elementsRead = 0; + elementsRead += fread(&numBins, sizeof(numBins), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBins); + return ( elementsRead == 1 ); +} + +bool BamStandardIndex::ReadNumLinearOffsets(int& numLinearOffsets) { + size_t elementsRead = 0; + elementsRead += fread(&numLinearOffsets, sizeof(numLinearOffsets), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numLinearOffsets); + return ( elementsRead == 1 ); +} + +bool BamStandardIndex::ReadNumReferences(int& numReferences) { + size_t elementsRead = 0; + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + return ( elementsRead == 1 ); +} + +void BamStandardIndex::ReserveForSummary(const int& numReferences) { + m_indexFileSummary.clear(); + m_indexFileSummary.assign( numReferences, BaiReferenceSummary() ); +} + +void BamStandardIndex::SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset) +{ + // create new alignment chunk + BaiAlignmentChunk newChunk(currentOffset, lastOffset); + + + + // if no entry exists yet for this bin, create one and store alignment chunk + BaiBinMap::iterator binIter = binMap.find(currentBin); + if ( binIter == binMap.end() ) { + BaiAlignmentChunkVector newChunks; + newChunks.push_back(newChunk); + binMap.insert( pair<uint32_t, BaiAlignmentChunkVector>(currentBin, newChunks)); + } + + // otherwise, just append alignment chunk + else { + BaiAlignmentChunkVector& binChunks = (*binIter).second; 
+ binChunks.push_back( newChunk ); + } +} + +void BamStandardIndex::SaveBinsSummary(const int& refId, const int& numBins) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); +} + +void BamStandardIndex::SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset) +{ + // get converted offsets + const int beginOffset = alignmentStartPosition >> BamStandardIndex::BAM_LIDX_SHIFT; + const int endOffset = (alignmentStopPosition - 1) >> BamStandardIndex::BAM_LIDX_SHIFT; + + // resize vector if necessary + int oldSize = offsets.size(); + int newSize = endOffset + 1; + if ( oldSize < newSize ) + offsets.resize(newSize, 0); + + // store offset + for( int i = beginOffset + 1; i <= endOffset; ++i ) { + if ( offsets[i] == 0 ) + offsets[i] = lastOffset; + } +} + +void BamStandardIndex::SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets) { + BaiReferenceSummary& refSummary = m_indexFileSummary.at(refId); + refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); +} + +// seek to position in index file stream +bool BamStandardIndex::Seek(const int64_t& position, const int& origin) { + return ( fseek64(m_indexStream, position, origin) == 0 ); +} + +// change the index caching behavior +void BamStandardIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + m_cacheMode = mode; + // do nothing else here ? 
cache mode will be ignored from now on, most likely +} + +bool BamStandardIndex::SkipBins(const int& numBins) { + uint32_t binId; + int32_t numAlignmentChunks; + bool skippedOk = true; + for (int i = 0; i < numBins; ++i) + skippedOk &= ReadBinIntoBuffer(binId, numAlignmentChunks); // results & buffer ignored + return skippedOk; +} + +bool BamStandardIndex::SkipLinearOffsets(const int& numLinearOffsets) { + const unsigned int bytesRequested = numLinearOffsets*BamStandardIndex::SIZEOF_LINEAROFFSET; + return ReadIntoBuffer(bytesRequested); +} + +void BamStandardIndex::SortLinearOffsets(BaiLinearOffsetVector& linearOffsets) { + sort( linearOffsets.begin(), linearOffsets.end() ); +} + +bool BamStandardIndex::SummarizeBins(BaiReferenceSummary& refSummary) { + + // load number of bins + int numBins; + if ( !ReadNumBins(numBins) ) + return false; + + // store bins summary for this reference + refSummary.NumBins = numBins; + refSummary.FirstBinFilePosition = Tell(); + + // attempt skip reference bins, return success/failure + if ( !SkipBins(numBins) ) + return false; + + // if we get here, bin summarized OK + return true; +} + +bool BamStandardIndex::SummarizeIndexFile(void) { + + // load number of reference sequences + int numReferences; + if ( !ReadNumReferences(numReferences) ) + return false; + + // initialize file summary data + ReserveForSummary(numReferences); + + // iterate over reference entries + bool loadedOk = true; + BaiFileSummary::iterator summaryIter = m_indexFileSummary.begin(); + BaiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( int i = 0; summaryIter != summaryEnd; ++summaryIter, ++i ) + loadedOk &= SummarizeReference(*summaryIter); + + // return result + return loadedOk; +} + +bool BamStandardIndex::SummarizeLinearOffsets(BaiReferenceSummary& refSummary) { + + // load number of linear offsets + int numLinearOffsets; + if ( !ReadNumLinearOffsets(numLinearOffsets) ) + return false; + + // store bin summary data for this reference + 
refSummary.NumLinearOffsets = numLinearOffsets; + refSummary.FirstLinearOffsetFilePosition = Tell(); + + // skip linear offsets in index file + if ( !SkipLinearOffsets(numLinearOffsets) ) + return false; + + // if get here, linear offsets summarized OK + return true; +} + +bool BamStandardIndex::SummarizeReference(BaiReferenceSummary& refSummary) { + + bool loadedOk = true; + loadedOk &= SummarizeBins(refSummary); + loadedOk &= SummarizeLinearOffsets(refSummary); + return loadedOk; +} + +// return position of file pointer in index file stream +int64_t BamStandardIndex::Tell(void) const { + return ftell64(m_indexStream); +} + +bool BamStandardIndex::WriteAlignmentChunk(const BaiAlignmentChunk& chunk) { + + size_t elementsWritten = 0; + + // localize alignment chunk offsets + uint64_t start = chunk.Start; + uint64_t stop = chunk.Stop; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_64(start); + SwapEndian_64(stop); + } + + // write to index file + elementsWritten += fwrite(&start, sizeof(start), 1, m_indexStream); + elementsWritten += fwrite(&stop, sizeof(stop), 1, m_indexStream); + + // return success/failure of write + return ( elementsWritten == 2 ); +} + +bool BamStandardIndex::WriteAlignmentChunks(BaiAlignmentChunkVector& chunks) { + + // make sure chunks are merged (simplified) before writing & saving summary + MergeAlignmentChunks(chunks); + + size_t elementsWritten = 0; + + // write chunks + int32_t chunkCount = chunks.size(); + if ( m_isBigEndian ) SwapEndian_32(chunkCount); + elementsWritten += fwrite(&chunkCount, sizeof(chunkCount), 1, m_indexStream); + + // iterate over chunks + bool chunksOk = true; + BaiAlignmentChunkVector::const_iterator chunkIter = chunks.begin(); + BaiAlignmentChunkVector::const_iterator chunkEnd = chunks.end(); + for ( ; chunkIter != chunkEnd; ++chunkIter ) + chunksOk &= WriteAlignmentChunk( (*chunkIter) ); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +bool 
BamStandardIndex::WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks) { + + size_t elementsWritten = 0; + + // write BAM bin ID + uint32_t binKey = binId; + if ( m_isBigEndian ) SwapEndian_32(binKey); + elementsWritten += fwrite(&binKey, sizeof(binKey), 1, m_indexStream); + + // write bin's alignment chunks + bool chunksOk = WriteAlignmentChunks(chunks); + + // return success/failure of write + return ( (elementsWritten == 1) && chunksOk ); +} + +bool BamStandardIndex::WriteBins(const int& refId, BaiBinMap& bins) { + + size_t elementsWritten = 0; + + // write number of bins + int32_t binCount = bins.size(); + if ( m_isBigEndian ) SwapEndian_32(binCount); + elementsWritten += fwrite(&binCount, sizeof(binCount), 1, m_indexStream); + + // save summary for reference's bins + SaveBinsSummary(refId, bins.size()); + + // iterate over bins + bool binsOk = true; + BaiBinMap::iterator binIter = bins.begin(); + BaiBinMap::iterator binEnd = bins.end(); + for ( ; binIter != binEnd; ++binIter ) + binsOk &= WriteBin( (*binIter).first, (*binIter).second ); + + // return success/failure of write + return ( (elementsWritten == 1) && binsOk ); +} + +bool BamStandardIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write magic number + elementsWritten += fwrite(BamStandardIndex::BAI_MAGIC, sizeof(char), 4, m_indexStream); + + // write number of reference sequences + int32_t numReferences = m_indexFileSummary.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + + // return success/failure of write + return (elementsWritten == 5); +} + +bool BamStandardIndex::WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets) { + + // make sure linear offsets are sorted before writing & saving summary + SortLinearOffsets(linearOffsets); + + size_t elementsWritten = 0; + + // write number of linear offsets + int32_t offsetCount = linearOffsets.size(); + if ( 
m_isBigEndian ) SwapEndian_32(offsetCount); + elementsWritten += fwrite(&offsetCount, sizeof(offsetCount), 1, m_indexStream); + + // save summary for reference's linear offsets + SaveLinearOffsetsSummary(refId, linearOffsets.size()); + + // iterate over linear offsets + BaiLinearOffsetVector::const_iterator offsetIter = linearOffsets.begin(); + BaiLinearOffsetVector::const_iterator offsetEnd = linearOffsets.end(); + for ( ; offsetIter != offsetEnd; ++offsetIter ) { + + // write linear offset + uint64_t linearOffset = (*offsetIter); + if ( m_isBigEndian ) SwapEndian_64(linearOffset); + elementsWritten += fwrite(&linearOffset, sizeof(linearOffset), 1, m_indexStream); + } + + // return success/failure of write + return ( elementsWritten == (size_t)(linearOffsets.size() + 1) ); +} + +bool BamStandardIndex::WriteReferenceEntry(BaiReferenceEntry& refEntry) { + bool refOk = true; + refOk &= WriteBins(refEntry.ID, refEntry.Bins); + refOk &= WriteLinearOffsets(refEntry.ID, refEntry.LinearOffsets); + return refOk; +} diff --git a/src/utils/BamTools/src/api/internal/BamStandardIndex_p.h b/src/utils/BamTools/src/api/internal/BamStandardIndex_p.h new file mode 100644 index 0000000000000000000000000000000000000000..7c61a6296b1dd95b6da0b29ae27a19d943df5cd8 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamStandardIndex_p.h @@ -0,0 +1,237 @@ +// *************************************************************************** +// BamStandardIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 5 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the standardized BAM index format (".bai") +// *************************************************************************** + +#ifndef BAM_STANDARD_INDEX_FORMAT_H +#define BAM_STANDARD_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. + +#include <api/BamAux.h> +#include <api/BamIndex.h> +#include <map> +#include <set> +#include <string> +#include <vector> + +namespace BamTools { +namespace Internal { + +// ----------------------------------------------------------------------------- +// BamStandardIndex data structures + +// defines start and end of a contiguous run of alignments +struct BaiAlignmentChunk { + + // data members + uint64_t Start; + uint64_t Stop; + + // constructor + BaiAlignmentChunk(const uint64_t& start = 0, + const uint64_t& stop = 0) + : Start(start) + , Stop(stop) + { } +}; + +// comparison operator (for sorting) +inline +bool operator<(const BaiAlignmentChunk& lhs, const BaiAlignmentChunk& rhs) { + return lhs.Start < rhs.Start; +} + +// convenience typedef for a list of all alignment 'chunks' in a BAI bin +typedef std::vector<BaiAlignmentChunk> BaiAlignmentChunkVector; + +// convenience typedef for a map of all BAI bins in a reference (ID => chunks) +typedef std::map<uint32_t, BaiAlignmentChunkVector> BaiBinMap; + +// convenience typedef for a list of all 'linear offsets' in a reference +typedef std::vector<uint64_t> BaiLinearOffsetVector; + +// contains all fields necessary for building, loading, & writing +// full BAI index data for a single reference +struct BaiReferenceEntry { + + // 
data members + int32_t ID; + BaiBinMap Bins; + BaiLinearOffsetVector LinearOffsets; + + // ctor + BaiReferenceEntry(const int32_t& id = -1) + : ID(id) + { } +}; + +// provides (persistent) summary of BaiReferenceEntry's index data +struct BaiReferenceSummary { + + // data members + int NumBins; + int NumLinearOffsets; + uint64_t FirstBinFilePosition; + uint64_t FirstLinearOffsetFilePosition; + + // ctor + BaiReferenceSummary(void) + : NumBins(0) + , NumLinearOffsets(0) + , FirstBinFilePosition(0) + , FirstLinearOffsetFilePosition(0) + { } +}; + +// convenience typedef for describing a full BAI index file summary +typedef std::vector<BaiReferenceSummary> BaiFileSummary; + +// end BamStandardIndex data structures +// ----------------------------------------------------------------------------- + +class BamStandardIndex : public BamIndex { + + // ctor & dtor + public: + BamStandardIndex(Internal::BamReaderPrivate* reader); + ~BamStandardIndex(void); + + // BamIndex implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + // change the index caching behavior + void SetCacheMode(const BamIndex::IndexCacheMode& mode); + public: + // returns format's file extension + static const std::string Extension(void); + + // internal file ops + private: + bool CheckMagicNumber(void); + void CloseFile(void); + bool IsFileOpen(void) const; + bool OpenFile(const std::string& filename, const 
char* mode); + bool Seek(const int64_t& position, const int& origin); + int64_t Tell(void) const; + + // internal BAI index building methods + private: + void ClearReferenceEntry(BaiReferenceEntry& refEntry); + void SaveAlignmentChunkToBin(BaiBinMap& binMap, + const uint32_t& currentBin, + const uint64_t& currentOffset, + const uint64_t& lastOffset); + void SaveLinearOffsetEntry(BaiLinearOffsetVector& offsets, + const int& alignmentStartPosition, + const int& alignmentStopPosition, + const uint64_t& lastOffset); + + // internal random-access methods + private: + bool AdjustRegion(const BamRegion& region, uint32_t& begin, uint32_t& end); + void CalculateCandidateBins(const uint32_t& begin, + const uint32_t& end, + std::set<uint16_t>& candidateBins); + bool CalculateCandidateOffsets(const BaiReferenceSummary& refSummary, + const uint64_t& minOffset, + std::set<uint16_t>& candidateBins, + std::vector<int64_t>& offsets); + uint64_t CalculateMinOffset(const BaiReferenceSummary& refSummary, const uint32_t& begin); + bool GetOffsets(const BamRegion& region, std::vector<int64_t>& offsets); + uint64_t LookupLinearOffset(const BaiReferenceSummary& refSummary, const int& index); + + // internal BAI summary (create/load) methods + private: + void ReserveForSummary(const int& numReferences); + void SaveBinsSummary(const int& refId, const int& numBins); + void SaveLinearOffsetsSummary(const int& refId, const int& numLinearOffsets); + bool SkipBins(const int& numBins); + bool SkipLinearOffsets(const int& numLinearOffsets); + bool SummarizeBins(BaiReferenceSummary& refSummary); + bool SummarizeIndexFile(void); + bool SummarizeLinearOffsets(BaiReferenceSummary& refSummary); + bool SummarizeReference(BaiReferenceSummary& refSummary); + + // internal BAI full index input methods + private: + bool ReadBinID(uint32_t& binId); + bool ReadBinIntoBuffer(uint32_t& binId, int32_t& numAlignmentChunks); + bool ReadIntoBuffer(const unsigned int& bytesRequested); + bool 
ReadLinearOffset(uint64_t& linearOffset); + bool ReadNumAlignmentChunks(int& numAlignmentChunks); + bool ReadNumBins(int& numBins); + bool ReadNumLinearOffsets(int& numLinearOffsets); + bool ReadNumReferences(int& numReferences); + + // internal BAI full index output methods + private: + void MergeAlignmentChunks(BaiAlignmentChunkVector& chunks); + void SortLinearOffsets(BaiLinearOffsetVector& linearOffsets); + bool WriteAlignmentChunk(const BaiAlignmentChunk& chunk); + bool WriteAlignmentChunks(BaiAlignmentChunkVector& chunks); + bool WriteBin(const uint32_t& binId, BaiAlignmentChunkVector& chunks); + bool WriteBins(const int& refId, BaiBinMap& bins); + bool WriteHeader(void); + bool WriteLinearOffsets(const int& refId, BaiLinearOffsetVector& linearOffsets); + bool WriteReferenceEntry(BaiReferenceEntry& refEntry); + + // data members + private: + FILE* m_indexStream; + bool m_isBigEndian; + BamIndex::IndexCacheMode m_cacheMode; + BaiFileSummary m_indexFileSummary; + + // our input buffer + char* m_buffer; + unsigned int m_bufferLength; + + // static methods + private: + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // checks if the buffer is large enough to accomodate the requested size + static void CheckBufferSize(unsigned char*& buffer, + unsigned int& bufferLength, + const unsigned int& requestedBytes); + // static constants + private: + static const int MAX_BIN; + static const int BAM_LIDX_SHIFT; + static const std::string BAI_EXTENSION; + static const char* const BAI_MAGIC; + static const int SIZEOF_ALIGNMENTCHUNK; + static const int SIZEOF_BINCORE; + static const int SIZEOF_LINEAROFFSET; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAM_STANDARD_INDEX_FORMAT_H diff --git a/src/utils/BamTools/src/api/internal/BamToolsIndex_p.cpp b/src/utils/BamTools/src/api/internal/BamToolsIndex_p.cpp new file 
mode 100644 index 0000000000000000000000000000000000000000..05d43275e1c307372481eb559004a741ff81fa88 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamToolsIndex_p.cpp @@ -0,0 +1,642 @@ +// *************************************************************************** +// BamToolsIndex.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 27 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#include <api/BamAlignment.h> +#include <api/internal/BamReader_p.h> +#include <api/internal/BamToolsIndex_p.h> +#include <api/internal/BgzfStream_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <cstring> +#include <algorithm> +#include <iostream> +#include <iterator> +#include <map> +using namespace std; + +// static BamToolsIndex constants +const int BamToolsIndex::DEFAULT_BLOCK_LENGTH = 1000; +const string BamToolsIndex::BTI_EXTENSION = ".bti"; +const char* const BamToolsIndex::BTI_MAGIC = "BTI\1"; +const int BamToolsIndex::SIZEOF_BLOCK = sizeof(int32_t)*2 + sizeof(int64_t); + +// ctor +BamToolsIndex::BamToolsIndex(Internal::BamReaderPrivate* reader) + : BamIndex(reader) + , m_indexStream(0) + , m_cacheMode(BamIndex::LimitedIndexCaching) + , m_blockSize(BamToolsIndex::DEFAULT_BLOCK_LENGTH) + , m_inputVersion(0) + , m_outputVersion(BTI_1_2) // latest version - used for writing new index files +{ + m_isBigEndian = BamTools::SystemIsBigEndian(); +} + +// dtor +BamToolsIndex::~BamToolsIndex(void) { + CloseFile(); +} + +bool BamToolsIndex::CheckMagicNumber(void) { + + // check 'magic number' to see if file is BTI index + char magic[4]; + size_t elementsRead = 
fread(magic, sizeof(char), 4, m_indexStream); + if ( elementsRead != 4 ) { + cerr << "BamToolsIndex ERROR: could not read format 'magic' number" << endl; + return false; + } + + if ( strncmp(magic, BamToolsIndex::BTI_MAGIC, 4) != 0 ) { + cerr << "BamToolsIndex ERROR: invalid format" << endl; + return false; + } + + // otherwise ok + return true; +} + +// check index file version, return true if OK +bool BamToolsIndex::CheckVersion(void) { + + // read version from file + size_t elementsRead = fread(&m_inputVersion, sizeof(m_inputVersion), 1, m_indexStream); + if ( elementsRead != 1 ) return false; + if ( m_isBigEndian ) SwapEndian_32(m_inputVersion); + + // if version is negative, or zero + if ( m_inputVersion <= 0 ) { + cerr << "BamToolsIndex ERROR: could not load index file: invalid version." + << endl; + return false; + } + + // if version is newer than can be supported by this version of bamtools + else if ( m_inputVersion > m_outputVersion ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of BamTools does not recognize new index file version" + << endl + << "Please update BamTools to a more recent version to support this index file." + << endl; + return false; + } + + // ------------------------------------------------------------------ + // check for deprecated, unsupported versions + // (typically whose format did not accomodate a particular bug fix) + + else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_0 ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to accessing data near reference ends." + << endl << endl + << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." + << endl << endl; + return false; + } + + else if ( (Version)m_inputVersion == BamToolsIndex::BTI_1_1 ) { + cerr << "BamToolsIndex ERROR: could not load index file. This version of the index contains a bug related to handling empty references." 
+ << endl << endl + << "Please run 'bamtools index -bti -in yourData.bam' to generate an up-to-date, fixed BTI file." + << endl << endl; + return false; + } + + // otherwise ok + else return true; +} + +void BamToolsIndex::ClearReferenceEntry(BtiReferenceEntry& refEntry) { + refEntry.ID = -1; + refEntry.Blocks.clear(); +} + +void BamToolsIndex::CloseFile(void) { + if ( IsFileOpen() ) + fclose(m_indexStream); + m_indexFileSummary.clear(); +} + +// builds index from associated BAM file & writes out to index file +bool BamToolsIndex::Create(void) { + + // return false if BamReader is invalid or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) { + cerr << "BamToolsIndex ERROR: BamReader is not open" + << ", aborting index creation" << endl; + return false; + } + + // rewind BamReader + if ( !m_reader->Rewind() ) { + cerr << "BamToolsIndex ERROR: could not rewind BamReader to create index" + << ", aborting index creation" << endl; + return false; + } + + // open new index file (read & write) + string indexFilename = m_reader->Filename() + Extension(); + if ( !OpenFile(indexFilename, "w+b") ) { + cerr << "BamToolsIndex ERROR: could not open ouput index file " << indexFilename + << ", aborting index creation" << endl; + return false; + } + + // initialize BtiFileSummary with number of references + const int& numReferences = m_reader->GetReferenceCount(); + InitializeFileSummary(numReferences); + + // initialize output file + bool createdOk = true; + createdOk &= WriteHeader(); + + // index building markers + int32_t currentBlockCount = 0; + int64_t currentAlignmentOffset = m_reader->Tell(); + int32_t blockRefId = -1; + int32_t blockMaxEndPosition = -1; + int64_t blockStartOffset = currentAlignmentOffset; + int32_t blockStartPosition = -1; + + // plow through alignments, storing index entries + BamAlignment al; + BtiReferenceEntry refEntry; + while ( m_reader->LoadNextAlignment(al) ) { + + // if moved to new reference + if ( al.RefID != blockRefId ) { + + // if first 
pass, check: + if ( currentBlockCount == 0 ) { + + // write any empty references up to (but not including) al.RefID + for ( int i = 0; i < al.RefID; ++i ) { + BtiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + } + + // not first pass: + else { + + // store previous BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write reference entry, then clear + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // write any empty references between (but not including) the last blockRefID and current al.RefID + for ( int i = blockRefId+1; i < al.RefID; ++i ) { + BtiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + + // reset block count + currentBlockCount = 0; + } + + // set ID for new reference entry + refEntry.ID = al.RefID; + } + + // if beginning of block, update counters + if ( currentBlockCount == 0 ) { + blockRefId = al.RefID; + blockStartOffset = currentAlignmentOffset; + blockStartPosition = al.Position; + blockMaxEndPosition = al.GetEndPosition(); + } + + // increment block counter + ++currentBlockCount; + + // check end position + int32_t alignmentEndPosition = al.GetEndPosition(); + if ( alignmentEndPosition > blockMaxEndPosition ) + blockMaxEndPosition = alignmentEndPosition; + + // if block is full, get offset for next block, reset currentBlockCount + if ( currentBlockCount == m_blockSize ) { + + // store previous block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // update markers + blockStartOffset = m_reader->Tell(); + currentBlockCount = 0; + } + + // not the best name, but for the next iteration, this value will be the offset of the *current* alignment + // necessary because we won't know if this next alignment is on a new reference until we actually read it + 
currentAlignmentOffset = m_reader->Tell(); + } + + // after finishing alignments, if any data was read, check: + if ( blockRefId >= 0 ) { + + // store last BTI block data in reference entry + BtiBlock block(blockMaxEndPosition, blockStartOffset, blockStartPosition); + refEntry.Blocks.push_back(block); + + // write last reference entry, then clear + createdOk &= WriteReferenceEntry(refEntry); + ClearReferenceEntry(refEntry); + + // then write any empty references remaining at end of file + for ( int i = blockRefId+1; i < numReferences; ++i ) { + BtiReferenceEntry emptyEntry(i); + createdOk &= WriteReferenceEntry(emptyEntry); + } + } + + // rewind reader & return result + createdOk &= m_reader->Rewind(); + + // return result + return createdOk; +} + +// returns format's file extension +const std::string BamToolsIndex::Extension(void) { + return BamToolsIndex::BTI_EXTENSION; +} + +bool BamToolsIndex::GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion) { + + // return false ref ID is not a valid index in file summary data + if ( region.LeftRefID < 0 || region.LeftRefID >= (int)m_indexFileSummary.size() ) + return false; + + // retrieve reference index data for left bound reference + BtiReferenceEntry refEntry(region.LeftRefID); + if ( !ReadReferenceEntry(refEntry) ) { + cerr << "BamToolsIndex ERROR: could not retrieve index data from BTI file" << endl; + return false; + } + + // binary search for an overlapping block (may not be first one though) + bool found = false; + typedef BtiBlockVector::const_iterator BtiBlockConstIterator; + BtiBlockConstIterator blockFirst = refEntry.Blocks.begin(); + BtiBlockConstIterator blockIter = blockFirst; + BtiBlockConstIterator blockLast = refEntry.Blocks.end(); + iterator_traits<BtiBlockConstIterator>::difference_type count = distance(blockFirst, blockLast); + iterator_traits<BtiBlockConstIterator>::difference_type step; + while ( count > 0 ) { + blockIter = blockFirst; + step = count/2; + 
advance(blockIter, step); + + const BtiBlock& block = (*blockIter); + if ( block.StartPosition <= region.RightPosition ) { + if ( block.MaxEndPosition >= region.LeftPosition ) { + offset = block.StartOffset; + break; + } + blockFirst = ++blockIter; + count -= step+1; + } + else count = step; + } + + // if we didn't search "off the end" of the blocks + if ( blockIter != blockLast ) { + + // "walk back" until we've gone too far + while ( blockIter != blockFirst ) { + const BtiBlock& currentBlock = (*blockIter); + + --blockIter; + const BtiBlock& previousBlock = (*blockIter); + if ( previousBlock.MaxEndPosition < region.LeftPosition ) { + offset = currentBlock.StartOffset; + found = true; + break; + } + } + + // if we walked all the way to first block, just return that and let the reader's + // region overlap parsing do the rest + if ( blockIter == blockFirst ) { + const BtiBlock& block = (*blockIter); + offset = block.StartOffset; + found = true; + } + } + + + // sets to false if blocks container is empty, or if no matching block could be found + *hasAlignmentsInRegion = found; + + // return success + return true; +} + +// returns whether reference has alignments or no +bool BamToolsIndex::HasAlignments(const int& referenceID) const { + if ( referenceID < 0 || referenceID >= (int)m_indexFileSummary.size() ) + return false; + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(referenceID); + return ( refSummary.NumBlocks > 0 ); +} + +void BamToolsIndex::InitializeFileSummary(const int& numReferences) { + m_indexFileSummary.clear(); + for ( int i = 0; i < numReferences; ++i ) + m_indexFileSummary.push_back( BtiReferenceSummary() ); +} + +bool BamToolsIndex::IsFileOpen(void) const { + return ( m_indexStream != 0 ); +} + +// attempts to use index data to jump to @region, returns success/fail +// a "successful" jump indicates no error, but not whether this region has data +// * thus, the method sets a flag to indicate whether there are alignments +// available 
after the jump position +bool BamToolsIndex::Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion) { + + // clear flag + *hasAlignmentsInRegion = false; + + // skip if invalid reader or not open + if ( m_reader == 0 || !m_reader->IsOpen() ) + return false; + + // make sure left-bound position is valid + const RefVector& references = m_reader->GetReferenceData(); + if ( region.LeftPosition > references.at(region.LeftRefID).RefLength ) + return false; + + // calculate nearest offset to jump to + int64_t offset; + if ( !GetOffset(region, offset, hasAlignmentsInRegion) ) { + cerr << "BamToolsIndex ERROR: could not jump" + << ", unable to calculate offset for specified region" << endl; + return false; + } + + // return success/failure of seek + return m_reader->Seek(offset); +} + +// loads existing data from file into memory +bool BamToolsIndex::Load(const std::string& filename) { + + // attempt open index file (read-only) + if ( !OpenFile(filename, "rb") ) { + cerr << "BamToolsIndex ERROR: could not open input index file " << filename + << ", aborting index load" << endl; + return false; + } + + // attempt to load & validate BTI header data + if ( !LoadHeader() ) { + cerr << "BamToolsIndex ERROR: could load header from index file " << filename + << ", aborting index load" << endl; + CloseFile(); + return false; + } + + // attempt to load index file summary + if ( !LoadFileSummary() ) { + cerr << "BamToolsIndex ERROR: could not generate a summary of index file " << filename + << ", aborting index load" << endl; + CloseFile(); + return false; + } + + // if we get here, index summary is loaded OK + return true; +} + +bool BamToolsIndex::LoadFileSummary(void) { + + // load number of reference sequences + int numReferences; + if ( !LoadNumReferences(numReferences) ) + return false; + + // initialize file summary data + InitializeFileSummary(numReferences); + + // iterate over reference entries + bool loadedOk = true; + BtiFileSummary::iterator summaryIter = 
m_indexFileSummary.begin(); + BtiFileSummary::iterator summaryEnd = m_indexFileSummary.end(); + for ( ; summaryIter != summaryEnd; ++summaryIter ) + loadedOk &= LoadReferenceSummary(*summaryIter); + + // return result + return loadedOk; +} + +bool BamToolsIndex::LoadHeader(void) { + + // if invalid format 'magic number' + if ( !CheckMagicNumber() ) + return false; + + // if invalid BTI version + if ( !CheckVersion() ) + return false; + + // use file's BTI block size to set member variable + size_t elementsRead = fread(&m_blockSize, sizeof(m_blockSize), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(m_blockSize); + return ( elementsRead == 1 ); +} + +bool BamToolsIndex::LoadNumBlocks(int& numBlocks) { + size_t elementsRead = 0; + elementsRead += fread(&numBlocks, sizeof(numBlocks), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + return ( elementsRead == 1 ); +} + +bool BamToolsIndex::LoadNumReferences(int& numReferences) { + size_t elementsRead = 0; + elementsRead += fread(&numReferences, sizeof(numReferences), 1, m_indexStream); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + return ( elementsRead == 1 ); +} + +bool BamToolsIndex::LoadReferenceSummary(BtiReferenceSummary& refSummary) { + + // load number of blocks + int numBlocks; + if ( !LoadNumBlocks(numBlocks) ) + return false; + + // store block summary data for this reference + refSummary.NumBlocks = numBlocks; + refSummary.FirstBlockFilePosition = Tell(); + + // skip blocks in index file (and return status) + return SkipBlocks(numBlocks); +} + +bool BamToolsIndex::OpenFile(const std::string& filename, const char* mode) { + + // make sure any previous index file is closed + CloseFile(); + + // attempt to open file + m_indexStream = fopen(filename.c_str(), mode); + return IsFileOpen(); +} + +bool BamToolsIndex::ReadBlock(BtiBlock& block) { + + // read in block data members + size_t elementsRead = 0; + elementsRead += fread(&block.MaxEndPosition, 
sizeof(block.MaxEndPosition), 1, m_indexStream); + elementsRead += fread(&block.StartOffset, sizeof(block.StartOffset), 1, m_indexStream); + elementsRead += fread(&block.StartPosition, sizeof(block.StartPosition), 1, m_indexStream); + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(block.MaxEndPosition); + SwapEndian_64(block.StartOffset); + SwapEndian_32(block.StartPosition); + } + + // return success/failure + return ( elementsRead == 3 ); +} + +bool BamToolsIndex::ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks) { + + // prep blocks container + blocks.clear(); + blocks.reserve(refSummary.NumBlocks); + + // skip to first block entry + if ( !Seek( refSummary.FirstBlockFilePosition, SEEK_SET ) ) { + cerr << "BamToolsIndex ERROR: could not seek to position " + << refSummary.FirstBlockFilePosition << endl; + return false; + } + + // read & store block entries + bool readOk = true; + BtiBlock block; + for ( int i = 0; i < refSummary.NumBlocks; ++i ) { + readOk &= ReadBlock(block); + blocks.push_back(block); + } + return readOk; +} + +bool BamToolsIndex::ReadReferenceEntry(BtiReferenceEntry& refEntry) { + + // return false if refId not valid index in file summary structure + if ( refEntry.ID < 0 || refEntry.ID >= (int)m_indexFileSummary.size() ) + return false; + + // use index summary to assist reading the reference's BTI blocks + const BtiReferenceSummary& refSummary = m_indexFileSummary.at(refEntry.ID); + return ReadBlocks(refSummary, refEntry.Blocks); +} + +bool BamToolsIndex::Seek(const int64_t& position, const int& origin) { + return ( fseek64(m_indexStream, position, origin) == 0 ); +} + +// change the index caching behavior +void BamToolsIndex::SetCacheMode(const BamIndex::IndexCacheMode& mode) { + m_cacheMode = mode; + // do nothing else here ? 
cache mode will be ignored from now on, most likely +} + +bool BamToolsIndex::SkipBlocks(const int& numBlocks) { + return Seek( numBlocks*BamToolsIndex::SIZEOF_BLOCK, SEEK_CUR ); +} + +int64_t BamToolsIndex::Tell(void) const { + return ftell64(m_indexStream); +} + +bool BamToolsIndex::WriteBlock(const BtiBlock& block) { + + // copy entry data + int32_t maxEndPosition = block.MaxEndPosition; + int64_t startOffset = block.StartOffset; + int32_t startPosition = block.StartPosition; + + // swap endian-ness if necessary + if ( m_isBigEndian ) { + SwapEndian_32(maxEndPosition); + SwapEndian_64(startOffset); + SwapEndian_32(startPosition); + } + + // write the reference index entry + size_t elementsWritten = 0; + elementsWritten += fwrite(&maxEndPosition, sizeof(maxEndPosition), 1, m_indexStream); + elementsWritten += fwrite(&startOffset, sizeof(startOffset), 1, m_indexStream); + elementsWritten += fwrite(&startPosition, sizeof(startPosition), 1, m_indexStream); + return ( elementsWritten == 3 ); +} + +bool BamToolsIndex::WriteBlocks(const BtiBlockVector& blocks) { + bool writtenOk = true; + BtiBlockVector::const_iterator blockIter = blocks.begin(); + BtiBlockVector::const_iterator blockEnd = blocks.end(); + for ( ; blockIter != blockEnd; ++blockIter ) + writtenOk &= WriteBlock(*blockIter); + return writtenOk; +} + +bool BamToolsIndex::WriteHeader(void) { + + size_t elementsWritten = 0; + + // write BTI index format 'magic number' + elementsWritten += fwrite(BamToolsIndex::BTI_MAGIC, 1, 4, m_indexStream); + + // write BTI index format version + int32_t currentVersion = (int32_t)m_outputVersion; + if ( m_isBigEndian ) SwapEndian_32(currentVersion); + elementsWritten += fwrite(¤tVersion, sizeof(currentVersion), 1, m_indexStream); + + // write block size + int32_t blockSize = m_blockSize; + if ( m_isBigEndian ) SwapEndian_32(blockSize); + elementsWritten += fwrite(&blockSize, sizeof(blockSize), 1, m_indexStream); + + // write number of references + int32_t numReferences = 
m_indexFileSummary.size(); + if ( m_isBigEndian ) SwapEndian_32(numReferences); + elementsWritten += fwrite(&numReferences, sizeof(numReferences), 1, m_indexStream); + + // return success/failure of write + return ( elementsWritten == 7 ); +} + +bool BamToolsIndex::WriteReferenceEntry(const BtiReferenceEntry& refEntry) { + + size_t elementsWritten = 0; + + // write number of blocks this reference + uint32_t numBlocks = refEntry.Blocks.size(); + if ( m_isBigEndian ) SwapEndian_32(numBlocks); + elementsWritten += fwrite(&numBlocks, sizeof(numBlocks), 1, m_indexStream); + + // write actual block entries + const bool blocksOk = WriteBlocks(refEntry.Blocks); + + // return success/fail + return ( elementsWritten == 1) && blocksOk; +} diff --git a/src/utils/BamTools/src/api/internal/BamToolsIndex_p.h b/src/utils/BamTools/src/api/internal/BamToolsIndex_p.h new file mode 100644 index 0000000000000000000000000000000000000000..16aef8c9afd85e23ea1b946d0009b9274034e5e0 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamToolsIndex_p.h @@ -0,0 +1,188 @@ +// *************************************************************************** +// BamToolsIndex.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 5 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides index operations for the BamTools index format (".bti") +// *************************************************************************** + +#ifndef BAMTOOLS_INDEX_FORMAT_H +#define BAMTOOLS_INDEX_FORMAT_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
+ +#include <api/BamAux.h> +#include <api/BamIndex.h> +#include <map> +#include <string> +#include <vector> + +namespace BamTools { +namespace Internal { + +// contains data for each 'block' in a BTI index +struct BtiBlock { + + // data members + int32_t MaxEndPosition; + int64_t StartOffset; + int32_t StartPosition; + + // ctor + BtiBlock(const int32_t& maxEndPosition = 0, + const int64_t& startOffset = 0, + const int32_t& startPosition = 0) + : MaxEndPosition(maxEndPosition) + , StartOffset(startOffset) + , StartPosition(startPosition) + { } +}; + +// convenience typedef for describing a a list of BTI blocks on a reference +typedef std::vector<BtiBlock> BtiBlockVector; + +// contains all fields necessary for building, loading, & writing +// full BTI index data for a single reference +struct BtiReferenceEntry { + + // data members + int32_t ID; + BtiBlockVector Blocks; + + // ctor + BtiReferenceEntry(const int& id = -1) + : ID(id) + { } +}; + +// provides (persistent) summary of BtiReferenceEntry's index data +struct BtiReferenceSummary { + + // data members + int NumBlocks; + uint64_t FirstBlockFilePosition; + + // ctor + BtiReferenceSummary(void) + : NumBlocks(0) + , FirstBlockFilePosition(0) + { } +}; + +// convenience typedef for describing a full BTI index file summary +typedef std::vector<BtiReferenceSummary> BtiFileSummary; + +class BamToolsIndex : public BamIndex { + + // keep a list of any supported versions here + // (might be useful later to handle any 'legacy' versions if the format changes) + // listed for example like: BTI_1_0 = 1, BTI_1_1 = 2, BTI_1_2 = 3, BTI_2_0 = 4, and so on + // + // so a change introduced in (hypothetical) BTI_1_2 would be handled from then on by: + // + // if ( indexVersion >= BTI_1_2 ) + // do something new + // else + // do the old thing + enum Version { BTI_1_0 = 1 + , BTI_1_1 + , BTI_1_2 + }; + + // ctor & dtor + public: + BamToolsIndex(Internal::BamReaderPrivate* reader); + ~BamToolsIndex(void); + + // BamIndex 
implementation + public: + // builds index from associated BAM file & writes out to index file + bool Create(void); + // returns whether reference has alignments or no + bool HasAlignments(const int& referenceID) const; + // attempts to use index data to jump to @region, returns success/fail + // a "successful" jump indicates no error, but not whether this region has data + // * thus, the method sets a flag to indicate whether there are alignments + // available after the jump position + bool Jump(const BamTools::BamRegion& region, bool* hasAlignmentsInRegion); + // loads existing data from file into memory + bool Load(const std::string& filename); + // change the index caching behavior + void SetCacheMode(const BamIndex::IndexCacheMode& mode); + public: + // returns format's file extension + static const std::string Extension(void); + + // internal file ops + private: + bool CheckMagicNumber(void); + bool CheckVersion(void); + void CloseFile(void); + bool IsFileOpen(void) const; + bool OpenFile(const std::string& filename, const char* mode); + bool Seek(const int64_t& position, const int& origin); + int64_t Tell(void) const; + + // internal BTI index building methods + private: + void ClearReferenceEntry(BtiReferenceEntry& refEntry); + + // internal random-access methods + private: + bool GetOffset(const BamRegion& region, int64_t& offset, bool* hasAlignmentsInRegion); + + // internal BTI summary data methods + private: + void InitializeFileSummary(const int& numReferences); + bool LoadFileSummary(void); + bool LoadHeader(void); + bool LoadNumBlocks(int& numBlocks); + bool LoadNumReferences(int& numReferences); + bool LoadReferenceSummary(BtiReferenceSummary& refSummary); + bool SkipBlocks(const int& numBlocks); + + // internal BTI full index input methods + private: + bool ReadBlock(BtiBlock& block); + bool ReadBlocks(const BtiReferenceSummary& refSummary, BtiBlockVector& blocks); + bool ReadReferenceEntry(BtiReferenceEntry& refEntry); + + // internal BTI full 
index output methods + private: + bool WriteBlock(const BtiBlock& block); + bool WriteBlocks(const BtiBlockVector& blocks); + bool WriteHeader(void); + bool WriteReferenceEntry(const BtiReferenceEntry& refEntry); + + // data members + private: + FILE* m_indexStream; + bool m_isBigEndian; + BamIndex::IndexCacheMode m_cacheMode; + BtiFileSummary m_indexFileSummary; + int m_blockSize; + int32_t m_inputVersion; // Version is serialized as int + Version m_outputVersion; + + // static constants + private: + static const int DEFAULT_BLOCK_LENGTH; + static const std::string BTI_EXTENSION; + static const char* const BTI_MAGIC; + static const int SIZEOF_BLOCK; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMTOOLS_INDEX_FORMAT_H diff --git a/src/utils/BamTools/src/api/internal/BamWriter_p.cpp b/src/utils/BamTools/src/api/internal/BamWriter_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..490820cbc7583014ee75f4d5bf0adc51bd5f286e --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamWriter_p.cpp @@ -0,0 +1,425 @@ +// *************************************************************************** +// BamWriter_p.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 16 June 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#include <api/BamAlignment.h> +#include <api/BamConstants.h> +#include <api/internal/BamWriter_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstdio> +#include <cstdlib> +#include <cstring> +using namespace std; + +// ctor +BamWriterPrivate::BamWriterPrivate(void) + : m_isBigEndian( BamTools::SystemIsBigEndian() ) +{ } + +// dtor +BamWriterPrivate::~BamWriterPrivate(void) { + m_stream.Close(); +} + +// calculates minimum bin for a BAM alignment interval +unsigned int BamWriterPrivate::CalculateMinimumBin(const int begin, int end) const { + --end; + if ( (begin >> 14) == (end >> 14) ) return 4681 + (begin >> 14); + if ( (begin >> 17) == (end >> 17) ) return 585 + (begin >> 17); + if ( (begin >> 20) == (end >> 20) ) return 73 + (begin >> 20); + if ( (begin >> 23) == (end >> 23) ) return 9 + (begin >> 23); + if ( (begin >> 26) == (end >> 26) ) return 1 + (begin >> 26); + return 0; +} + +// closes the alignment archive +void BamWriterPrivate::Close(void) { + m_stream.Close(); +} + +// creates a cigar string from the supplied alignment +void BamWriterPrivate::CreatePackedCigar(const vector<CigarOp>& cigarOperations, string& packedCigar) { + + // initialize + const unsigned int numCigarOperations = cigarOperations.size(); + packedCigar.resize(numCigarOperations * Constants::BAM_SIZEOF_INT); + + // pack the cigar data into the string + unsigned int* pPackedCigar = (unsigned int*)packedCigar.data(); + + // iterate over cigar operations + vector<CigarOp>::const_iterator coIter = cigarOperations.begin(); + vector<CigarOp>::const_iterator coEnd = cigarOperations.end(); + for ( ; coIter != coEnd; ++coIter ) 
{ + + // store op in packedCigar + unsigned int cigarOp; + switch ( coIter->Type ) { + case (Constants::BAM_CIGAR_MATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MATCH; break; + case (Constants::BAM_CIGAR_INS_CHAR) : cigarOp = Constants::BAM_CIGAR_INS; break; + case (Constants::BAM_CIGAR_DEL_CHAR) : cigarOp = Constants::BAM_CIGAR_DEL; break; + case (Constants::BAM_CIGAR_REFSKIP_CHAR) : cigarOp = Constants::BAM_CIGAR_REFSKIP; break; + case (Constants::BAM_CIGAR_SOFTCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_SOFTCLIP; break; + case (Constants::BAM_CIGAR_HARDCLIP_CHAR) : cigarOp = Constants::BAM_CIGAR_HARDCLIP; break; + case (Constants::BAM_CIGAR_PAD_CHAR) : cigarOp = Constants::BAM_CIGAR_PAD; break; + case (Constants::BAM_CIGAR_SEQMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_SEQMATCH; break; + case (Constants::BAM_CIGAR_MISMATCH_CHAR) : cigarOp = Constants::BAM_CIGAR_MISMATCH; break; + default: + fprintf(stderr, "BamWriter ERROR: unknown cigar operation found: %c\n", coIter->Type); + exit(1); + } + + *pPackedCigar = coIter->Length << Constants::BAM_CIGAR_SHIFT | cigarOp; + pPackedCigar++; + } +} + +// encodes the supplied query sequence into 4-bit notation +void BamWriterPrivate::EncodeQuerySequence(const string& query, string& encodedQuery) { + + // prepare the encoded query string + const unsigned int queryLen = query.size(); + const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); + encodedQuery.resize(encodedQueryLen); + char* pEncodedQuery = (char*)encodedQuery.data(); + const char* pQuery = (const char*)query.data(); + + unsigned char nucleotideCode; + bool useHighWord = true; + + while ( *pQuery ) { + switch ( *pQuery ) { + case (Constants::BAM_DNA_EQUAL) : nucleotideCode = Constants::BAM_BASECODE_EQUAL; break; + case (Constants::BAM_DNA_A) : nucleotideCode = Constants::BAM_BASECODE_A; break; + case (Constants::BAM_DNA_C) : nucleotideCode = Constants::BAM_BASECODE_C; break; + case (Constants::BAM_DNA_G) : nucleotideCode = 
Constants::BAM_BASECODE_G; break; + case (Constants::BAM_DNA_T) : nucleotideCode = Constants::BAM_BASECODE_T; break; + case (Constants::BAM_DNA_N) : nucleotideCode = Constants::BAM_BASECODE_N; break; + default: + fprintf(stderr, "BamWriter ERROR: only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); + exit(1); + } + + // pack the nucleotide code + if ( useHighWord ) { + *pEncodedQuery = nucleotideCode << 4; + useHighWord = false; + } else { + *pEncodedQuery |= nucleotideCode; + ++pEncodedQuery; + useHighWord = true; + } + + // increment the query position + ++pQuery; + } +} + +// returns whether BAM file is open for writing or not +bool BamWriterPrivate::IsOpen(void) const { + return m_stream.IsOpen; +} + +// opens the alignment archive +bool BamWriterPrivate::Open(const string& filename, + const string& samHeaderText, + const RefVector& referenceSequences) +{ + // open the BGZF file for writing, return failure if error + if ( !m_stream.Open(filename, "wb") ) + return false; + + // write BAM file 'metadata' components + WriteMagicNumber(); + WriteSamHeaderText(samHeaderText); + WriteReferences(referenceSequences); + return true; +} + +// saves the alignment to the alignment archive +void BamWriterPrivate::SaveAlignment(const BamAlignment& al) { + + // if BamAlignment contains only the core data and a raw char data buffer + // (as a result of BamReader::GetNextAlignmentCore()) + if ( al.SupportData.HasCoreOnly ) { + + // write the block size + unsigned int blockSize = al.SupportData.BlockLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // re-calculate bin (in case BamAlignment's position has been previously modified) + const uint32_t alignmentBin = CalculateMinimumBin(al.Position, al.GetEndPosition()); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = 
al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength; + buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations; + buffer[4] = al.SupportData.QuerySequenceLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the raw char data + m_stream.Write((char*)al.SupportData.AllCharData.data(), + al.SupportData.BlockLength-Constants::BAM_CORE_SIZE); + } + + // otherwise, BamAlignment should contain character in the standard fields: Name, QueryBases, etc + // ( resulting from BamReader::GetNextAlignment() *OR* being generated directly by client code ) + else { + + // calculate char lengths + const unsigned int nameLength = al.Name.size() + 1; + const unsigned int numCigarOperations = al.CigarData.size(); + const unsigned int queryLength = al.QueryBases.size(); + const unsigned int tagDataLength = al.TagData.size(); + + // no way to tell if BamAlignment.Bin is already defined (no default, invalid value) + // force calculation of Bin before storing + const int endPosition = al.GetEndPosition(); + const unsigned int alignmentBin = CalculateMinimumBin(al.Position, endPosition); + + // create our packed cigar string + string packedCigar; + CreatePackedCigar(al.CigarData, packedCigar); + const unsigned int packedCigarLength = packedCigar.size(); + + // encode the query + string encodedQuery; + EncodeQuerySequence(al.QueryBases, encodedQuery); + const unsigned int encodedQueryLength = encodedQuery.size(); + + // write the block size + const unsigned int dataBlockSize = nameLength + + packedCigarLength + + encodedQueryLength + + queryLength + + tagDataLength; + unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize; + if ( 
m_isBigEndian ) BamTools::SwapEndian_32(blockSize); + m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT); + + // assign the BAM core data + uint32_t buffer[Constants::BAM_CORE_BUFFER_SIZE]; + buffer[0] = al.RefID; + buffer[1] = al.Position; + buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength; + buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations; + buffer[4] = queryLength; + buffer[5] = al.MateRefID; + buffer[6] = al.MatePosition; + buffer[7] = al.InsertSize; + + // swap BAM core endian-ness, if necessary + if ( m_isBigEndian ) { + for ( int i = 0; i < 8; ++i ) + BamTools::SwapEndian_32(buffer[i]); + } + + // write the BAM core + m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE); + + // write the query name + m_stream.Write(al.Name.c_str(), nameLength); + + // write the packed cigar + if ( m_isBigEndian ) { + char* cigarData = (char*)calloc(sizeof(char), packedCigarLength); + memcpy(cigarData, packedCigar.data(), packedCigarLength); + if ( m_isBigEndian ) { + for ( unsigned int i = 0; i < packedCigarLength; ++i ) + BamTools::SwapEndian_32p(&cigarData[i]); + } + m_stream.Write(cigarData, packedCigarLength); + free(cigarData); + } + else + m_stream.Write(packedCigar.data(), packedCigarLength); + + // write the encoded query sequence + m_stream.Write(encodedQuery.data(), encodedQueryLength); + + // write the base qualities + char* pBaseQualities = (char*)al.Qualities.data(); + for ( unsigned int i = 0; i < queryLength; ++i ) + pBaseQualities[i] -= 33; // FASTQ conversion + m_stream.Write(pBaseQualities, queryLength); + + // write the read group tag + if ( m_isBigEndian ) { + + char* tagData = (char*)calloc(sizeof(char), tagDataLength); + memcpy(tagData, al.TagData.data(), tagDataLength); + + int i = 0; + while ( (unsigned int)i < tagDataLength ) { + + i += Constants::BAM_TAG_TAGSIZE; // skip tag chars (e.g. "RG", "NM", etc.) 
+ const char type = tagData[i]; // get tag type at position i + ++i; + + switch ( type ) { + + case(Constants::BAM_TAG_TYPE_ASCII) : + case(Constants::BAM_TAG_TYPE_INT8) : + case(Constants::BAM_TAG_TYPE_UINT8) : + ++i; + break; + + case(Constants::BAM_TAG_TYPE_INT16) : + case(Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + + case(Constants::BAM_TAG_TYPE_FLOAT) : + case(Constants::BAM_TAG_TYPE_INT32) : + case(Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + + case(Constants::BAM_TAG_TYPE_HEX) : + case(Constants::BAM_TAG_TYPE_STRING) : + // no endian swapping necessary for hex-string/string data + while ( tagData[i] ) + ++i; + // increment one more for null terminator + ++i; + break; + + case(Constants::BAM_TAG_TYPE_ARRAY) : + + { + // read array type + const char arrayType = tagData[i]; + ++i; + + // swap endian-ness of number of elements in place, then retrieve for loop + BamTools::SwapEndian_32p(&tagData[i]); + int32_t numElements; + memcpy(&numElements, &tagData[i], sizeof(uint32_t)); + i += sizeof(uint32_t); + + // swap endian-ness of array elements + for ( int j = 0; j < numElements; ++j ) { + switch (arrayType) { + case (Constants::BAM_TAG_TYPE_INT8) : + case (Constants::BAM_TAG_TYPE_UINT8) : + // no endian-swapping necessary + ++i; + break; + case (Constants::BAM_TAG_TYPE_INT16) : + case (Constants::BAM_TAG_TYPE_UINT16) : + BamTools::SwapEndian_16p(&tagData[i]); + i += sizeof(uint16_t); + break; + case (Constants::BAM_TAG_TYPE_FLOAT) : + case (Constants::BAM_TAG_TYPE_INT32) : + case (Constants::BAM_TAG_TYPE_UINT32) : + BamTools::SwapEndian_32p(&tagData[i]); + i += sizeof(uint32_t); + break; + default: + // error case + fprintf(stderr, + "BamWriter ERROR: unknown binary array type encountered: [%c]\n", + arrayType); + exit(1); + } + } + + break; + } + + default : + fprintf(stderr, "BamWriter ERROR: invalid tag value type\n"); // 
shouldn't get here + free(tagData); + exit(1); + } + } + m_stream.Write(tagData, tagDataLength); + free(tagData); + } + else + m_stream.Write(al.TagData.data(), tagDataLength); + } +} + +void BamWriterPrivate::SetWriteCompressed(bool ok) { + + // warn if BAM file is already open + // modifying compression is not allowed in this case + if ( IsOpen() ) { + cerr << "BamWriter WARNING: attempting to change compression mode on an open BAM file is not allowed. " + << "Ignoring request." << endl; + return; + } + + // set BgzfStream compression mode + m_stream.SetWriteCompressed(ok); +} + +void BamWriterPrivate::WriteMagicNumber(void) { + // write BAM file 'magic number' + m_stream.Write(Constants::BAM_HEADER_MAGIC, Constants::BAM_HEADER_MAGIC_LENGTH); +} + +void BamWriterPrivate::WriteReferences(const BamTools::RefVector& referenceSequences) { + + // write the number of reference sequences + uint32_t numReferenceSequences = referenceSequences.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(numReferenceSequences); + m_stream.Write((char*)&numReferenceSequences, Constants::BAM_SIZEOF_INT); + + // foreach reference sequence + RefVector::const_iterator rsIter = referenceSequences.begin(); + RefVector::const_iterator rsEnd = referenceSequences.end(); + for ( ; rsIter != rsEnd; ++rsIter ) { + + // write the reference sequence name length + uint32_t referenceSequenceNameLen = rsIter->RefName.size() + 1; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceSequenceNameLen); + m_stream.Write((char*)&referenceSequenceNameLen, Constants::BAM_SIZEOF_INT); + + // write the reference sequence name + m_stream.Write(rsIter->RefName.c_str(), referenceSequenceNameLen); + + // write the reference sequence length + int32_t referenceLength = rsIter->RefLength; + if ( m_isBigEndian ) BamTools::SwapEndian_32(referenceLength); + m_stream.Write((char*)&referenceLength, Constants::BAM_SIZEOF_INT); + } +} + +void BamWriterPrivate::WriteSamHeaderText(const std::string& samHeaderText) { + + 
// write the SAM header text length + uint32_t samHeaderLen = samHeaderText.size(); + if ( m_isBigEndian ) BamTools::SwapEndian_32(samHeaderLen); + m_stream.Write((char*)&samHeaderLen, Constants::BAM_SIZEOF_INT); + + // write the SAM header text + if ( samHeaderLen > 0 ) + m_stream.Write(samHeaderText.data(), samHeaderLen); +} diff --git a/src/utils/BamTools/src/api/internal/BamWriter_p.h b/src/utils/BamTools/src/api/internal/BamWriter_p.h new file mode 100644 index 0000000000000000000000000000000000000000..dd2b0fe1178ce8c2c52a9b2b4b0406ba9a81c122 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BamWriter_p.h @@ -0,0 +1,67 @@ +// *************************************************************************** +// BamWriter_p.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 24 February 2011 (DB) +// --------------------------------------------------------------------------- +// Provides the basic functionality for producing BAM files +// *************************************************************************** + +#ifndef BAMWRITER_P_H +#define BAMWRITER_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. 
+ +#include <api/BamAux.h> +#include <api/internal/BgzfStream_p.h> +#include <string> +#include <vector> + +namespace BamTools { +namespace Internal { + +class BamWriterPrivate { + + // ctor & dtor + public: + BamWriterPrivate(void); + ~BamWriterPrivate(void); + + // interface methods + public: + void Close(void); + bool IsOpen(void) const; + bool Open(const std::string& filename, + const std::string& samHeaderText, + const BamTools::RefVector& referenceSequences); + void SaveAlignment(const BamAlignment& al); + void SetWriteCompressed(bool ok); + + // 'internal' methods + public: + unsigned int CalculateMinimumBin(const int begin, int end) const; + void CreatePackedCigar(const std::vector<BamTools::CigarOp>& cigarOperations, std::string& packedCigar); + void EncodeQuerySequence(const std::string& query, std::string& encodedQuery); + void WriteMagicNumber(void); + void WriteReferences(const BamTools::RefVector& referenceSequences); + void WriteSamHeaderText(const std::string& samHeaderText); + + // data members + private: + BgzfStream m_stream; + bool m_isBigEndian; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // BAMWRITER_P_H diff --git a/src/utils/BamTools/src/api/internal/BgzfStream_p.cpp b/src/utils/BamTools/src/api/internal/BgzfStream_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..aba2a0786a095db16c6adf6c8830a3f14bd37f4a --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BgzfStream_p.cpp @@ -0,0 +1,439 @@ +// *************************************************************************** +// BgzfStream_p.cpp (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 5 April 2011(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. 
+// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#include <api/internal/BgzfStream_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <cstring> +#include <algorithm> +using namespace std; + +// constructor +BgzfStream::BgzfStream(void) + : UncompressedBlockSize(Constants::BGZF_DEFAULT_BLOCK_SIZE) + , CompressedBlockSize(Constants::BGZF_MAX_BLOCK_SIZE) + , BlockLength(0) + , BlockOffset(0) + , BlockAddress(0) + , IsOpen(false) + , IsWriteOnly(false) + , IsWriteCompressed(true) + , Stream(NULL) + , UncompressedBlock(NULL) + , CompressedBlock(NULL) +{ + try { + CompressedBlock = new char[CompressedBlockSize]; + UncompressedBlock = new char[UncompressedBlockSize]; + } catch( std::bad_alloc& ba ) { + fprintf(stderr, "BgzfStream ERROR: unable to allocate memory\n"); + exit(1); + } +} + +// destructor +BgzfStream::~BgzfStream(void) { + if( CompressedBlock ) delete[] CompressedBlock; + if( UncompressedBlock ) delete[] UncompressedBlock; +} + +// closes BGZF file +void BgzfStream::Close(void) { + + // skip if file not open + if ( !IsOpen ) return; + + // if writing to file, flush the current BGZF block, + // then write an empty block (as EOF marker) + if ( IsWriteOnly ) { + FlushBlock(); + int blockLength = DeflateBlock(); + fwrite(CompressedBlock, 1, blockLength, Stream); + } + + // flush and close stream + fflush(Stream); + fclose(Stream); + + // reset flags + IsWriteCompressed = true; + IsOpen = false; +} + +// compresses the current block +int BgzfStream::DeflateBlock(void) { + + // initialize the gzip header + char* buffer = CompressedBlock; + memset(buffer, 0, 18); + buffer[0] = Constants::GZIP_ID1; + buffer[1] = (char)Constants::GZIP_ID2; + buffer[2] = Constants::CM_DEFLATE; + buffer[3] = Constants::FLG_FEXTRA; + buffer[9] = (char)Constants::OS_UNKNOWN; + 
buffer[10] = Constants::BGZF_XLEN; + buffer[12] = Constants::BGZF_ID1; + buffer[13] = Constants::BGZF_ID2; + buffer[14] = Constants::BGZF_LEN; + + // set compression level + const int compressionLevel = ( IsWriteCompressed ? Z_DEFAULT_COMPRESSION : 0 ); + + // loop to retry for blocks that do not compress enough + int inputLength = BlockOffset; + int compressedLength = 0; + unsigned int bufferSize = CompressedBlockSize; + + while ( true ) { + + // initialize zstream values + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)UncompressedBlock; + zs.avail_in = inputLength; + zs.next_out = (Bytef*)&buffer[Constants::BGZF_BLOCK_HEADER_LENGTH]; + zs.avail_out = bufferSize - Constants::BGZF_BLOCK_HEADER_LENGTH - Constants::BGZF_BLOCK_FOOTER_LENGTH; + + // initialize the zlib compression algorithm + if ( deflateInit2(&zs, + compressionLevel, + Z_DEFLATED, + Constants::GZIP_WINDOW_BITS, + Constants::Z_DEFAULT_MEM_LEVEL, + Z_DEFAULT_STRATEGY) != Z_OK ) + { + fprintf(stderr, "BgzfStream ERROR: zlib deflate initialization failed\n"); + exit(1); + } + + // compress the data + int status = deflate(&zs, Z_FINISH); + if ( status != Z_STREAM_END ) { + + deflateEnd(&zs); + + // reduce the input length and try again + if ( status == Z_OK ) { + inputLength -= 1024; + if ( inputLength < 0 ) { + fprintf(stderr, "BgzfStream ERROR: input reduction failed\n"); + exit(1); + } + continue; + } + + fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); + exit(1); + } + + // finalize the compression routine + if ( deflateEnd(&zs) != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: zlib::deflateEnd() failed\n"); + exit(1); + } + + compressedLength = zs.total_out; + compressedLength += Constants::BGZF_BLOCK_HEADER_LENGTH + Constants::BGZF_BLOCK_FOOTER_LENGTH; + if ( compressedLength > Constants::BGZF_MAX_BLOCK_SIZE ) { + fprintf(stderr, "BgzfStream ERROR: deflate overflow\n"); + exit(1); + } + + break; + } + + // store the compressed length + 
BamTools::PackUnsignedShort(&buffer[16], (unsigned short)(compressedLength - 1)); + + // store the CRC32 checksum + unsigned int crc = crc32(0, NULL, 0); + crc = crc32(crc, (Bytef*)UncompressedBlock, inputLength); + BamTools::PackUnsignedInt(&buffer[compressedLength - 8], crc); + BamTools::PackUnsignedInt(&buffer[compressedLength - 4], inputLength); + + // ensure that we have less than a block of data left + int remaining = BlockOffset - inputLength; + if ( remaining > 0 ) { + if ( remaining > inputLength ) { + fprintf(stderr, "BgzfStream ERROR: after deflate, remainder too large\n"); + exit(1); + } + memcpy(UncompressedBlock, UncompressedBlock + inputLength, remaining); + } + + // update block data + BlockOffset = remaining; + + // return result + return compressedLength; +} + +// flushes the data in the BGZF block +void BgzfStream::FlushBlock(void) { + + // flush all of the remaining blocks + while ( BlockOffset > 0 ) { + + // compress the data block + int blockLength = DeflateBlock(); + + // flush the data to our output stream + int numBytesWritten = fwrite(CompressedBlock, 1, blockLength, Stream); + if ( numBytesWritten != blockLength ) { + fprintf(stderr, "BgzfStream ERROR: expected to write %u bytes during flushing, but wrote %u bytes\n", + blockLength, numBytesWritten); + exit(1); + } + + // update block data + BlockAddress += blockLength; + } +} + +// decompresses the current block +int BgzfStream::InflateBlock(const int& blockLength) { + + // inflate the data from compressed buffer into uncompressed buffer + z_stream zs; + zs.zalloc = NULL; + zs.zfree = NULL; + zs.next_in = (Bytef*)CompressedBlock + 18; + zs.avail_in = blockLength - 16; + zs.next_out = (Bytef*)UncompressedBlock; + zs.avail_out = UncompressedBlockSize; + + int status = inflateInit2(&zs, Constants::GZIP_WINDOW_BITS); + if ( status != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateInit() failed\n"); + return -1; + } + + status = inflate(&zs, Z_FINISH); 
+ if ( status != Z_STREAM_END ) { + inflateEnd(&zs); + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflate() failed\n"); + return -1; + } + + status = inflateEnd(&zs); + if ( status != Z_OK ) { + fprintf(stderr, "BgzfStream ERROR: could not decompress block - zlib::inflateEnd() failed\n"); + return -1; + } + + // return result + return zs.total_out; +} + +// opens the BGZF file for reading (mode is either "rb" for reading, or "wb" for writing) +bool BgzfStream::Open(const string& filename, const char* mode) { + + // close current stream, if necessary, before opening next + if ( IsOpen ) Close(); + + // determine open mode + if ( strcmp(mode, "rb") == 0 ) + IsWriteOnly = false; + else if ( strcmp(mode, "wb") == 0) + IsWriteOnly = true; + else { + fprintf(stderr, "BgzfStream ERROR: unknown file mode: %s\n", mode); + return false; + } + + // open BGZF stream on a file + if ( (filename != "stdin") && (filename != "stdout") ) + Stream = fopen(filename.c_str(), mode); + + // open BGZF stream on stdin + else if ( (filename == "stdin") && (strcmp(mode, "rb") == 0 ) ) + Stream = freopen(NULL, mode, stdin); + + // open BGZF stream on stdout + else if ( (filename == "stdout") && (strcmp(mode, "wb") == 0) ) + Stream = freopen(NULL, mode, stdout); + + if ( !Stream ) { + fprintf(stderr, "BgzfStream ERROR: unable to open file %s\n", filename.c_str() ); + return false; + } + + // set flag & return success + IsOpen = true; + return true; +} + +// reads BGZF data into a byte buffer +int BgzfStream::Read(char* data, const unsigned int dataLength) { + + // if stream not open for reading (or empty request) + if ( !IsOpen || IsWriteOnly || dataLength == 0 ) + return 0; + + // read blocks as needed until desired data length is retrieved + char* output = data; + unsigned int numBytesRead = 0; + while ( numBytesRead < dataLength ) { + + // determine bytes available in current block + int bytesAvailable = BlockLength - BlockOffset; + + // read (and decompress) next 
block if needed + if ( bytesAvailable <= 0 ) { + if ( !ReadBlock() ) return -1; + bytesAvailable = BlockLength - BlockOffset; + if ( bytesAvailable <= 0 ) break; + } + + // copy data from uncompressed source buffer into data destination buffer + char* buffer = UncompressedBlock; + int copyLength = min( (int)(dataLength-numBytesRead), bytesAvailable ); + memcpy(output, buffer + BlockOffset, copyLength); + + // update counters + BlockOffset += copyLength; + output += copyLength; + numBytesRead += copyLength; + } + + // update block data + if ( BlockOffset == BlockLength ) { + BlockAddress = ftell64(Stream); + BlockOffset = 0; + BlockLength = 0; + } + + return numBytesRead; +} + +// reads a BGZF block +bool BgzfStream::ReadBlock(void) { + + char header[Constants::BGZF_BLOCK_HEADER_LENGTH]; + int64_t blockAddress = ftell64(Stream); + + // read block header from file + int count = fread(header, 1, sizeof(header), Stream); + + // if block header empty + if ( count == 0 ) { + BlockLength = 0; + return true; + } + + // if block header invalid size + if ( count != sizeof(header) ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not read block header\n"); + return false; + } + + // validate block header contents + if ( !BgzfStream::CheckBlockHeader(header) ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - invalid block header\n"); + return false; + } + + // copy header contents to compressed buffer + int blockLength = BamTools::UnpackUnsignedShort(&header[16]) + 1; + char* compressedBlock = CompressedBlock; + memcpy(compressedBlock, header, Constants::BGZF_BLOCK_HEADER_LENGTH); + int remaining = blockLength - Constants::BGZF_BLOCK_HEADER_LENGTH; + + // read remainder of block + count = fread(&compressedBlock[Constants::BGZF_BLOCK_HEADER_LENGTH], 1, remaining, Stream); + if ( count != remaining ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not read data from block\n"); + return false; + } + + // decompress block data + count = 
InflateBlock(blockLength); + if ( count < 0 ) { + fprintf(stderr, "BgzfStream ERROR: read block failed - could not decompress block data\n"); + return false; + } + + // update block data + if ( BlockLength != 0 ) + BlockOffset = 0; + BlockAddress = blockAddress; + BlockLength = count; + + // return success + return true; +} + +// seek to position in BGZF file +bool BgzfStream::Seek(const int64_t& position) { + + // skip if not open + if ( !IsOpen ) return false; + + // determine adjusted offset & address + int blockOffset = (position & 0xFFFF); + int64_t blockAddress = (position >> 16) & 0xFFFFFFFFFFFFLL; + + // attempt seek in file + if ( fseek64(Stream, blockAddress, SEEK_SET) != 0 ) { + fprintf(stderr, "BgzfStream ERROR: unable to seek in file\n"); + return false; + } + + // update block data & return success + BlockLength = 0; + BlockAddress = blockAddress; + BlockOffset = blockOffset; + return true; +} + +void BgzfStream::SetWriteCompressed(bool ok) { + IsWriteCompressed = ok; +} + +// get file position in BGZF file +int64_t BgzfStream::Tell(void) const { + if ( !IsOpen ) + return 0; + return ( (BlockAddress << 16) | (BlockOffset & 0xFFFF) ); +} + +// writes the supplied data into the BGZF buffer +unsigned int BgzfStream::Write(const char* data, const unsigned int dataLen) { + + // skip if file not open for writing + if ( !IsOpen || !IsWriteOnly ) return false; + + // write blocks as needed til all data is written + unsigned int numBytesWritten = 0; + const char* input = data; + unsigned int blockLength = UncompressedBlockSize; + while ( numBytesWritten < dataLen ) { + + // copy data contents to uncompressed output buffer + unsigned int copyLength = min(blockLength - BlockOffset, dataLen - numBytesWritten); + char* buffer = UncompressedBlock; + memcpy(buffer + BlockOffset, input, copyLength); + + // update counter + BlockOffset += copyLength; + input += copyLength; + numBytesWritten += copyLength; + + // flush (& compress) output buffer when full + if ( 
BlockOffset == blockLength ) FlushBlock(); + } + + // return result + return numBytesWritten; +} diff --git a/src/utils/BamTools/src/api/internal/BgzfStream_p.h b/src/utils/BamTools/src/api/internal/BgzfStream_p.h new file mode 100644 index 0000000000000000000000000000000000000000..c5e5d4872c0886509be74ca4e3d299f76a678025 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/BgzfStream_p.h @@ -0,0 +1,109 @@ +// *************************************************************************** +// BgzfStream_p.h (c) 2011 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 5 April 2011(DB) +// --------------------------------------------------------------------------- +// Based on BGZF routines developed at the Broad Institute. +// Provides the basic functionality for reading & writing BGZF files +// Replaces the old BGZF.* files to avoid clashing with other toolkits +// *************************************************************************** + +#ifndef BGZFSTREAM_P_H +#define BGZFSTREAM_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <api/BamAux.h> +#include <api/BamConstants.h> +#include "zlib.h" +#include <cstdio> +#include <string> + +namespace BamTools { +namespace Internal { + +class BgzfStream { + + // constructor & destructor + public: + BgzfStream(void); + ~BgzfStream(void); + + // main interface methods + public: + // closes BGZF file + void Close(void); + // opens the BGZF file (mode is either "rb" for reading, or "wb" for writing) + bool Open(const std::string& filename, const char* mode); + // reads BGZF data into a byte buffer + int Read(char* data, const unsigned int dataLength); + // seek to position in BGZF file + bool Seek(const int64_t& position); + // enable/disable compressed output + void SetWriteCompressed(bool ok); + // get file position in BGZF file + int64_t Tell(void) const; + // writes the supplied data into the BGZF buffer + unsigned int Write(const char* data, const unsigned int dataLen); + + // internal methods + private: + // compresses the current block + int DeflateBlock(void); + // flushes the data in the BGZF block + void FlushBlock(void); + // de-compresses the current block + int InflateBlock(const int& blockLength); + // reads a BGZF block + bool ReadBlock(void); + + // static 'utility' methods + public: + // checks BGZF block header + static inline bool CheckBlockHeader(char* header); + + // data members + public: + unsigned int UncompressedBlockSize; + unsigned int CompressedBlockSize; + unsigned int BlockLength; + unsigned int BlockOffset; + uint64_t BlockAddress; + bool IsOpen; + bool IsWriteOnly; + bool IsWriteCompressed; + FILE* Stream; + char* UncompressedBlock; + char* CompressedBlock; +}; + +// ------------------------------------------------------------- +// static 'utility' method implementations + +// checks BGZF block header +inline +bool BgzfStream::CheckBlockHeader(char* header) { + return (header[0] == Constants::GZIP_ID1 && + header[1] == (char)Constants::GZIP_ID2 && + header[2] == Z_DEFLATED && + (header[3] & 
Constants::FLG_FEXTRA) != 0 && + BamTools::UnpackUnsignedShort(&header[10]) == Constants::BGZF_XLEN && + header[12] == Constants::BGZF_ID1 && + header[13] == Constants::BGZF_ID2 && + BamTools::UnpackUnsignedShort(&header[14]) == Constants::BGZF_LEN ); +} + +} // namespace Internal +} // namespace BamTools + +#endif // BGZFSTREAM_P_H diff --git a/src/utils/BamTools/src/api/internal/SamFormatParser_p.cpp b/src/utils/BamTools/src/api/internal/SamFormatParser_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..316f75f73b5d49522297937588264ea5b4c8156e --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamFormatParser_p.cpp @@ -0,0 +1,231 @@ +// *************************************************************************** +// SamFormatParser.cpp (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#include <api/SamConstants.h> +#include <api/SamHeader.h> +#include <api/internal/SamFormatParser_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +#include <sstream> +#include <vector> +using namespace std; + +SamFormatParser::SamFormatParser(SamHeader& header) + : m_header(header) +{ } + +SamFormatParser::~SamFormatParser(void) { } + +void SamFormatParser::Parse(const string& headerText) { + + // clear header's prior contents + m_header.Clear(); + + // empty header is OK, but skip processing + if ( headerText.empty() ) + return; + + // other wise parse SAM lines + istringstream headerStream(headerText); + string headerLine(""); + while ( getline(headerStream, headerLine) ) + ParseSamLine(headerLine); +} + 
+void SamFormatParser::ParseSamLine(const string& line) { + + // skip if line is not long enough to contain true values + if (line.length() < 5 ) return; + + // determine token at beginning of line + const string firstToken = line.substr(0,3); + string restOfLine = line.substr(4); + if ( firstToken == Constants::SAM_HD_BEGIN_TOKEN) ParseHDLine(restOfLine); + else if ( firstToken == Constants::SAM_SQ_BEGIN_TOKEN) ParseSQLine(restOfLine); + else if ( firstToken == Constants::SAM_RG_BEGIN_TOKEN) ParseRGLine(restOfLine); + else if ( firstToken == Constants::SAM_PG_BEGIN_TOKEN) ParsePGLine(restOfLine); + else if ( firstToken == Constants::SAM_CO_BEGIN_TOKEN) ParseCOLine(restOfLine); + else + cerr << "SamFormatParser ERROR: unknown token: " << firstToken << endl; +} + +void SamFormatParser::ParseHDLine(const string& line) { + + // split HD lines into tokens + vector<string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector<string>::const_iterator tokenIter = tokens.begin(); + vector<string>::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set header contents + if ( tokenTag == Constants::SAM_HD_VERSION_TAG ) m_header.Version = tokenValue; + else if ( tokenTag == Constants::SAM_HD_SORTORDER_TAG ) m_header.SortOrder = tokenValue; + else if ( tokenTag == Constants::SAM_HD_GROUPORDER_TAG ) m_header.GroupOrder = tokenValue; + else + cerr << "SamFormatParser ERROR: unknown HD tag: " << tokenTag << endl; + } + + // if @HD line exists, VN must be provided + if ( !m_header.HasVersion() ) + cerr << "SamFormatParser ERROR: @HD line is missing VN tag" << endl; +} + +void SamFormatParser::ParseSQLine(const string& line) { + + SamSequence seq; + + // split SQ line into tokens + vector<string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + 
vector<string>::const_iterator tokenIter = tokens.begin(); + vector<string>::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set sequence contents + if ( tokenTag == Constants::SAM_SQ_NAME_TAG ) seq.Name = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_LENGTH_TAG ) seq.Length = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_ASSEMBLYID_TAG ) seq.AssemblyID = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_CHECKSUM_TAG ) seq.Checksum = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_SPECIES_TAG ) seq.Species = tokenValue; + else if ( tokenTag == Constants::SAM_SQ_URI_TAG ) seq.URI = tokenValue; + else + cerr << "SamFormatParser ERROR: unknown SQ tag: " << tokenTag << endl; + } + + bool isMissingRequiredFields = false; + + // if @SQ line exists, SN must be provided + if ( !seq.HasName() ) { + isMissingRequiredFields = true; + cerr << "SamFormatParser ERROR: @SQ line is missing SN tag" << endl; + } + + // if @SQ line exists, LN must be provided + if ( !seq.HasLength() ) { + isMissingRequiredFields = true; + cerr << "SamFormatParser ERROR: @SQ line is missing LN tag" << endl; + } + + // store SAM sequence entry + if ( !isMissingRequiredFields ) + m_header.Sequences.Add(seq); +} + +void SamFormatParser::ParseRGLine(const string& line) { + + SamReadGroup rg; + + // split string into tokens + vector<string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector<string>::const_iterator tokenIter = tokens.begin(); + vector<string>::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set read group contents + if ( tokenTag == Constants::SAM_RG_ID_TAG ) rg.ID = tokenValue; + else if ( 
tokenTag == Constants::SAM_RG_DESCRIPTION_TAG ) rg.Description = tokenValue; + else if ( tokenTag == Constants::SAM_RG_FLOWORDER_TAG ) rg.FlowOrder = tokenValue; + else if ( tokenTag == Constants::SAM_RG_KEYSEQUENCE_TAG ) rg.KeySequence = tokenValue; + else if ( tokenTag == Constants::SAM_RG_LIBRARY_TAG ) rg.Library = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PLATFORMUNIT_TAG ) rg.PlatformUnit = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG ) rg.PredictedInsertSize = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PRODUCTIONDATE_TAG ) rg.ProductionDate = tokenValue; + else if ( tokenTag == Constants::SAM_RG_PROGRAM_TAG ) rg.Program = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SAMPLE_TAG ) rg.Sample = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQCENTER_TAG ) rg.SequencingCenter = tokenValue; + else if ( tokenTag == Constants::SAM_RG_SEQTECHNOLOGY_TAG ) rg.SequencingTechnology = tokenValue; + else + cerr << "SamFormatParser ERROR: unknown RG tag: " << tokenTag << endl; + } + + bool isMissingRequiredFields = false; + + // if @RG line exists, ID must be provided + if ( !rg.HasID() ) { + isMissingRequiredFields = true; + cerr << "SamFormatParser ERROR: @RG line is missing ID tag" << endl; + } + + // store SAM read group entry + if ( !isMissingRequiredFields ) + m_header.ReadGroups.Add(rg); +} + +void SamFormatParser::ParsePGLine(const string& line) { + + SamProgram pg; + + // split string into tokens + vector<string> tokens = Split(line, Constants::SAM_TAB); + + // iterate over tokens + vector<string>::const_iterator tokenIter = tokens.begin(); + vector<string>::const_iterator tokenEnd = tokens.end(); + for ( ; tokenIter != tokenEnd; ++tokenIter ) { + + // get token tag/value + const string tokenTag = (*tokenIter).substr(0,2); + const string tokenValue = (*tokenIter).substr(3); + + // set program record contents + if ( tokenTag == Constants::SAM_PG_ID_TAG ) pg.ID = tokenValue; + else if ( tokenTag 
== Constants::SAM_PG_NAME_TAG ) pg.Name = tokenValue; + else if ( tokenTag == Constants::SAM_PG_COMMANDLINE_TAG ) pg.CommandLine = tokenValue; + else if ( tokenTag == Constants::SAM_PG_PREVIOUSPROGRAM_TAG ) pg.PreviousProgramID = tokenValue; + else if ( tokenTag == Constants::SAM_PG_VERSION_TAG ) pg.Version = tokenValue; + else + cerr << "SamFormatParser ERROR: unknown PG tag: " << tokenTag << endl; + } + + bool isMissingRequiredFields = false; + + // if @PG line exists, ID must be provided + if ( !pg.HasID() ) { + isMissingRequiredFields = true; + cerr << "SamFormatParser ERROR: @PG line is missing ID tag" << endl; + } + + // store SAM program record + if ( !isMissingRequiredFields ) + m_header.Programs.Add(pg); +} + +void SamFormatParser::ParseCOLine(const string& line) { + // simply add line to comments list + m_header.Comments.push_back(line); +} + +const vector<string> SamFormatParser::Split(const string& line, const char delim) { + vector<string> tokens; + stringstream lineStream(line); + string token; + while ( getline(lineStream, token, delim) ) + tokens.push_back(token); + return tokens; +} diff --git a/src/utils/BamTools/src/api/internal/SamFormatParser_p.h b/src/utils/BamTools/src/api/internal/SamFormatParser_p.h new file mode 100644 index 0000000000000000000000000000000000000000..daabe394f3b440b43cc98ffa2b5ba41427f6a2c1 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamFormatParser_p.h @@ -0,0 +1,62 @@ +// *************************************************************************** +// SamFormatParser.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
+// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for parsing SAM header text into SamHeader object +// *************************************************************************** + +#ifndef SAM_FORMAT_PARSER_H +#define SAM_FORMAT_PARSER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. + +#include <string> +#include <vector> + +namespace BamTools { + +class SamHeader; + +namespace Internal { + +class SamFormatParser { + + // ctor & dtor + public: + SamFormatParser(BamTools::SamHeader& header); + ~SamFormatParser(void); + + // parse text & populate header data + public: + void Parse(const std::string& headerText); + + // internal methods + private: + void ParseSamLine(const std::string& line); + void ParseHDLine(const std::string& line); + void ParseSQLine(const std::string& line); + void ParseRGLine(const std::string& line); + void ParsePGLine(const std::string& line); + void ParseCOLine(const std::string& line); + const std::vector<std::string> Split(const std::string& line, const char delim); + + // data members + private: + SamHeader& m_header; +}; + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_FORMAT_PARSER_H diff --git a/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.cpp b/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1e670b0155be479d88d9ed6d21b1c406600313af --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.cpp @@ -0,0 +1,211 @@ +// *************************************************************************** +// SamFormatPrinter.cpp 
(c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 19 April 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#include <api/SamConstants.h> +#include <api/SamHeader.h> +#include <api/internal/SamFormatPrinter_p.h> +using namespace BamTools; +using namespace BamTools::Internal; + +#include <iostream> +#include <sstream> +#include <vector> +using namespace std; + +SamFormatPrinter::SamFormatPrinter(const SamHeader& header) + : m_header(header) +{ } + +SamFormatPrinter::~SamFormatPrinter(void) { } + +const string SamFormatPrinter::FormatTag(const string &tag, const string &value) const { + return string(Constants::SAM_TAB + tag + Constants::SAM_COLON + value); +} + +const string SamFormatPrinter::ToString(void) const { + + // clear out stream + stringstream out(""); + + // generate formatted header text + PrintHD(out); + PrintSQ(out); + PrintRG(out); + PrintPG(out); + PrintCO(out); + + // return result + return out.str(); +} + +void SamFormatPrinter::PrintHD(std::stringstream& out) const { + + // if header has @HD data + if ( m_header.HasVersion() ) { + + // @HD VN:<Version> + out << Constants::SAM_HD_BEGIN_TOKEN + << FormatTag(Constants::SAM_HD_VERSION_TAG, m_header.Version); + + // SO:<SortOrder> + if ( m_header.HasSortOrder() ) + out << FormatTag(Constants::SAM_HD_SORTORDER_TAG, m_header.SortOrder); + + // GO:<GroupOrder> + if ( m_header.HasGroupOrder() ) + out << FormatTag(Constants::SAM_HD_GROUPORDER_TAG, m_header.GroupOrder); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintSQ(std::stringstream& out) const { + + // iterate over sequence entries + SamSequenceConstIterator seqIter = 
m_header.Sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = m_header.Sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + + // @SQ SN:<Name> LN:<Length> + out << Constants::SAM_SQ_BEGIN_TOKEN + << FormatTag(Constants::SAM_SQ_NAME_TAG, seq.Name) + << FormatTag(Constants::SAM_SQ_LENGTH_TAG, seq.Length); + + // AS:<AssemblyID> + if ( seq.HasAssemblyID() ) + out << FormatTag(Constants::SAM_SQ_ASSEMBLYID_TAG, seq.AssemblyID); + + // M5:<Checksum> + if ( seq.HasChecksum() ) + out << FormatTag(Constants::SAM_SQ_CHECKSUM_TAG, seq.Checksum); + + // SP:<Species> + if ( seq.HasSpecies() ) + out << FormatTag(Constants::SAM_SQ_SPECIES_TAG, seq.Species); + + // UR:<URI> + if ( seq.HasURI() ) + out << FormatTag(Constants::SAM_SQ_URI_TAG, seq.URI); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintRG(std::stringstream& out) const { + + // iterate over read group entries + SamReadGroupConstIterator rgIter = m_header.ReadGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = m_header.ReadGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + + // @RG ID:<ID> + out << Constants::SAM_RG_BEGIN_TOKEN + << FormatTag(Constants::SAM_RG_ID_TAG, rg.ID); + + // CN:<SequencingCenter> + if ( rg.HasSequencingCenter() ) + out << FormatTag(Constants::SAM_RG_SEQCENTER_TAG, rg.SequencingCenter); + + // DS:<Description> + if ( rg.HasDescription() ) + out << FormatTag(Constants::SAM_RG_DESCRIPTION_TAG, rg.Description); + + // DT:<ProductionDate> + if ( rg.HasProductionDate() ) + out << FormatTag(Constants::SAM_RG_PRODUCTIONDATE_TAG, rg.ProductionDate); + + // FO:<FlowOrder> + if ( rg.HasFlowOrder() ) + out << FormatTag(Constants::SAM_RG_FLOWORDER_TAG, rg.FlowOrder); + + // KS:<KeySequence> + if ( rg.HasKeySequence() ) + out << FormatTag(Constants::SAM_RG_KEYSEQUENCE_TAG, rg.KeySequence); + + // LB:<Library> + if ( rg.HasLibrary() ) + out << 
FormatTag(Constants::SAM_RG_LIBRARY_TAG, rg.Library); + + // PG:<Program> + if ( rg.HasProgram() ) + out << FormatTag(Constants::SAM_RG_PROGRAM_TAG, rg.Program); + + // PI:<PredictedInsertSize> + if ( rg.HasPredictedInsertSize() ) + out << FormatTag(Constants::SAM_RG_PREDICTEDINSERTSIZE_TAG, rg.PredictedInsertSize); + + // PL:<SequencingTechnology> + if ( rg.HasSequencingTechnology() ) + out << FormatTag(Constants::SAM_RG_SEQTECHNOLOGY_TAG, rg.SequencingTechnology); + + // PU:<PlatformUnit> + if ( rg.HasPlatformUnit() ) + out << FormatTag(Constants::SAM_RG_PLATFORMUNIT_TAG, rg.PlatformUnit); + + // SM:<Sample> + if ( rg.HasSample() ) + out << FormatTag(Constants::SAM_RG_SAMPLE_TAG, rg.Sample); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintPG(std::stringstream& out) const { + + // iterate over program record entries + SamProgramConstIterator pgIter = m_header.Programs.ConstBegin(); + SamProgramConstIterator pgEnd = m_header.Programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // @PG ID:<ID> + out << Constants::SAM_PG_BEGIN_TOKEN + << FormatTag(Constants::SAM_PG_ID_TAG, pg.ID); + + // PN:<Name> + if ( pg.HasName() ) + out << FormatTag(Constants::SAM_PG_NAME_TAG, pg.Name); + + // CL:<CommandLine> + if ( pg.HasCommandLine() ) + out << FormatTag(Constants::SAM_PG_COMMANDLINE_TAG, pg.CommandLine); + + // PP:<PreviousProgramID> + if ( pg.HasPreviousProgramID() ) + out << FormatTag(Constants::SAM_PG_PREVIOUSPROGRAM_TAG, pg.PreviousProgramID); + + // VN:<Version> + if ( pg.HasVersion() ) + out << FormatTag(Constants::SAM_PG_VERSION_TAG, pg.Version); + + // newline + out << endl; + } +} + +void SamFormatPrinter::PrintCO(std::stringstream& out) const { + + // iterate over comments + vector<string>::const_iterator commentIter = m_header.Comments.begin(); + vector<string>::const_iterator commentEnd = m_header.Comments.end(); + for ( ; commentIter != commentEnd; ++commentIter ) { + + // @CO <Comment> + out << 
Constants::SAM_CO_BEGIN_TOKEN + << Constants::SAM_TAB + << (*commentIter) + << endl; + } +} diff --git a/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.h b/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.h new file mode 100644 index 0000000000000000000000000000000000000000..5e28e9717c7d6a030b1f82e1968c742a433a5643 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamFormatPrinter_p.h @@ -0,0 +1,61 @@ +// *************************************************************************** +// SamFormatPrinter.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 23 December 2010 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for printing formatted SAM header to string +// *************************************************************************** + +#ifndef SAM_FORMAT_PRINTER_H +#define SAM_FORMAT_PRINTER_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
// Internal helper that renders a SamHeader as SAM-formatted header text.
// The header is held by const reference and is never modified; it must
// outlive the printer.
class SamFormatPrinter {

    // ctor & dtor
    public:
        // @header is the object ToString() reads from
        SamFormatPrinter(const BamTools::SamHeader& header);
        ~SamFormatPrinter(void);

    // generates SAM-formatted string from header data
    public:
        // returns the full header text; record types are emitted in the
        // order @HD, @SQ, @RG, @PG, @CO
        const std::string ToString(void) const;

    // internal methods
    private:
        // returns <TAB><tag><COLON><value>
        const std::string FormatTag(const std::string& tag, const std::string& value) const;
        // each Print* method appends the lines of one record type to @out;
        // PrintHD writes nothing when the header has no version
        void PrintHD(std::stringstream& out) const;
        void PrintSQ(std::stringstream& out) const;
        void PrintRG(std::stringstream& out) const;
        void PrintPG(std::stringstream& out) const;
        void PrintCO(std::stringstream& out) const;

    // data members
    private:
        // header being printed (not owned)
        const SamHeader& m_header;
};
namespace BamTools {
namespace Internal {

// Returns true when @lhs and @rhs have the same length and every pair of
// corresponding characters is equal after a *basic* (current C locale,
// single-byte) upper-case conversion.
bool caseInsensitiveCompare(const std::string& lhs, const std::string& rhs) {

    // can omit checking chars if lengths not equal
    // (compare as size_type: no narrowing to int)
    const std::string::size_type length = lhs.length();
    if ( length != rhs.length() )
        return false;

    // toupper() is only defined for EOF and values representable as
    // unsigned char, so convert through unsigned char first; a plain
    // (int) cast of a negative char is undefined behaviour
    for ( std::string::size_type i = 0; i < length; ++i ) {
        const int lhsUpper = std::toupper( static_cast<unsigned char>(lhs[i]) );
        const int rhsUpper = std::toupper( static_cast<unsigned char>(rhs[i]) );
        if ( lhsUpper != rhsUpper )
            return false;
    }

    // otherwise OK
    return true;
}

} // namespace Internal
} // namespace BamTools
currently unused. +// Make validation "version-aware." +// +// ------------------------------------------------------------------------ + +const string SamHeaderValidator::ERROR_PREFIX = "ERROR: "; +const string SamHeaderValidator::WARN_PREFIX = "WARNING: "; +const string SamHeaderValidator::NEWLINE = "\n"; + +SamHeaderValidator::SamHeaderValidator(const SamHeader& header) + : m_header(header) +{ } + +SamHeaderValidator::~SamHeaderValidator(void) { } + +bool SamHeaderValidator::Validate(bool verbose) { + + // validate header components + bool isValid = true; + isValid &= ValidateMetadata(); + isValid &= ValidateSequenceDictionary(); + isValid &= ValidateReadGroupDictionary(); + isValid &= ValidateProgramChain(); + + // report errors if desired + if ( verbose ) { + PrintErrorMessages(); + PrintWarningMessages(); + } + + // return validation status + return isValid; +} + +bool SamHeaderValidator::ValidateMetadata(void) { + bool isValid = true; + isValid &= ValidateVersion(); + isValid &= ValidateSortOrder(); + isValid &= ValidateGroupOrder(); + return isValid; +} + +bool SamHeaderValidator::ValidateVersion(void) { + + const string& version = m_header.Version; + + // warn if version not present + if ( version.empty() ) { + AddWarning("Version (VN) missing. 
Not required, but strongly recommended"); + return true; + } + + // invalid if version does not contain a period + const size_t periodFound = version.find(Constants::SAM_PERIOD); + if ( periodFound == string::npos ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const string majorVersion = version.substr(0, periodFound); + if ( majorVersion.empty() || !ContainsOnlyDigits(majorVersion) ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // invalid if major version is empty or contains non-digits + const string minorVersion = version.substr(periodFound + 1); + if ( minorVersion.empty() || !ContainsOnlyDigits(minorVersion) ) { + AddError("Invalid version (VN) format: " + version); + return false; + } + + // TODO: check if version is not just syntactically OK, + // but is also a valid SAM version ( 1.0 .. CURRENT ) + + // all checked out this far, then version is OK + return true; +} + +// assumes non-empty input string +bool SamHeaderValidator::ContainsOnlyDigits(const string& s) { + const size_t nonDigitPosition = s.find_first_not_of(Constants::SAM_DIGITS); + return ( nonDigitPosition == string::npos ) ; +} + +bool SamHeaderValidator::ValidateSortOrder(void) { + + const string& sortOrder = m_header.SortOrder; + + // warn if sort order not present + if ( sortOrder.empty() ) { + AddWarning("Sort order (SO) missing. 
Not required, but strongly recommended"); + return true; + } + + // if sort order is valid keyword + if ( sortOrder == Constants::SAM_HD_SORTORDER_COORDINATE || + sortOrder == Constants::SAM_HD_SORTORDER_QUERYNAME || + sortOrder == Constants::SAM_HD_SORTORDER_UNSORTED + ) + { + return true; + } + + // otherwise + AddError("Invalid sort order (SO): " + sortOrder); + return false; +} + +bool SamHeaderValidator::ValidateGroupOrder(void) { + + const string& groupOrder = m_header.GroupOrder; + + // if no group order, no problem, just return OK + if ( groupOrder.empty() ) + return true; + + // if group order is valid keyword + if ( groupOrder == Constants::SAM_HD_GROUPORDER_NONE || + groupOrder == Constants::SAM_HD_GROUPORDER_QUERY || + groupOrder == Constants::SAM_HD_GROUPORDER_REFERENCE + ) + { + return true; + } + + // otherwise + AddError("Invalid group order (GO): " + groupOrder); + return false; +} + +bool SamHeaderValidator::ValidateSequenceDictionary(void) { + + bool isValid = true; + + // check for unique sequence names + isValid &= ContainsUniqueSequenceNames(); + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + isValid &= ValidateSequence(seq); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::ContainsUniqueSequenceNames(void) { + + bool isValid = true; + set<string> sequenceNames; + set<string>::iterator nameIter; + + // iterate over sequences + const SamSequenceDictionary& sequences = m_header.Sequences; + SamSequenceConstIterator seqIter = sequences.ConstBegin(); + SamSequenceConstIterator seqEnd = sequences.ConstEnd(); + for ( ; seqIter != seqEnd; ++seqIter ) { + const SamSequence& seq = (*seqIter); + + // lookup sequence name + const string& name = seq.Name; + nameIter = 
sequenceNames.find(name); + + // error if found (duplicate entry) + if ( nameIter != sequenceNames.end() ) { + AddError("Sequence name (SN): " + name + " is not unique"); + isValid = false; + } + + // otherwise ok, store name + sequenceNames.insert(name); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::ValidateSequence(const SamSequence& seq) { + bool isValid = true; + isValid &= CheckNameFormat(seq.Name); + isValid &= CheckLengthInRange(seq.Length); + return isValid; +} + +bool SamHeaderValidator::CheckNameFormat(const string& name) { + + // invalid if name is empty + if ( name.empty() ) { + AddError("Sequence entry (@SQ) is missing SN tag"); + return false; + } + + // invalid if first character is a reserved char + const char firstChar = name.at(0); + if ( firstChar == Constants::SAM_EQUAL || firstChar == Constants::SAM_STAR ) { + AddError("Invalid sequence name (SN): " + name); + return false; + } + // otherwise OK + return true; +} + +bool SamHeaderValidator::CheckLengthInRange(const string& length) { + + // invalid if empty + if ( length.empty() ) { + AddError("Sequence entry (@SQ) is missing LN tag"); + return false; + } + + // convert string length to numeric + stringstream lengthStream(length); + unsigned int sequenceLength; + lengthStream >> sequenceLength; + + // invalid if length outside accepted range + if ( sequenceLength < Constants::SAM_SQ_LENGTH_MIN || sequenceLength > Constants::SAM_SQ_LENGTH_MAX ) { + AddError("Sequence length (LN): " + length + " out of range"); + return false; + } + + // otherwise OK + return true; +} + +bool SamHeaderValidator::ValidateReadGroupDictionary(void) { + + bool isValid = true; + + // check for unique read group IDs & platform units + isValid &= ContainsUniqueIDsAndPlatformUnits(); + + // iterate over read groups + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = 
readGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + isValid &= ValidateReadGroup(rg); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::ContainsUniqueIDsAndPlatformUnits(void) { + + bool isValid = true; + set<string> readGroupIds; + set<string> platformUnits; + set<string>::iterator idIter; + set<string>::iterator puIter; + + // iterate over sequences + const SamReadGroupDictionary& readGroups = m_header.ReadGroups; + SamReadGroupConstIterator rgIter = readGroups.ConstBegin(); + SamReadGroupConstIterator rgEnd = readGroups.ConstEnd(); + for ( ; rgIter != rgEnd; ++rgIter ) { + const SamReadGroup& rg = (*rgIter); + + // -------------------------------- + // check for unique ID + + // lookup read group ID + const string& id = rg.ID; + idIter = readGroupIds.find(id); + + // error if found (duplicate entry) + if ( idIter != readGroupIds.end() ) { + AddError("Read group ID (ID): " + id + " is not unique"); + isValid = false; + } + + // otherwise ok, store id + readGroupIds.insert(id); + + // -------------------------------- + // check for unique platform unit + + // lookup platform unit + const string& pu = rg.PlatformUnit; + puIter = platformUnits.find(pu); + + // error if found (duplicate entry) + if ( puIter != platformUnits.end() ) { + AddError("Platform unit (PU): " + pu + " is not unique"); + isValid = false; + } + + // otherwise ok, store platform unit + platformUnits.insert(pu); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::ValidateReadGroup(const SamReadGroup& rg) { + bool isValid = true; + isValid &= CheckReadGroupID(rg.ID); + isValid &= CheckSequencingTechnology(rg.SequencingTechnology); + return isValid; +} + +bool SamHeaderValidator::CheckReadGroupID(const string& id) { + + // invalid if empty + if ( id.empty() ) { + AddError("Read group entry (@RG) is missing ID tag"); + return false; + } + + // otherwise OK + return true; +} + +bool 
SamHeaderValidator::CheckSequencingTechnology(const string& technology) { + + // if no technology provided, no problem, just return OK + if ( technology.empty() ) + return true; + + // if technology is valid keyword + if ( caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_CAPILLARY) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_HELICOS) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_ILLUMINA) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_IONTORRENT) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_LS454) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_PACBIO) || + caseInsensitiveCompare(technology, Constants::SAM_RG_SEQTECHNOLOGY_SOLID) + ) + { + return true; + } + + // otherwise + AddError("Invalid read group sequencing platform (PL): " + technology); + return false; +} + +bool SamHeaderValidator::ValidateProgramChain(void) { + bool isValid = true; + isValid &= ContainsUniqueProgramIds(); + isValid &= ValidatePreviousProgramIds(); + return isValid; +} + +bool SamHeaderValidator::ContainsUniqueProgramIds(void) { + + bool isValid = true; + set<string> programIds; + set<string>::iterator pgIdIter; + + // iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // lookup program ID + const string& pgId = pg.ID; + pgIdIter = programIds.find(pgId); + + // error if found (duplicate entry) + if ( pgIdIter != programIds.end() ) { + AddError("Program ID (ID): " + pgId + " is not unique"); + isValid = false; + } + + // otherwise ok, store ID + programIds.insert(pgId); + } + + // return validation state + return isValid; +} + +bool SamHeaderValidator::ValidatePreviousProgramIds(void) { + + bool isValid = true; + + // 
iterate over program records + const SamProgramChain& programs = m_header.Programs; + SamProgramConstIterator pgIter = programs.ConstBegin(); + SamProgramConstIterator pgEnd = programs.ConstEnd(); + for ( ; pgIter != pgEnd; ++pgIter ) { + const SamProgram& pg = (*pgIter); + + // ignore record for validation if PreviousProgramID is empty + const string& ppId = pg.PreviousProgramID; + if ( ppId.empty() ) + continue; + + // see if program "chain" contains an entry for ppId + if ( !programs.Contains(ppId) ) { + AddError("PreviousProgramID (PP): " + ppId + " is not a known ID"); + isValid = false; + } + } + + // return validation state + return isValid; +} +void SamHeaderValidator::AddError(const string& message) { + m_errorMessages.push_back(ERROR_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::AddWarning(const string& message) { + m_warningMessages.push_back(WARN_PREFIX + message + NEWLINE); +} + +void SamHeaderValidator::PrintErrorMessages(void) { + + // skip if no error messages + if ( m_errorMessages.empty() ) return; + + // print error header line + cerr << "* SAM header has " << m_errorMessages.size() << " errors:" << endl; + + // print each error message + vector<string>::const_iterator errorIter = m_errorMessages.begin(); + vector<string>::const_iterator errorEnd = m_errorMessages.end(); + for ( ; errorIter != errorEnd; ++errorIter ) + cerr << (*errorIter); +} + +void SamHeaderValidator::PrintWarningMessages(void) { + + // skip if no warning messages + if ( m_warningMessages.empty() ) return; + + // print warning header line + cerr << "* SAM header has " << m_warningMessages.size() << " warnings:" << endl; + + // print each warning message + vector<string>::const_iterator warnIter = m_warningMessages.begin(); + vector<string>::const_iterator warnEnd = m_warningMessages.end(); + for ( ; warnIter != warnEnd; ++warnIter ) + cerr << (*warnIter); +} diff --git a/src/utils/BamTools/src/api/internal/SamHeaderValidator_p.h 
b/src/utils/BamTools/src/api/internal/SamHeaderValidator_p.h new file mode 100644 index 0000000000000000000000000000000000000000..06a82abeaa04d9ee1043ba89f47c41703ed12160 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamHeaderValidator_p.h @@ -0,0 +1,102 @@ +// *************************************************************************** +// SamHeaderValidator.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 13 January 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for validating SamHeader data +// *************************************************************************** + +#ifndef SAM_HEADER_VALIDATOR_P_H +#define SAM_HEADER_VALIDATOR_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
// Internal helper that checks the contents of a SamHeader and collects
// error and warning messages along the way. The header is held by const
// reference and is never modified; it must outlive the validator.
class SamHeaderValidator {

    // ctor & dtor
    public:
        // @header is the object Validate() inspects
        SamHeaderValidator(const SamHeader& header);
        ~SamHeaderValidator(void);

    // SamHeaderValidator interface
    public:
        // validates SamHeader data, returns true/false accordingly
        // prints error & warning messages to stderr when @verbose is true
        bool Validate(bool verbose = false);

    // internal methods
    private:

        // validate header metadata
        // (a missing version or sort order only adds a warning)
        bool ValidateMetadata(void);
        bool ValidateVersion(void);
        // true when @s holds no character outside Constants::SAM_DIGITS
        bool ContainsOnlyDigits(const std::string& s);
        bool ValidateSortOrder(void);
        bool ValidateGroupOrder(void);

        // validate sequence dictionary
        bool ValidateSequenceDictionary(void);
        bool ContainsUniqueSequenceNames(void);
        // name must be present and must not start with '=' or '*'
        bool CheckNameFormat(const std::string& name);
        bool ValidateSequence(const SamSequence& seq);
        // length must be present and lie within
        // [SAM_SQ_LENGTH_MIN, SAM_SQ_LENGTH_MAX]
        bool CheckLengthInRange(const std::string& length);

        // validate read group dictionary
        bool ValidateReadGroupDictionary(void);
        bool ContainsUniqueIDsAndPlatformUnits(void);
        bool ValidateReadGroup(const SamReadGroup& rg);
        bool CheckReadGroupID(const std::string& id);
        // an empty technology is accepted; otherwise it must match a known
        // platform keyword, compared case-insensitively
        bool CheckSequencingTechnology(const std::string& technology);

        // validate program data
        bool ValidateProgramChain(void);
        bool ContainsUniqueProgramIds(void);
        // every non-empty PP tag must name a record in the program chain
        bool ValidatePreviousProgramIds(void);

        // error reporting
        // AddError/AddWarning store @message wrapped in the matching prefix
        // and NEWLINE; the Print* methods write the stored messages to stderr
        void AddError(const std::string& message);
        void AddWarning(const std::string& message);
        void PrintErrorMessages(void);
        void PrintWarningMessages(void);

    // data members
    private:

        // SamHeader being validated
        const SamHeader& m_header;

        // error reporting helpers
        static const std::string ERROR_PREFIX;
        static const std::string WARN_PREFIX;
        static const std::string NEWLINE;

        // error reporting messages
        std::vector<std::string> m_errorMessages;
        std::vector<std::string> m_warningMessages;
};
+} // namespace Internal +} // namespace BamTools + +#endif // SAM_HEADER_VALIDATOR_P_H diff --git a/src/utils/BamTools/src/api/internal/SamHeaderVersion_p.h b/src/utils/BamTools/src/api/internal/SamHeaderVersion_p.h new file mode 100644 index 0000000000000000000000000000000000000000..256401739ece8c3db2713f5b60a1eb212902ac23 --- /dev/null +++ b/src/utils/BamTools/src/api/internal/SamHeaderVersion_p.h @@ -0,0 +1,135 @@ +// *************************************************************************** +// SamHeaderVersion.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. +// --------------------------------------------------------------------------- +// Last modified: 24 February 2011 (DB) +// --------------------------------------------------------------------------- +// Provides functionality for comparing SAM header versions +// ************************************************************************* + +#ifndef SAM_HEADERVERSION_P_H +#define SAM_HEADERVERSION_P_H + +// ------------- +// W A R N I N G +// ------------- +// +// This file is not part of the BamTools API. It exists purely as an +// implementation detail. This header file may change from version to version +// without notice, or even be removed. +// +// We mean it. 
+ +#include <api/SamConstants.h> +#include <sstream> +#include <string> + +namespace BamTools { +namespace Internal { + +class SamHeaderVersion { + + // ctors & dtor + public: + SamHeaderVersion(void) + : m_majorVersion(0) + , m_minorVersion(0) + { } + + explicit SamHeaderVersion(const std::string& version) + : m_majorVersion(0) + , m_minorVersion(0) + { + SetVersion(version); + } + + SamHeaderVersion(const unsigned int& major, const unsigned int& minor) + : m_majorVersion(major) + , m_minorVersion(minor) + { } + + ~SamHeaderVersion(void) { + m_majorVersion = 0; + m_minorVersion = 0; + } + + // acess data + public: + unsigned int MajorVersion(void) const { return m_majorVersion; } + unsigned int MinorVersion(void) const { return m_minorVersion; } + + void SetVersion(const std::string& version); + std::string ToString(void) const; + + // data members + private: + unsigned int m_majorVersion; + unsigned int m_minorVersion; +}; + +inline +void SamHeaderVersion::SetVersion(const std::string& version) { + + // do nothing if version is empty + if ( !version.empty() ) { + + std::stringstream versionStream(""); + + // do nothing if period not found + const size_t periodFound = version.find(Constants::SAM_PERIOD); + if ( periodFound != std::string::npos ) { + + // store major version if non-empty and contains only digits + const std::string& majorVersion = version.substr(0, periodFound); + versionStream.str(majorVersion); + if ( !majorVersion.empty() ) { + const size_t nonDigitFound = majorVersion.find_first_not_of(Constants::SAM_DIGITS); + if ( nonDigitFound == std::string::npos ) + versionStream >> m_majorVersion; + } + + // store minor version if non-empty and contains only digits + const std::string& minorVersion = version.substr(periodFound + 1); + versionStream.str(minorVersion); + if ( !minorVersion.empty() ) { + const size_t nonDigitFound = minorVersion.find_first_not_of(Constants::SAM_DIGITS); + if ( nonDigitFound == std::string::npos ) + versionStream >> 
m_minorVersion; + } + } + } +} + +// ----------------------------------------------------- +// printing + +inline std::string SamHeaderVersion::ToString(void) const { + std::stringstream version; + version << m_majorVersion << Constants::SAM_PERIOD << m_minorVersion; + return version.str(); +} + +// ----------------------------------------------------- +// comparison operators + +inline bool operator==(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { + return (lhs.MajorVersion() == rhs.MajorVersion()) && + (lhs.MinorVersion() == rhs.MinorVersion()); +} + +inline bool operator<(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { + if ( lhs.MajorVersion() == rhs.MajorVersion() ) + return lhs.MinorVersion() < rhs.MinorVersion(); + else + return lhs.MajorVersion() < rhs.MajorVersion(); +} + +inline bool operator> (const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return rhs < lhs; } +inline bool operator<=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs>rhs); } +inline bool operator>=(const SamHeaderVersion& lhs, const SamHeaderVersion& rhs) { return !(lhs<rhs); } + +} // namespace Internal +} // namespace BamTools + +#endif // SAM_HEADERVERSION_P_H diff --git a/src/utils/BamTools/src/shared/bamtools_global.h b/src/utils/BamTools/src/shared/bamtools_global.h new file mode 100644 index 0000000000000000000000000000000000000000..6e3cb39e5a69fca28ef2196cdb0407671e8805c0 --- /dev/null +++ b/src/utils/BamTools/src/shared/bamtools_global.h @@ -0,0 +1,79 @@ +// *************************************************************************** +// bamtools_global.h (c) 2010 Derek Barnett +// Marth Lab, Department of Biology, Boston College +// All rights reserved. 
#ifndef BAMTOOLS_GLOBAL_H
#define BAMTOOLS_GLOBAL_H

// NOTE: Windows is detected through _WIN32, which every Windows compiler
// predefines. WIN32 is only defined by the SDK headers or by the build
// system, so it is still honoured but no longer relied upon.

/*! \brief Library export macro
    \internal
*/
#ifndef BAMTOOLS_LIBRARY_EXPORT
#  if defined(_WIN32) || defined(WIN32)
#    define BAMTOOLS_LIBRARY_EXPORT __declspec(dllexport)
#  else
#    define BAMTOOLS_LIBRARY_EXPORT __attribute__((visibility("default")))
#  endif
#endif // BAMTOOLS_LIBRARY_EXPORT

/*! \brief Library import macro
    \internal
*/
#ifndef BAMTOOLS_LIBRARY_IMPORT
#  if defined(_WIN32) || defined(WIN32)
#    define BAMTOOLS_LIBRARY_IMPORT __declspec(dllimport)
#  else
#    define BAMTOOLS_LIBRARY_IMPORT
#  endif
#endif // BAMTOOLS_LIBRARY_IMPORT

/*! \brief Platform-specific large-file-support (64-bit offset) definitions
    \internal
*/
#ifndef BAMTOOLS_LFS
#define BAMTOOLS_LFS
    #if defined(_WIN32) || defined(WIN32)
        #define ftell64(a)     _ftelli64(a)
        #define fseek64(a,b,c) _fseeki64(a,b,c)
    #else
        #define ftell64(a)     ftello(a)
        #define fseek64(a,b,c) fseeko(a,b,c)
    #endif
#endif // BAMTOOLS_LFS

/*! \def ftell64(a)
    \brief Platform-independent tell() operation.
    \internal
*/
/*! \def fseek64(a,b,c)
    \brief Platform-independent seek() operation.
    \internal
*/

/*! \brief Platform-specific type definitions
    \internal
*/
#ifndef BAMTOOLS_TYPES
#define BAMTOOLS_TYPES
    // <stdint.h> ships with MSVC since Visual Studio 2010 (_MSC_VER 1600);
    // only older MSVC releases need hand-written typedefs. Declaring them
    // next to the real header would clash with its definitions.
    #if defined(_MSC_VER) && (_MSC_VER < 1600)
        // 'signed char', not plain 'char': the signedness of plain char
        // is implementation-defined (and switchable with /J)
        typedef signed char        int8_t;
        typedef unsigned char      uint8_t;
        typedef short              int16_t;
        typedef unsigned short     uint16_t;
        typedef int                int32_t;
        typedef unsigned int       uint32_t;
        typedef long long          int64_t;
        typedef unsigned long long uint64_t;
    #else
        #include <stdint.h>
    #endif
#endif // BAMTOOLS_TYPES

#endif // BAMTOOLS_GLOBAL_H