From b7ae3a4abd5af303d1e4325e8b8ba6ee0525163f Mon Sep 17 00:00:00 2001 From: Aaron <aaronquinlan@gmail.com> Date: Mon, 22 Oct 2012 15:29:09 -0400 Subject: [PATCH] standardize tag reporting for bamtobed --- src/bamToBed/bamToBed.cpp | 250 ++++++++++++++++++++++---------------- 1 file changed, 143 insertions(+), 107 deletions(-) diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index 281b9558..f7d73041 100644 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -19,6 +19,7 @@ using namespace BamTools; #include <vector> #include <algorithm> // for swap() #include <iostream> +#include <sstream> #include <fstream> #include <stdlib.h> @@ -35,21 +36,31 @@ using namespace std; // function declarations void bamtobed_help(void); -void ConvertBamToBed(const string &bamFile, bool useEditDistance, const string &bamTag, - bool writeBed12, bool obeySplits, const string &color, - bool useCigar, bool useNovoalign, bool useBWA); +void ConvertBamToBed(const string &bamFile, bool useEditDistance, + const string &bamTag, bool writeBed12, + bool obeySplits, const string &color, + bool useCigar, bool useNovoalign, + bool useBWA); void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, - const string &bamTag, bool obeySplits, bool useCigar, bool useNovoalign, bool useBWA); +string PrintTag(const BamAlignment &bam, const string &tag); + +void PrintBed(const BamAlignment &bam, const RefVector &refs, + bool useEditDistance, const string &bamTag, + bool obeySplits, bool useCigar, + bool useNovoalign, bool useBWA); -void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color = "255,0,0"); +void PrintBed12(const BamAlignment &bam, const RefVector &refs, + bool useEditDistance, const string &bamTag, + string color = "255,0,0"); + void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance); void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockEnds, int &alignmentEnd); + string BuildCigarString(const vector<CigarOp> &cigar); bool bamtobed_IsCorrectMappingForBEDPE (const BamAlignment &bam); @@ -112,6 +123,7 @@ int bamtobed_main(int argc, char* argv[]) { } else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { useEditDistance = true; + tag = "NM"; } else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) { useCigar = true; @@ -147,35 +159,50 @@ int bamtobed_main(int argc, char* argv[]) { // make sure we have an input files if (haveBam == false) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Need -i (BAM) file. " + << endl << "*****" << endl; showHelp = true; } if (haveColor == true && writeBed12 == false) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Cannot use color without BED12. " + << endl << "*****" << endl; showHelp = true; } if (useEditDistance == true && obeySplits == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." + << endl << "*****" << endl; showHelp = true; } if (useEditDistance == true && useCigar == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl + << "*****" << endl; showHelp = true; } if (useEditDistance == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl + << "*****" << endl; showHelp = true; } if (writeBedPE == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl; + cerr << endl << "*****" << endl + << "*****ERROR: Cannot use -tag with -bedpe." << endl + << "*****" << endl; showHelp = true; } // if there are no problems, let's convert BAM to BED or BEDPE if (!showHelp) { if (writeBedPE == false) - ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar, useNovoalign, useBWA); // BED or "blocked BED" + ConvertBamToBed(bamFile, useEditDistance, + tag, writeBed12, + obeySplits, color, + useCigar, useNovoalign, useBWA); else - ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE + ConvertBamToBedpe(bamFile, useEditDistance); } else { bamtobed_help(); @@ -314,10 +341,29 @@ string BuildCigarString(const vector<CigarOp> &cigar) { return cigarString.str(); } +string PrintTag(const BamAlignment &bam, const string &tag) +{ + uint32_t uTagValue; + int32_t sTagValue; + ostringstream value; + if (bam.GetTag(tag, uTagValue)) + value << uTagValue; + else if (bam.GetTag(tag, sTagValue)) + value << sTagValue; + else { + cerr << "The requested tag (" + << tag + << ") was not found in the BAM file. Exiting\n"; + exit(1); + } + return value.str(); +} -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, - const string &bamTag, bool obeySplits, bool useCigar, bool useNovoalign, bool useBWA) { - +void PrintBed(const BamAlignment &bam, const RefVector &refs, + bool useEditDistance, const string &bamTag, + bool obeySplits, bool useCigar, + bool useNovoalign, bool useBWA) +{ // set the strand string strand = "+"; if (bam.IsReverseStrand() == true) strand = "-"; @@ -335,38 +381,22 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista if (!useNovoalign && !useBWA) { // report the alignment in BED6 format. - if (useEditDistance == false && bamTag == "") { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); - } - else if (useEditDistance == true && bamTag == "") { - uint32_t editDistance; - if (bam.GetTag("NM", editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } + if (bamTag == "") { + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << bam.MapQuality << "\t" + << strand; } - else if (useEditDistance == false && bamTag != "") { - uint32_t uTagValue; - int32_t sTagValue; - if (bam.GetTag(bamTag, uTagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), uTagValue, strand.c_str()); - } - else if (bam.GetTag(bamTag, sTagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), sTagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } + else { + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << PrintTag(bam, bamTag) << "\t" + << strand; } - // does the user want CIGAR as well? if (useCigar == false) { printf("\n"); @@ -378,35 +408,28 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista } else if (useNovoalign && !useBWA) { // special BED format for Hydra using Novoalign. - uint32_t editDistance; uint32_t numMappings; - - if (!bam.GetTag("NM", editDistance)) { - cerr << "Unable to extract NM for query: " - << bam.Name - << ". Verify that your BAM was generated by Novoalign." << endl; - exit(1); - } + if (!bam.GetTag("ZN", numMappings)) { // if ZN is missing, this means just one alignment was found. numMappings = 1; } - printf("%s\t%d\t%d\t\%s\t%u_%u_%u\t%s\n", - refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), - bam.MapQuality, editDistance, numMappings, strand.c_str()); + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << bam.MapQuality << "\t" + << PrintTag(bam, "NM") << "\t" + << numMappings << "\t" + << strand + << endl; } else if (!useNovoalign && useBWA) { // special BED format for Hydra using Novoalign. - uint32_t editDistance; uint32_t numMappings; uint32_t x0 = 1; uint32_t x1 = 0; - - if (!bam.GetTag("NM", editDistance)) { - cerr << "Unable to extract NM. Verify that your BAM was generated by Novoalign." << endl; - exit(1); - } + if (!bam.GetTag("X0", x0) && !bam.GetTag("X1", x1)) { // if ZN is missing, this means just one alignment was found. numMappings = 1; @@ -414,14 +437,20 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista else { numMappings = x0 + x1; } - printf("%s\t%d\t%d\t\%s\t%u_%u_%u\t%s\n", - refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), - bam.MapQuality, editDistance, numMappings, strand.c_str()); + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << bam.MapQuality << "\t" + << PrintTag(bam, "NM") << "\t" + << numMappings << "\t" + << strand + << endl; } } // Report each chunk of the BAM alignment as a discrete BED entry - // For example 10M100N10M would be reported as two seprate BED entries of length 10 + // For example 10M100N10M would be reported as two seprate BED entries of + // length 10 else { // parse the CIGAR string and figure out the alignment blocks vector<BED> bedBlocks; @@ -433,8 +462,12 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista for (i = 0; i < bedBlocks.size(); ++i) { BED curr = bedBlocks[i]; printf("%s\t%d\t%d\t\%s\t%d\t%s\n", - chrom.c_str(), curr.start, curr.end, - name.c_str(), bam.MapQuality, strand.c_str()); + chrom.c_str(), + curr.start, + curr.end, + name.c_str(), + bam.MapQuality, + strand.c_str()); } } } @@ -458,36 +491,26 @@ void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDist // extract the block starts and lengths from the CIGAR string GetBamBlocks(bam, chrom, bedBlocks, false, true); - // write BED6 portion - if (useEditDistance == false && bamTag == "") { - printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); - } - else if (useEditDistance == true && bamTag != "") { - uint32_t editDistance; - if (bam.GetTag("NM", editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } + if (bamTag == "") { + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << bam.MapQuality << "\t" + << strand << "\t"; } - else if (useEditDistance == false && bamTag != "") { - int32_t tagValue; - if (bam.GetTag(bamTag, tagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), tagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } + else { + cout << refs.at(bam.RefID).RefName << "\t" + << bam.Position << "\t" + << alignmentEnd << "\t" + << name << "\t" + << PrintTag(bam, bamTag) << "\t" + << strand << "\t"; } // write the colors, etc. - printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) bedBlocks.size()); + printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, + color.c_str(), (int) bedBlocks.size()); // now write the lengths portion unsigned int b; @@ -525,7 +548,9 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec // extract the edit distance from the NM tag // if possible. otherwise, complain. - if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { + if (useEditDistance == true && + bam1.GetTag("NM", editDistance1) == false) + { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } @@ -541,7 +566,9 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec // extract the edit distance from the NM tag // if possible. otherwise, complain. - if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { + if (useEditDistance == true && + bam2.GetTag("NM", editDistance2) == false) + { cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; exit(1); } @@ -561,9 +588,12 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec if (bam1.IsMapped() == true && bam2.IsMapped() == true) minMapQuality = min(bam1.MapQuality, bam2.MapQuality); - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, + end1, chrom2.c_str(), + start2, end2, + bam1.Name.c_str(), minMapQuality, + strand1.c_str(), strand2.c_str()); } // report BEDPE using total edit distance else { @@ -575,9 +605,12 @@ void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVec else if (bam2.IsMapped() == true) totalEditDistance = editDistance2; - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); + printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, + end1, chrom2.c_str(), + start2, end2, + bam1.Name.c_str(), totalEditDistance, + strand1.c_str(), strand2.c_str()); } } @@ -588,7 +621,10 @@ bool bamtobed_IsCorrectMappingForBEDPE (const BamAlignment &bam) { if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { return true; } - else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { + else if ( (bam.RefID == bam.MateRefID) && + (bam.InsertSize == 0) && + bam.IsFirstMate() ) + { return true; } else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { -- GitLab