diff --git a/.cproject b/.cproject old mode 100644 new mode 100755 diff --git a/.project b/.project old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/RELEASE_HISTORY b/RELEASE_HISTORY old mode 100644 new mode 100755 diff --git a/data/knownGene.hg18.chr21.bed b/data/knownGene.hg18.chr21.bed old mode 100644 new mode 100755 diff --git a/genomes/human.hg18.genome b/genomes/human.hg18.genome old mode 100644 new mode 100755 diff --git a/genomes/human.hg19.genome b/genomes/human.hg19.genome old mode 100644 new mode 100755 diff --git a/genomes/mouse.mm8.genome b/genomes/mouse.mm8.genome old mode 100644 new mode 100755 diff --git a/genomes/mouse.mm9.genome b/genomes/mouse.mm9.genome old mode 100644 new mode 100755 diff --git a/src/bamFillMateSeq/Makefile b/src/bamFillMateSeq/Makefile deleted file mode 100755 index da902162427ce42bc8ef6cee9f019db69fa46b7b..0000000000000000000000000000000000000000 --- a/src/bamFillMateSeq/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -CXX= g++ -CXXFLAGS= -Wall -g -LIBS= -lz -UTILITIES_DIR = ../utils/ -OBJ_DIR = ../../obj/ -BIN_DIR = ../../bin/ - -# ------------------- -# define our includes -# ------------------- -INCLUDES = -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/sequenceUtilities/ -I$(UTILITIES_DIR)/version/ - -# ---------------------------------- -# define our source and object files -# ---------------------------------- -SOURCES= bamFillMateSeq.cpp -OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=BamReader.o BamWriter.o sequenceUtils.o BGZF.o -EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) -BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) -PROGRAM= bamFillMateSeq - - -all: $(PROGRAM) - -.PHONY: all - - -$(PROGRAM): $(BUILT_OBJECTS) $(EXT_OBJECTS) - @echo " * linking $(PROGRAM)" - @$(CXX) $(LDFLAGS) $(CXXFLAGS) -o $(BIN_DIR)/$@ $^ $(LIBS) - -$(BUILT_OBJECTS): $(SOURCES) - @echo " * compiling" $(*F).cpp - @$(CXX) -c -o $@ $(*F).cpp $(LDFLAGS) $(CXXFLAGS) $(INCLUDES) - -$(EXT_OBJECTS): - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/sequenceUtilities/ - @$(MAKE) --no-print-directory -C $(UTILITIES_DIR)/BamTools/ - -clean: - @echo "Cleaning up." - @rm -f $(OBJ_DIR)/* $(BIN_DIR)/* - -.PHONY: clean diff --git a/src/bamFillMateSeq/bamFillMateSeq.cpp b/src/bamFillMateSeq/bamFillMateSeq.cpp deleted file mode 100755 index f691ecf28f0e693e9347f4954d29040a2a4d65c0..0000000000000000000000000000000000000000 --- a/src/bamFillMateSeq/bamFillMateSeq.cpp +++ /dev/null @@ -1,294 +0,0 @@ -/***************************************************************************** - bamToBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0+ license. -******************************************************************************/ -#include "version.h" -#include "sequenceUtils.h" -#include "BamReader.h" -#include "BamWriter.h" -#include "BamAux.h" -using namespace BamTools; - -#include <vector> -#include <iostream> -#include <fstream> -#include <stdlib.h> - -using namespace std; - - -// define our program name -#define PROGRAM_NAME "bamFillMateSeq" - -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) - -// function declarations -void ShowHelp(void); -void GetSeqsAndQuals(const vector<BamAlignment> &alignments, - string &seq1, - string &qual1, - string &seq, - string &qual2); -void FillMateSeqForReadBlock (const vector<BamAlignment> &alignments, - const RefVector &refs, - BamWriter &writer, - const string &seq1, - const string &qual1, - const string &seq, - const string &qual2); -int strnum_cmp(const char *a, const char *b); - - -int main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - bool haveBam = false; - - // input files - string bamFile; - - // check to see if we should print out some help - if(argc <= 1) showHelp = true; - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - haveBam = true; - bamFile = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (!haveBam ) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; - showHelp = true; - } - - if (!showHelp) { - - // open the BAM file - BamReader reader; - reader.Open(bamFile); - BamWriter writer; - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open our BAM writer - writer.Open("stdout", header, refs); - - // track the previous and current sequence - // names so that we can identify blocks of - // alignments for a given read ID. - string prevName = ""; - string currName = ""; - - vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file. - alignments.reserve(100); - - BamAlignment bam; - string seq1, qual1, seq2, qual2; - // get each set of alignments for each pair. - while (reader.GetNextAlignment(bam)) { - - currName = bam.Name; - //printf("%s\t%s\n", currName.c_str(), prevName.c_str()); - if ( (strcmp(currName.c_str(), prevName.c_str()) != 0) && (prevName != "") ) { - //printf("yep\n"); - // block change. enforce expected sort order. - if (strnum_cmp(currName.c_str(), prevName.c_str()) == -1) { - cerr << "Error: The BAM file (" << bamFile << ") is not sorted by sequence name. Exiting!" << endl; - exit (1); - } - else { - // process the current block of alignments - GetSeqsAndQuals(alignments, seq1, qual1, seq2, qual2); - FillMateSeqForReadBlock(alignments, refs, writer, seq1, qual1, seq2, qual2); - - // reset the alignment vector for the next block - // and add the first alignment in the next block - alignments.clear(); - alignments.push_back(bam); - } - } - else { - alignments.push_back(bam); - } - prevName = currName; - } - // process the last block - GetSeqsAndQuals(alignments, seq1, qual1, seq2, qual2); - FillMateSeqForReadBlock(alignments, refs, writer, seq1, qual1, seq2, qual2); - - // close up - reader.Close(); - writer.Close(); - } - else { - ShowHelp(); - } -} - - -void GetSeqsAndQuals(const vector<BamAlignment> &alignments, - string &seq1, - string &qual1, - string &seq2, - string &qual2) { - - bool haveSeq1, haveSeq2, haveQual1, haveQual2; - haveSeq1 = haveSeq2 = haveQual1 = haveQual2 = false; - - // extract the sequences for the first and second mates - - vector<BamAlignment>::const_iterator alItr = alignments.begin(); - vector<BamAlignment>::const_iterator alEnd = alignments.end(); - while ( (alItr != alEnd) && (haveSeq1 == false || haveSeq2 == false || - haveQual1 == false || haveQual2 == false)) { - - if ( alItr->IsFirstMate() ) { - seq1 = alItr->QueryBases; - qual1 = alItr->Qualities; - if ( alItr->IsReverseStrand() ) { - reverseComplement(seq1); - reverseSequence(qual1); - } - haveSeq1 = true; - haveQual1 = true; - } - else if ( alItr->IsSecondMate() ) { - seq2 = alItr->QueryBases; - qual2 = alItr->Qualities; - if ( alItr->IsReverseStrand() ) { - reverseComplement(seq2); - reverseSequence(qual2); - } - haveSeq2 = true; - haveQual2 = true; - } - ++alItr; - } - - //cerr << alignments.size() << endl; - cout << alignments[0].Qualities << endl; - //seq1 = alignments[0].QueryBases; - //qual1 = alignments[0].Qualities; - //seq2 = alignments[1].QueryBases; - //qual2 = alignments[1].Qualities; - seq1 = "seq1\0"; - seq2 = "seq2\0"; - qual1 = "88###########################################################################88"; - qual2 = "77###########################################################################77"; - - // IT MUST have to do with modifying the alignment itself. -} - - -void FillMateSeqForReadBlock (const vector<BamAlignment> &alignments, - const RefVector &refs, - BamWriter &writer, - const string &seq1, - const string &qual1, - const string &seq2, - const string &qual2) { - // now update each alignment with the appropriate - // sequence for it's mate - cerr << "call\n"; - string joe = qual1; - vector<BamAlignment>::const_iterator aItr = alignments.begin(); - vector<BamAlignment>::const_iterator aEnd = alignments.end(); - int i = 0; - for (; aItr != aEnd; ++aItr) { - - cerr << i << " " << aItr->Name << " [" << qual1 << "]" << "\t[" << qual2 << "]" << "\t" << joe << endl; - //if ( aItr->IsFirstMate() ) { - //aItr->AddBamTag("R2", "Z", seq2); - //aItr->AddBamTag("Q2", "Z", qual2); - //} - //else if ( aItr->IsSecondMate() ) { - //aItr->AddBamTag("R2", "Z", seq1); - //aItr->AddBamTag("Q2", "Z", qual1); - //} - //BamAlignment poop = *aItr; - writer.SaveAlignment(*aItr); - cerr << i << " " << aItr->Name << " [" << qual1 << "]" << "\t[" << qual2 << "]" << "\t" << joe << endl << endl; - i++; - } -} - - -// samtools sort comparison function from bam_sort.c -// used for sorting by query name. -int strnum_cmp(const char *a, const char *b) { - char *pa, *pb; - pa = (char*)a; pb = (char*)b; - while (*pa && *pb) { - if (isdigit(*pa) && isdigit(*pb)) { - long ai, bi; - ai = strtol(pa, &pa, 10); - bi = strtol(pb, &pb, 10); - if (ai != bi) return ai<bi? -1 : ai>bi? 1 : 0; - } else { - if (*pa != *pb) break; - ++pa; ++pb; - } - } - if (*pa == *pb) - return (pa-a) < (pb-b)? -1 : (pa-a) > (pb-b)? 1 : 0; - return *pa<*pb? -1 : *pa>*pb? 1 : 0; -} - - -void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Updates the R2 and Q2 BAM tag to include the mate's seq/qual." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; - -// cerr << "Options: " << endl; - -// cerr << "\t-bedpe\t" << "Write BEDPE format." << endl << endl; - - cerr << "Notes: " << endl; - - cerr << "\tInput BAM must be sorted by read name" << endl << endl; - - // end the program here - exit(1); -} - diff --git a/src/bamToBed/Makefile b/src/bamToBed/Makefile index d9baa942a59ed618f4e502836cf29f4a7defe18b..3409605055b1e4048adf8511cde326a8de488294 100755 --- a/src/bamToBed/Makefile +++ b/src/bamToBed/Makefile @@ -8,14 +8,14 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files # ---------------------------------- SOURCES= bamToBed.cpp OBJECTS= $(SOURCES:.cpp=.o) -_EXT_OBJECTS=BamReader.o BGZF.o +_EXT_OBJECTS=BamReader.o BamAncillary.o BGZF.o EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS)) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) PROGRAM= bamToBed diff --git a/src/bamToBed/bamToBed.cpp b/src/bamToBed/bamToBed.cpp index 684e835eb1833108459861e48e8fac8b24add14f..cb9d09af09771305f4a91ce25480e008954e528f 100755 --- a/src/bamToBed/bamToBed.cpp +++ b/src/bamToBed/bamToBed.cpp @@ -11,7 +11,9 @@ ******************************************************************************/ #include "version.h" #include "BamReader.h" +#include "BamAncillary.h" #include "BamAux.h" +#include "bedFile.h" using namespace BamTools; #include <vector> @@ -34,10 +36,10 @@ using namespace std; void ShowHelp(void); void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, - const bool &writeBed12, const string &color); + const bool &writeBed12, const bool &obeySplits, const string &color); void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance); +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool obeySplits); void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, string color = "255,0,0"); void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance); @@ -61,6 +63,7 @@ int main(int argc, char* argv[]) { bool writeBedPE = false; bool writeBed12 = false; bool useEditDistance = false; + bool obeySplits = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -93,7 +96,10 @@ int main(int argc, char* argv[]) { } else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { writeBed12 = true; - } + } + else if(PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { useEditDistance = true; } @@ -111,19 +117,23 @@ int main(int argc, char* argv[]) { } // make sure we have an input files - if (!haveBam ) { + if (haveBam == false) { cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; showHelp = true; } - if (haveColor && writeBed12 == false) { + if (haveColor == true && writeBed12 == false) { cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; showHelp = true; } + if (useEditDistance == true && obeySplits == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; + showHelp = true; + } // if there are no problems, let's convert BAM to BED or BEDPE if (!showHelp) { if (writeBedPE == false) - ConvertBamToBed(bamFile, useEditDistance, writeBed12, color); // BED or "blocked BED" + ConvertBamToBed(bamFile, useEditDistance, writeBed12, obeySplits, color); // BED or "blocked BED" else ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE } @@ -150,7 +160,9 @@ void ShowHelp(void) { cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; - + + cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; + cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; cerr << "\t\t- Default for BED is to use mapping quality." << endl; cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; @@ -168,7 +180,7 @@ void ShowHelp(void) { void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, - const bool &writeBed12, const string &color) { + const bool &writeBed12, const bool &obeySplits, const string &color) { // open the BAM file BamReader reader; reader.Open(bamFile); @@ -182,7 +194,7 @@ void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, while (reader.GetNextAlignment(bam)) { if (bam.IsMapped() == true) { if (writeBed12 == false) // BED - PrintBed(bam, refs, useEditDistance); + PrintBed(bam, refs, useEditDistance, obeySplits); else //"blocked" BED PrintBed12(bam, refs, useEditDistance, color); } @@ -265,7 +277,7 @@ void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vec } -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance) { +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, bool obeySplits) { // set the strand string strand = "+"; @@ -279,21 +291,39 @@ void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDista // get the unpadded (parm = false) end position based on the CIGAR unsigned int alignmentEnd = bam.GetEndPosition(false); - // report the alignment in BED6 format. - if (useEditDistance == false) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + // report the entire BAM footprint as a single BED entry + if (obeySplits == false) { + // report the alignment in BED6 format. + if (useEditDistance == false) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + } + else { + uint8_t editDistance; + if (bam.GetEditDistance(editDistance)) { + printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), editDistance, strand.c_str()); + } + else { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } } + // Report each chunk of the BAM alignment as a discrete BED entry + // For example 10M100N10M would be reported as two seprate BED entries of length 10 else { - uint8_t editDistance; - if (bam.GetEditDistance(editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s\n", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } + vector<BED> bedBlocks; + // Load the alignment blocks in bam into the bedBlocks vector. + // Don't trigger a new block when a "D" (deletion) CIGAR op is found. + getBamBlocks(bam, refs, bedBlocks, false); + + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, + bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); + } } } diff --git a/src/bedToBam/Makefile b/src/bedToBam/Makefile index f5493cda77fd09a4f311791fa982fc4ccd4910ae..445b60d66474ef49247742673352686bd80b5073 100755 --- a/src/bedToBam/Makefile +++ b/src/bedToBam/Makefile @@ -1,5 +1,5 @@ CXX= g++ -CXXFLAGS= -Wall -O2 +CXXFLAGS= -Wall -O2 -fopenmp LIBS= -lz UTILITIES_DIR = ../utils/ OBJ_DIR = ../../obj/ @@ -8,7 +8,7 @@ BIN_DIR = ../../bin/ # ------------------- # define our includes # ------------------- -INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/genomeFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/BamTools/ -I$(UTILITIES_DIR)/version/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files diff --git a/src/bedToBam/bedToBam.cpp b/src/bedToBam/bedToBam.cpp index 1290e3cf2587a244ded5f650ee23e49f059a1e92..eaab98022388bdbd0d8d434b5628ba21b1792404 100755 --- a/src/bedToBam/bedToBam.cpp +++ b/src/bedToBam/bedToBam.cpp @@ -13,6 +13,8 @@ #include "bedFile.h" #include "genomeFile.h" #include "version.h" +#include <omp.h> + #include "BamWriter.h" #include "BamAux.h" @@ -155,7 +157,7 @@ void ShowHelp(void) { cerr << "Notes: " << endl; - cerr << "\t(1) BED files must be at least BED4 (needs name field)." << endl << endl; + cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; // end the program here @@ -202,8 +204,7 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1 BedLineStatus bedStatus; // open the BED file for reading. bed->Open(); - bedStatus = bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { BamAlignment bamEntry; if (bed->bedType >= 4) { @@ -216,8 +217,10 @@ void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED1 } bedEntry = nullBed; } - bedStatus = bed->GetNextBed(bedEntry, lineNum); } + + + // close up bed->Close(); writer->Close(); diff --git a/src/closestBed/closestBed.cpp b/src/closestBed/closestBed.cpp index 07be358369bdc4f9648d28ae6fde97f00ac87a93..3dd0995e101349d63696332373159c01b70574f6 100755 --- a/src/closestBed/closestBed.cpp +++ b/src/closestBed/closestBed.cpp @@ -41,30 +41,6 @@ BedClosest::~BedClosest(void) { } -/* - reportNullB - - Writes a NULL B entry for cases where no closest BED was found - Works for BED3 - BED6. -*/ -void BedClosest::reportNullB() { - if (_bedB->bedType == 3) { - printf("none\t-1\t-1\n"); - } - else if (_bedB->bedType == 4) { - printf("none\t-1\t-1\t-1\n"); - } - else if (_bedB->bedType == 5) { - printf("none\t-1\t-1\t-1\t-1\n"); - } - else if (_bedB->bedType == 6) { - printf("none\t-1\t-1\t-1\t-1\t-1\n"); - } -} - - - - void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { int slop = 0; // start out just looking for overlaps @@ -72,12 +48,12 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { // update the current feature's start and end - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; int numOverlaps = 0; vector<BED> closestB; float maxOverlap = 0; - int minDistance = 999999999; + CHRPOS minDistance = INT_MAX; if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { @@ -143,7 +119,7 @@ void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { } else { _bedA->reportBedTab(a); - reportNullB(); + _bedB->reportNullBedNewLine(); } if (numOverlaps > 0) { @@ -186,14 +162,12 @@ void BedClosest::FindClosestBed() { _bedA->Open(); // process each entry in A in search of the closest feature in B - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindWindowOverlaps(a, hits); hits.clear(); a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); } diff --git a/src/complementBed/complementBed.cpp b/src/complementBed/complementBed.cpp index 275455b349d5c3191d4df101b9d1fe302852b8de..15043c252ab2cd2ea841b7d05775caf09f1183ac 100755 --- a/src/complementBed/complementBed.cpp +++ b/src/complementBed/complementBed.cpp @@ -40,10 +40,11 @@ void BedComplement::ComplementBed() { string currChrom; // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { currChrom = m->first; - int currChromSize = _genome->getChromSize(currChrom); + CHRPOS currChromSize = _genome->getChromSize(currChrom); // bedList is already sorted by start position. vector<BED> bedList = m->second; @@ -63,26 +64,22 @@ void BedComplement::ComplementBed() { } // mask all of the positions spanned by this BED entry. - for (int b = bIt->start; b < bIt->end; b++) { + for (CHRPOS b = bIt->start; b < bIt->end; b++) chromMasks[b] = 1; - } } - unsigned int i = 0; - unsigned int start; + CHRPOS i = 0; + CHRPOS start; while (i < chromMasks.size()) { if (chromMasks[i] == 0) { start = i; - while ((chromMasks[i] == 0) && (i < chromMasks.size())) { + while ((chromMasks[i] == 0) && (i < chromMasks.size())) i++; - } - if (start > 0) { - cout << currChrom << "\t" << start << "\t" << i << endl; - } - else { - cout << currChrom << "\t" << 0 << "\t" << i << endl; - } + if (start > 0) + cout << currChrom << "\t" << start << "\t" << i << endl; + else + cout << currChrom << "\t" << 0 << "\t" << i << endl; } i++; } diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index 537a2abf35365ef4ccf861095243f6ebd60e5001..84af9e3d1d0bf1a9756ccb81e1fd8d7dc33868c2 100755 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -63,7 +63,7 @@ void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &curr string dna = currDNA.substr(bed.start, ((bed.end - bed.start))); // revcomp if necessary. Thanks to Thomas Doktor. - if ((_useStrand == true) && (bed.strand == '-')) + if ((_useStrand == true) && (bed.strand == "-")) reverseComplement(dna); if (!(_useName)) { diff --git a/src/genomeCoverageBed/genomeCoverageBed.cpp b/src/genomeCoverageBed/genomeCoverageBed.cpp index 6059f3514cb9002b4444290fdbf3cd69315b8a21..db59b51c512e40a98afc8ce0d9bf2dbce2353f45 100755 --- a/src/genomeCoverageBed/genomeCoverageBed.cpp +++ b/src/genomeCoverageBed/genomeCoverageBed.cpp @@ -214,7 +214,7 @@ void BedGenomeCoverage::CoverageBam(string bamFile) { // process the results of the last chromosome. ReportChromCoverage(chromCov, currChromSize, currChrom, chromDepthHist); - if (_eachBase == false && _bedGraph == false) { + if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { ReportGenomeCoverage(chromDepthHist); } diff --git a/src/linksBed/linksBed.cpp b/src/linksBed/linksBed.cpp index 9e6e8fe41a47ad3de286a41edb6a349048986b14..2e4a9116cbca41899d0c41e05ef1a20f63dae34f 100755 --- a/src/linksBed/linksBed.cpp +++ b/src/linksBed/linksBed.cpp @@ -106,14 +106,11 @@ void BedLinks::CreateLinks() { BedLineStatus bedStatus; _bed->Open(); - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - bedEntry.count = 0; WriteURL(bedEntry, base); bedEntry = nullBed; } - bedStatus = _bed->GetNextBed(bedEntry, lineNum); } _bed->Close(); diff --git a/src/mergeBed/mergeBed.cpp b/src/mergeBed/mergeBed.cpp index 6720bf141fcc298ad43b12d91ba63aa7203421ce..870c099e19de6df3e33f5d1c8aa14170c339a17e 100755 --- a/src/mergeBed/mergeBed.cpp +++ b/src/mergeBed/mergeBed.cpp @@ -66,8 +66,8 @@ void BedMerge::MergeBed() { // bedList is already sorted by start position. vector<BED> bedList = m->second; - int minStart = INT_MAX; - int maxEnd = 0; + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. int prev = -1; unsigned int curr = 0; @@ -180,8 +180,9 @@ void BedMerge::MergeBedStranded() { _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries - for (masterBedMapNoBin::iterator m = _bed->bedMapNoBin.begin(); m != _bed->bedMapNoBin.end(); ++m) { - + masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); + masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); + for (; m != mEnd; ++m) { // bedList is already sorted by start position. vector<BED> bedList = m->second; @@ -193,8 +194,8 @@ void BedMerge::MergeBedStranded() { // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { - int minStart = INT_MAX; - int maxEnd = 0; + CHRPOS minStart = INT_MAX; + CHRPOS maxEnd = 0; bool OIP = false; // OIP = Overlap In Progress. Lame, I realize. int prev = -1; unsigned int curr = 0; diff --git a/src/pairToBed/pairToBed.cpp b/src/pairToBed/pairToBed.cpp index e8248c66b227eb66bacf3018480ddfd638ba6c5b..5485f96c707f2dc1c5e7ed531e86f32c0f96fdce 100755 --- a/src/pairToBed/pairToBed.cpp +++ b/src/pairToBed/pairToBed.cpp @@ -257,9 +257,9 @@ void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, con // count of hits on _between_ end of BEDPE // that exceed the requested overlap fraction int numOverlaps = 0; - int spanStart = 0; - int spanEnd = 0; - int spanLength = 0; + CHRPOS spanStart = 0; + CHRPOS spanEnd = 0; + CHRPOS spanLength = 0; if ((type == "ispan") || (type == "notispan")) { spanStart = a.end1; @@ -357,8 +357,7 @@ void BedIntersectPE::IntersectBedPE() { BedLineStatus bedStatus; _bedA->Open(); - bedStatus = _bedA->GetNextBedPE(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { if ( (_searchType == "ispan") || (_searchType == "ospan") || (_searchType == "notispan") || (_searchType == "notospan") ) { @@ -374,7 +373,6 @@ void BedIntersectPE::IntersectBedPE() { } a = nullBedPE; } - bedStatus = _bedA->GetNextBedPE(a, lineNum); } _bedA->Close(); } diff --git a/src/pairToBed/pairToBed.h b/src/pairToBed/pairToBed.h index 479c7bfe718cae0c63fae754c72339982d987483..66a26eb84ebc3a52acfb8963b3c70aa48741753a 100755 --- a/src/pairToBed/pairToBed.h +++ b/src/pairToBed/pairToBed.h @@ -82,8 +82,8 @@ private: // initialize BEDPE variables a.start1 = a.start2 = a.end1 = a.end2 = -1; - a.chrom1 = a.chrom2 = a.strand1 = a.strand2 = "."; - + a.chrom1 = a.chrom2 = "."; + a.strand1 = a.strand2 = '.'; uint8_t editDistance1, editDistance2; editDistance1 = editDistance2 = 0; @@ -95,8 +95,8 @@ private: a.chrom1 = refs.at(bam1.RefID).RefName; a.start1 = bam1.Position; a.end1 = bam1.GetEndPosition(); - a.strand1 = "+"; - if (bam1.IsReverseStrand()) a.strand1 = "-"; + a.strand1 = '+'; + if (bam1.IsReverseStrand()) a.strand1 = '-'; // extract the edit distance from the NM tag // if possible. otherwise, complain. @@ -113,8 +113,8 @@ private: a.chrom2 = refs.at(bam2.RefID).RefName; a.start2 = bam2.Position; a.end2 = bam2.GetEndPosition(); - a.strand2 = "+"; - if (bam2.IsReverseStrand()) a.strand2 = "-"; + a.strand2 = '+'; + if (bam2.IsReverseStrand()) a.strand2 = '-'; // extract the edit distance from the NM tag // if possible. otherwise, complain. diff --git a/src/pairToPair/pairToPair.cpp b/src/pairToPair/pairToPair.cpp index 095b62f17d5f0dbdc8a3927dbf2dc1c9e3bfbeea..45b9d5a670928bdd06067c93b974d410231f1c50 100755 --- a/src/pairToPair/pairToPair.cpp +++ b/src/pairToPair/pairToPair.cpp @@ -47,7 +47,7 @@ void PairToPair::IntersectPairs() { _bedB->loadBedPEFileIntoMap(); int lineNum = 0; - vector<BED> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; + vector<BEDCOV> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2; // reserve some space hitsA1B1.reserve(100); hitsA1B2.reserve(100); hitsA2B1.reserve(100); hitsA2B2.reserve(100); @@ -55,16 +55,15 @@ void PairToPair::IntersectPairs() { BEDPE a, nullBedPE; _bedA->Open(); - bedStatus = _bedA->GetNextBedPE(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { + // identify overlaps b/w the pairs FindOverlaps(a, hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2, _searchType); // reset space for next BEDPE hitsA1B1.clear(); hitsA1B2.clear(); hitsA2B1.clear(); hitsA2B2.clear(); a = nullBedPE; } - bedStatus = _bedA->GetNextBedPE(a, lineNum); } _bedA->Close(); } @@ -72,15 +71,15 @@ void PairToPair::IntersectPairs() { -void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, - vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type) { +void PairToPair::FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, + vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2, string type) { // list of hits on each end of BEDPE // that exceed the requested overlap fraction - vector<BED> qualityHitsA1B1; - vector<BED> qualityHitsA1B2; - vector<BED> qualityHitsA2B1; - vector<BED> qualityHitsA2B2; + vector<BEDCOV> qualityHitsA1B1; + vector<BEDCOV> qualityHitsA1B2; + vector<BEDCOV> qualityHitsA2B1; + vector<BEDCOV> qualityHitsA2B2; // count of hits on each end of BEDPE // that exceed the requested overlap fraction @@ -121,13 +120,13 @@ void PairToPair::FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> -void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, - vector<BED> &qualityHits, int &numOverlaps) { +void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BEDCOV> &hits, + vector<BEDCOV> &qualityHits, int &numOverlaps) { if (end == 1) { - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); + vector<BEDCOV>::const_iterator h = hits.begin(); + vector<BEDCOV>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start1, h->start); int e = min(a.end1, h->end); @@ -142,8 +141,8 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto } else if (end == 2) { - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); + vector<BEDCOV>::const_iterator h = hits.begin(); + vector<BEDCOV>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start2, h->start); int e = min(a.end2, h->end); @@ -157,25 +156,25 @@ void PairToPair::FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vecto } -void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, - const vector<BED> &qualityHitsEnd2, int &matchCount) { +void PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, + const vector<BEDCOV> &qualityHitsEnd2, int &matchCount) { - map<unsigned int, vector<BED>, less<int> > hitsMap; + map<unsigned int, vector<BEDCOV>, less<int> > hitsMap; - for (vector<BED>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { + for (vector<BEDCOV>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) { hitsMap[h->count].push_back(*h); matchCount++; } - for (vector<BED>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { + for (vector<BEDCOV>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) { hitsMap[h->count].push_back(*h); matchCount++; } - for (map<unsigned int, vector<BED>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { + for (map<unsigned int, vector<BEDCOV>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) { if (m->second.size() == 2) { - BED b1 = m->second[0]; - BED b2 = m->second[1]; + BEDCOV b1 = m->second[0]; + BEDCOV b2 = m->second[1]; if (_searchType == "both") { _bedA->reportBedPETab(a); diff --git a/src/pairToPair/pairToPair.h b/src/pairToPair/pairToPair.h index 3dd057d095010335f6517978c35fbff30b94259f..a87ee10b0374501b4cbcc08f2b64eb2b02f75b33 100755 --- a/src/pairToPair/pairToPair.h +++ b/src/pairToPair/pairToPair.h @@ -35,14 +35,14 @@ public: void IntersectPairs(); - void FindOverlaps(const BEDPE &a, vector<BED> &hitsA1B1, vector<BED> &hitsA1B2, - vector<BED> &hitsA2B1, vector<BED> &hitsA2B2, string type); + void FindOverlaps(const BEDPE &a, vector<BEDCOV> &hitsA1B1, vector<BEDCOV> &hitsA1B2, + vector<BEDCOV> &hitsA2B1, vector<BEDCOV> &hitsA2B2, string type); - void FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BED> &hits, - vector<BED> &qualityHits, int &numOverlaps); + void FindQualityHitsBetweenEnds(const BEDPE &a, int end, const vector<BEDCOV> &hits, + vector<BEDCOV> &qualityHits, int &numOverlaps); - void FindHitsOnBothEnds(const BEDPE &a, const vector<BED> &qualityHitsEnd1, - const vector<BED> &qualityHitsEnd2, int &matchCount); + void FindHitsOnBothEnds(const BEDPE &a, const vector<BEDCOV> &qualityHitsEnd1, + const vector<BEDCOV> &qualityHitsEnd2, int &matchCount); private: diff --git a/src/shuffleBed/shuffleBed.cpp b/src/shuffleBed/shuffleBed.cpp index b08fb8d69913a25d07f1cab5506026afad577039..55446e32e93f3ba82c319f8e0ef3576c00e9403a 100755 --- a/src/shuffleBed/shuffleBed.cpp +++ b/src/shuffleBed/shuffleBed.cpp @@ -71,14 +71,12 @@ void BedShuffle::Shuffle() { BedLineStatus bedStatus; _bed->Open(); - bedStatus = _bed->GetNextBed(bedEntry, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { ChooseLocus(bedEntry); _bed->reportBedNewLine(bedEntry); bedEntry = nullBed; } - bedStatus = _bed->GetNextBed(bedEntry, lineNum); } _bed->Close(); } @@ -88,72 +86,76 @@ void BedShuffle::Shuffle() { void BedShuffle::ShuffleWithExclusions() { int lineNum = 0; - BED bedEntry; // used to store the current BED line from the BED file. + BED bedEntry, nullBed; // used to store the current BED line from the BED file. + BedLineStatus bedStatus; vector<BED> hits; hits.reserve(100); _bed->Open(); - while (_bed->GetNextBed(bedEntry, lineNum)) { - - // choose a random locus - ChooseLocus(bedEntry); + while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + // choose a random locus + ChooseLocus(bedEntry); - // test to see if the chosen locus overlaps - // with an exclude region - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); + // test to see if the chosen locus overlaps + // with an exclude region + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, bedEntry.strand, hits, false); - bool haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; /* stop looking. one overlap is enough*/ - } - } + bool haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; /* stop looking. one overlap is enough*/ + } + } - /* - keep looking as long as the chosen - locus happens to overlap with regions - that the user wishes to exclude. - */ - int tries = 0; - while ((haveOverlap == true) && (tries <= MAX_TRIES)) { - - // choose a new locus - ChooseLocus(bedEntry); - - vector<BED> hits; - _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, - bedEntry.strand, hits, false); - - haveOverlap = false; - vector<BED>::const_iterator hitsItr = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; hitsItr != hitsEnd; ++hitsItr) { - - int s = max(bedEntry.start, hitsItr->start); - int e = min(bedEntry.end, hitsItr->end); - - if ( (e - s) > 0) { - haveOverlap = true; - break; /* stop looking. one overlap is enough*/ - } - } - tries++; - } + /* + keep looking as long as the chosen + locus happens to overlap with regions + that the user wishes to exclude. + */ + int tries = 0; + while ((haveOverlap == true) && (tries <= MAX_TRIES)) { + + // choose a new locus + ChooseLocus(bedEntry); + + vector<BED> hits; + _exclude->FindOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end, + bedEntry.strand, hits, false); + + haveOverlap = false; + vector<BED>::const_iterator hitsItr = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; hitsItr != hitsEnd; ++hitsItr) { + + int s = max(bedEntry.start, hitsItr->start); + int e = min(bedEntry.end, hitsItr->end); + + if ( (e - s) > 0) { + haveOverlap = true; + break; // stop looking. one overlap is enough + } + } + tries++; + } - if (tries > MAX_TRIES) { - cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; - } - else { - _bed->reportBedNewLine(bedEntry); - } + if (tries > MAX_TRIES) { + cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl; + } + else { + _bed->reportBedNewLine(bedEntry); + } + } + bedEntry = nullBed; } + _bed->Close(); } @@ -161,13 +163,13 @@ void BedShuffle::ShuffleWithExclusions() { void BedShuffle::ChooseLocus(BED &bedEntry) { string chrom = bedEntry.chrom; - int start = bedEntry.start; - int end = bedEntry.end; - int length = end - start; + CHRPOS start = bedEntry.start; + CHRPOS end = bedEntry.end; + CHRPOS length = end - start; string randomChrom; - int randomStart; - int chromSize; + CHRPOS randomStart; + CHRPOS chromSize; if (_sameChrom == false) { randomChrom = _chroms[rand() % _numChroms]; diff --git a/src/slopBed/slopBed.cpp b/src/slopBed/slopBed.cpp index e70a2711af370166bdf42ce2fbfd444567f60a16..8da78f56cd7c70c173873eb8587bf34c99e07f90 100755 --- a/src/slopBed/slopBed.cpp +++ b/src/slopBed/slopBed.cpp @@ -58,7 +58,7 @@ void BedSlop::AddSlop(BED &bed) { // special handling if the BED entry is on the negative // strand and the user cares about strandedness. - int chromSize = _genome->getChromSize(bed.chrom); + CHRPOS chromSize = _genome->getChromSize(bed.chrom); if ( (_forceStrand) && (bed.strand == "-") ) { // inspect the start diff --git a/src/subtractBed/subtractBed.cpp b/src/subtractBed/subtractBed.cpp index 94b92ba4c35508eee50cd0a8702903ab5816ce9e..588462ad27fa8720ccef3eb5702ad5853ce42565 100755 --- a/src/subtractBed/subtractBed.cpp +++ b/src/subtractBed/subtractBed.cpp @@ -133,11 +133,11 @@ void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { // report the remaining blocks. for (unsigned int i = 0; i < aKeep.size(); ++i) { if (aKeep[i] == true) { - int blockStart = i + a.start; + CHRPOS blockStart = i + a.start; while ((aKeep[i] == true) && (i < aKeep.size())) { i++; } - int blockEnd = i + a.start; + CHRPOS blockEnd = i + a.start; blockEnd = min(a.end, blockEnd); _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); } @@ -162,14 +162,12 @@ void BedSubtract::SubtractBed() { hits.reserve(100); _bedA->Open(); - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindAndSubtractOverlaps(a, hits); hits.clear(); a = nullBed; - } - bedStatus = _bedA->GetNextBed(a, lineNum); + } } _bedA->Close(); diff --git a/src/utils/BamTools/BGZF.cpp b/src/utils/BamTools/BGZF.cpp old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BGZF.h b/src/utils/BamTools/BGZF.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamAncillary.cpp b/src/utils/BamTools/BamAncillary.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b7e149be05d0f40ebd7ed7223f51610ea8757c6c --- /dev/null +++ b/src/utils/BamTools/BamAncillary.cpp @@ -0,0 +1,65 @@ +/***************************************************************************** + bamAncillary.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "BamAncillary.h" +using namespace std; + +// 10 15 20 25 30 4000 +// acccctttggacct---ataggga.................aaaa +// acccc---ggaccttttataggga.................aaaa +// 5M 3D 6M 2I 7M 20N 4M + +namespace BamTools { + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + vector<BED> &blocks, bool includeDeletions) { + + CHRPOS currPosition = bam.Position; + CHRPOS blockStart = bam.Position; + string chrom = refs.at(bam.RefID).RefName; + + vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); + vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); + for ( ; cigItr != cigEnd; ++cigItr ) { + switch (cigItr->Type) { + case 'M': + currPosition += cigItr->Length; + blocks.push_back( BED(chrom, blockStart, currPosition) ); + break; + + case 'S': + case 'P': + case 'H': + case 'I': + // Insertion relative to the reference genome. + // Don't advance the current position, since no new nucleotides are covered. + break; + + case 'D': + if (includeDeletions == true) + currPosition += cigItr->Length; + else { + blocks.push_back( BED(chrom, blockStart, currPosition) ); + currPosition += cigItr->Length; + blockStart = currPosition; + } + case 'N': + currPosition += cigItr->Length; + blockStart = currPosition; + break; + + default: + cerr << "Input error: invalid CIGAR type (" << cigItr->Type + << ") for: " << bam.Name << endl; + exit(1); + } + } + } +} \ No newline at end of file diff --git a/src/utils/BamTools/BamAncillary.h b/src/utils/BamTools/BamAncillary.h new file mode 100755 index 0000000000000000000000000000000000000000..1581efaab9a500c6603e24914d1aa1dbf3f3a6dc --- /dev/null +++ b/src/utils/BamTools/BamAncillary.h @@ -0,0 +1,18 @@ +/***************************************************************************** + bamAncillary.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0+ license. +******************************************************************************/ +#include "bedFile.h" +#include "BamAux.h" + +namespace BamTools { + void getBamBlocks(const BamAlignment &bam, const RefVector &refs, + vector<BED> &blocks, bool includeDeletions = true); +} \ No newline at end of file diff --git a/src/utils/BamTools/BamAux.h b/src/utils/BamTools/BamAux.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamReader.cpp b/src/utils/BamTools/BamReader.cpp old mode 100644 new mode 100755 index a2f975f9659032bed75b5cb280b12a97e97b94df..a4b6d0d4f91d61ab949d33396891935f95cc0786 --- a/src/utils/BamTools/BamReader.cpp +++ b/src/utils/BamTools/BamReader.cpp @@ -48,7 +48,7 @@ struct BamReader::BamReaderPrivate { int64_t AlignmentsBeginOffset; string Filename; string IndexFilename; - + // system data bool IsBigEndian; @@ -614,7 +614,7 @@ void BamReader::BamReaderPrivate::InsertLinearOffset(LinearOffsetVector& offsets const uint64_t& lastOffset) { // get converted offsets - int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; + int beginOffset = bAlignment.Position >> BAM_LIDX_SHIFT; int endOffset = (bAlignment.GetEndPosition() - 1) >> BAM_LIDX_SHIFT; // resize vector if necessary @@ -639,7 +639,7 @@ bool BamReader::BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // read starts after left boundary if ( bAlignment.Position >= CurrentLeft) { return true; } - // return whether alignment end overlaps left boundary + // return whether alignment end overlaps left boundary return ( bAlignment.GetEndPosition() >= CurrentLeft ); } diff --git a/src/utils/BamTools/BamReader.h b/src/utils/BamTools/BamReader.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamWriter.cpp b/src/utils/BamTools/BamWriter.cpp old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/BamWriter.h b/src/utils/BamTools/BamWriter.h old mode 100644 new mode 100755 diff --git a/src/utils/BamTools/Makefile b/src/utils/BamTools/Makefile old mode 100644 new mode 100755 index 1bac7b531d9ef185b566d4931342173708b687d0..d07ad0c830596b4663388b0391fa068dfd0c4295 --- a/src/utils/BamTools/Makefile +++ b/src/utils/BamTools/Makefile @@ -3,11 +3,14 @@ CXXFLAGS = -O2 -Wall LDFLAGS = OBJ_DIR = ../../../obj/ BIN_DIR = ../../../bin/ +UTILITIES_DIR = ../ + +INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/lineFileUtilities/ -I$(UTILITIES_DIR)/gzstream/ # ---------------------------------- # define our source and object files # ---------------------------------- -SOURCES= BamReader.cpp BamWriter.cpp BGZF.cpp +SOURCES= BamReader.cpp BamWriter.cpp BGZF.cpp BamAncillary.cpp OBJECTS= $(SOURCES:.cpp=.o) BUILT_OBJECTS= $(patsubst %,$(OBJ_DIR)/%,$(OBJECTS)) diff --git a/src/utils/bedFile/bedFile.cpp b/src/utils/bedFile/bedFile.cpp index c99dc78f08c27974a4aa00864b1e3fb90f49ff0e..ec2fe6e1a106a96eb2d497c8d08d7784c90579a8 100755 --- a/src/utils/bedFile/bedFile.cpp +++ b/src/utils/bedFile/bedFile.cpp @@ -16,7 +16,7 @@ /*********************************************** Sorting comparison functions ************************************************/ -bool sortByChrom(BED const & a, BED const & b) { +bool sortByChrom(BED const &a, BED const &b) { if (a.chrom < b.chrom) return true; else return false; }; @@ -54,8 +54,7 @@ bool sortByScoreDesc(const BED &a, const BED &b) { else return false; }; - -bool byChromThenStart(BED const & a, BED const & b) { +bool byChromThenStart(BED const &a, BED const &b) { if (a.chrom < b.chrom) return true; else if (a.chrom > b.chrom) return false; @@ -67,27 +66,6 @@ bool byChromThenStart(BED const & a, BED const & b) { }; - -/* - NOTE: Tweaked from kent source. -*/ -int getBin(CHRPOS start, CHRPOS end) { - --end; - start >>= _binFirstShift; - end >>= _binFirstShift; - - for (int i = 0; i < _binLevels; ++i) { - if (start == end) return _binOffsetsExtended[i] + start; - start >>= _binNextShift; - end >>= _binNextShift; - } - cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; - return 0; -} - - - - /******************************************* Class methods *******************************************/ @@ -178,18 +156,18 @@ BedLineStatus BedFile::GetNextBed(BED &bed, int &lineNum) { } -void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand) { +void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand) { - int startBin, endBin; + BIN startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -214,21 +192,21 @@ void BedFile::FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char st } -bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, +bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction) { - int startBin, endBin; - startBin = (start >> _binFirstShift); - endBin = ((end-1) >> _binFirstShift); + BIN startBin, endBin; + startBin = (start >> _binFirstShift); + endBin = ((end-1) >> _binFirstShift); - int aLength = (end - start); + CHRPOS aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -236,10 +214,10 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { - int s = max(start, bedItr->start); - int e = min(end, bedItr->end); + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); // the number of overlapping bases b/w a and b - int overlapBases = (e - s); + CHRPOS overlapBases = (e - s); // do we have sufficient overlap? if ( (float) overlapBases / (float) aLength >= overlapFraction) { @@ -258,21 +236,21 @@ bool BedFile::FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end } -bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, +bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction) { - int startBin, endBin; + BIN startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); - int aLength = (end - start); + CHRPOS aLength = (end - start); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -280,14 +258,14 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, vector<BED>::const_iterator bedItr = bedMap[chrom][j].begin(); vector<BED>::const_iterator bedEnd = bedMap[chrom][j].end(); for (; bedItr != bedEnd; ++bedItr) { - int s = max(start, bedItr->start); - int e = min(end, bedItr->end); + CHRPOS s = max(start, bedItr->start); + CHRPOS e = min(end, bedItr->end); // the number of overlapping bases b/w a and b - int overlapBases = (e - s); + CHRPOS overlapBases = (e - s); // do we have sufficient overlap? if ( (float) overlapBases / (float) aLength >= overlapFraction) { - int bLength = (bedItr->end - bedItr->start); + CHRPOS bLength = (bedItr->end - bedItr->start); float bOverlap = ( (float) overlapBases / (float) bLength ); if ((forceStrand == false) && (bOverlap >= overlapFraction)) { return true; @@ -307,16 +285,16 @@ bool BedFile::FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, void BedFile::countHits(const BED &a, bool forceStrand) { - int startBin, endBin; + BIN startBin, endBin; startBin = (a.start >> _binFirstShift); endBin = ((a.end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < _binLevels; ++i) { + for (BINLEVEL i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = _binOffsetsExtended[i]; - for (int j = (startBin+offset); j <= (endBin+offset); ++j) { + BIN offset = _binOffsetsExtended[i]; + for (BIN j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to @@ -353,6 +331,12 @@ void BedFile::setGff (bool gff) { } +void BedFile::setVcf (bool vcf) { + if (vcf == true) this->_isVcf = true; + else this->_isVcf = false; +} + + void BedFile::loadBedFileIntoMap() { BED bedEntry, nullBed; @@ -362,7 +346,7 @@ void BedFile::loadBedFileIntoMap() { Open(); while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - int bin = getBin(bedEntry.start, bedEntry.end); + BIN bin = getBin(bedEntry.start, bedEntry.end); bedMap[bedEntry.chrom][bin].push_back(bedEntry); bedEntry = nullBed; } @@ -380,7 +364,7 @@ void BedFile::loadBedCovFileIntoMap() { Open(); while ((bedStatus = GetNextBed(bedEntry, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { - int bin = getBin(bedEntry.start, bedEntry.end); + BIN bin = getBin(bedEntry.start, bedEntry.end); BEDCOV bedCov; bedCov.chrom = bedEntry.chrom; diff --git a/src/utils/bedFile/bedFile.h b/src/utils/bedFile/bedFile.h index 36e048c13701e7b7343c779389fc80ea3274cb41..5c67005c8e92eecbdb3aefd8afea3649c00d5f08 100755 --- a/src/utils/bedFile/bedFile.h +++ b/src/utils/bedFile/bedFile.h @@ -32,14 +32,17 @@ using namespace std; // Data type tydedef //************************************************* typedef uint32_t CHRPOS; - +typedef uint16_t BINLEVEL; +typedef uint32_t BIN; +typedef uint16_t USHORT; +typedef uint16_t UINT; //************************************************* // Genome binning constants //************************************************* -const int _numBins = 37450; -const int _binLevels = 7; +const BIN _numBins = 37450; +const BINLEVEL _binLevels = 7; // bins range in size from 16kb to 512Mb // Bin 0 spans 512Mbp, # Level 1 @@ -48,11 +51,10 @@ const int _binLevels = 7; // Bins 73-584 span 1Mbp # Level 4 // Bins 585-4680 span 128Kbp # Level 5 // Bins 4681-37449 span 16Kbp # Level 6 -const int _binOffsetsExtended[] = - {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; +const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; -const int _binFirstShift = 14; /* How much to shift to get to finest bin. */ -const int _binNextShift = 3; /* How much to shift to get to next larger bin. */ +const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ +const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ //************************************************* @@ -60,29 +62,57 @@ const int _binNextShift = 3; /* How much to shift to get to next larger bin. * //************************************************* struct DEPTH { - unsigned int starts; - unsigned int ends; + UINT starts; + UINT ends; }; - /* Structure for regular BED records */ struct BED { // Regular BED fields + string chrom; CHRPOS start; CHRPOS end; - - string chrom; string name; string score; - char strand; + string strand; // Add'l fields for BED12 and/or custom BED annotations vector<string> otherFields; -}; + +public: + // constructors + + // Null + BED() + : chrom(""), + start(0), + end(0), + name(""), + score(""), + strand(""), + otherFields() + {} + + // BED3 + BED(string chrom, CHRPOS start, CHRPOS end) + : chrom(chrom), + start(start), + end(end) + {} + + // BED4 + BED(string chrom, CHRPOS start, CHRPOS end, string strand) + : chrom(chrom), + start(start), + end(end), + strand(strand) + {} +}; // BED + /* @@ -97,7 +127,7 @@ struct BEDCOV { string chrom; string name; string score; - char strand; + string strand; // Add'l fields for BED12 and/or custom BED annotations vector<string> otherFields; @@ -121,7 +151,19 @@ enum BedLineStatus // return the genome "bin" for a feature with this start and end inline -int getBin(CHRPOS start, CHRPOS end); +BIN getBin(CHRPOS start, CHRPOS end) { + --end; + start >>= _binFirstShift; + end >>= _binFirstShift; + + for (register short i = 0; i < _binLevels; ++i) { + if (start == end) return _binOffsetsExtended[i] + start; + start >>= _binNextShift; + end >>= _binNextShift; + } + cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; + return 0; +} // return the amount of overlap between two features. Negative if none and the the @@ -163,8 +205,8 @@ bool byChromThenStart(BED const &a, BED const &b); typedef vector<BED> bedVector; typedef vector<BEDCOV> bedCovVector; -typedef map<int, bedVector, std::less<int> > binsToBeds; -typedef map<int, bedCovVector, std::less<int> > binsToBedCovs; +typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; +typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; typedef map<string, binsToBeds, std::less<string> > masterBedMap; typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; @@ -217,14 +259,14 @@ public: // search for all overlapping features in another BED file. // Searches through each relevant genome bin on the same chromosome // as the single feature. Note: Adapted from kent source "binKeeperFind" - void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, vector<BED> &hits, bool forceStrand); + void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); // return true if at least one overlap was found. otherwise, return false. - bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, + bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction = 0.0); // return true if at least one __reciprocal__ overlap was found. otherwise, return false. - bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, char strand, + bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, bool forceStrand, float overlapFraction = 0.0); // Given a chrom, start, end and strand for a single feature, @@ -246,9 +288,11 @@ private: // data bool _isGff; + bool _isVcf; istream *_bedStream; void setGff (bool isGff); + void setVcf (bool isVcf); /****************************************************** @@ -284,6 +328,12 @@ private: if (parseBedLine(bed, lineVector, lineNum) == true) return BED_VALID; } + else if (p2End != lineVector[1].c_str()) { + setGff(false); + setVcf(true); + if (parseVcfLine(bed, lineVector, lineNum) == true) + return BED_VALID; + } else if (lineVector.size() == 9) { // otherwise test if columns 4 and 5 are integers. If so, assume GFF. l4 = strtol(lineVector[3].c_str(), &p4End, 10); @@ -339,12 +389,12 @@ private: else if (this->bedType == 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; } else if (this->bedType > 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { bed.otherFields.push_back(lineVector[i]); } @@ -383,12 +433,12 @@ private: else if (this->bedType == 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; } else if (this->bedType > 6) { bed.name = lineVector[3]; bed.score = lineVector[4]; - bed.strand = lineVector[5][0]; + bed.strand = lineVector[5]; for (unsigned int i = 6; i < lineVector.size(); ++i) { bed.otherFields.push_back(lineVector[i]); } @@ -425,6 +475,80 @@ private: } + /* + parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum) { + + if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + bed.end = atoi(lineVector[1].c_str()); // TO-DO: make this the size of the variant. + bed.strand = "+"; + + if (this->bedType > 2) { + for (unsigned int i = 2; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if ((lineNum == 1) && (lineVector.size() >= 6)) { + this->bedType = lineVector.size(); + + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + bed.end = atoi(lineVector[1].c_str()); // TO-DO: make this the size of the variant. + bed.strand = "+"; + + if (this->bedType > 2) { + for (unsigned int i = 2; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + + if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if (lineVector.size() == 1) { + cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { + cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((lineVector.size() < 2) && (lineVector.size() != 0)) { + cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + + /* parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) */ @@ -552,12 +676,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -567,9 +691,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -597,12 +721,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.start, bed.end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -613,9 +737,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), bed.start+1, bed.end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -644,12 +768,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -659,9 +783,9 @@ public: } } else if (this->bedType == 9) { - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } @@ -690,12 +814,12 @@ public: bed.score.c_str()); } else if (this->bedType == 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\n", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); } else if (this->bedType > 6) { - printf ("%s\t%d\t%d\t%s\t%s\t%c\t", bed.chrom.c_str(), start, end, bed.name.c_str(), - bed.score.c_str(), bed.strand); + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); vector<string>::const_iterator othIt = bed.otherFields.begin(); vector<string>::const_iterator othEnd = bed.otherFields.end(); @@ -706,9 +830,9 @@ public: } } else if (this->bedType == 9) { // add 1 to the start for GFF - printf ("%s\t%s\t%s\t%d\t%d\t%s\t%c\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), bed.name.c_str(), start+1, end, - bed.score.c_str(), bed.strand, + bed.score.c_str(), bed.strand.c_str(), bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); } } diff --git a/src/utils/bedFilePE/bedFilePE.cpp b/src/utils/bedFilePE/bedFilePE.cpp index 27c7a3cb4210413990cdaf6b8dfbb74cd1b42cc2..20a6ad1fbcf2d5d2d664954ff3070d41ce9b9485 100755 --- a/src/utils/bedFilePE/bedFilePE.cpp +++ b/src/utils/bedFilePE/bedFilePE.cpp @@ -174,7 +174,7 @@ void BedFilePE::reportBedPENewLine(const BEDPE &a) { a.chrom2.c_str(), a.start2, a.end2, a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); - vector<string>::const_iterator othIt = a.otherFields.begin(); + vector<string>::const_iterator othIt = a.otherFields.begin(); vector<string>::const_iterator othEnd = a.otherFields.end(); for ( ; othIt != othEnd; ++othIt) { printf("%s\t", othIt->c_str()); @@ -421,24 +421,24 @@ bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, co /* Adapted from kent source "binKeeperFind" */ -void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, vector<BED> &hits, bool forceStrand) { +void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<BEDCOV> &hits, bool forceStrand) { int startBin, endBin; startBin = (start >> _binFirstShift); endBin = ((end-1) >> _binFirstShift); // loop through each bin "level" in the binning hierarchy - for (int i = 0; i < 6; ++i) { + for (int i = 0; i < _binLevels; ++i) { // loop through each bin at this level of the hierarchy - int offset = binOffsetsExtended[i]; + int offset = _binOffsetsExtended[i]; for (int j = (startBin+offset); j <= (endBin+offset); ++j) { // loop through each feature in this chrom/bin and see if it overlaps // with the feature that was passed in. if so, add the feature to // the list of hits. - vector<BED>::const_iterator bedItr; - vector<BED>::const_iterator bedEnd; + vector<BEDCOV>::const_iterator bedItr; + vector<BEDCOV>::const_iterator bedEnd; if (bEnd == 1) { bedItr = bedMapEnd1[chrom][j].begin(); bedEnd = bedMapEnd1[chrom][j].end(); @@ -480,7 +480,7 @@ void BedFilePE::loadBedPEFileIntoMap() { while (bedStatus != BED_INVALID) { if (bedStatus == BED_VALID) { - BED bedEntry1, bedEntry2; + BEDCOV bedEntry1, bedEntry2; // separate the BEDPE entry into separate // BED entries splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); @@ -501,7 +501,7 @@ void BedFilePE::loadBedPEFileIntoMap() { } -void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BED &bedEntry1, BED &bedEntry2) { +void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2) { /* Split the BEDPE entry into separate BED entries diff --git a/src/utils/bedFilePE/bedFilePE.h b/src/utils/bedFilePE/bedFilePE.h index 82750752694f4f199d28a504f2ce9f28367d1e00..0e6be51bed21dee73192437f9bb88e6c2ebdf1e8 100755 --- a/src/utils/bedFilePE/bedFilePE.h +++ b/src/utils/bedFilePE/bedFilePE.h @@ -22,12 +22,12 @@ struct BEDPE { // UCSC BED fields string chrom1; - int start1; - int end1; + CHRPOS start1; + CHRPOS end1; string chrom2; - int start2; - int end2; + CHRPOS start2; + CHRPOS end2; string name; string score; @@ -69,18 +69,18 @@ public: void reportBedPETab(const BEDPE &a); void reportBedPENewLine(const BEDPE &a); void loadBedPEFileIntoMap(); - void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BED &bedEntry1, BED &bedEntry2); + void splitBedPEIntoBeds(const BEDPE &a, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2); - void FindOverlapsPerBin(int bEnd, string chrom, int start, int end, string strand, - vector<BED> &hits, bool forceStrand); + void FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, + vector<BEDCOV> &hits, bool forceStrand); string bedFile; unsigned int bedType; - masterBedMap bedMapEnd1; - masterBedMap bedMapEnd2; + masterBedCovMap bedMapEnd1; + masterBedCovMap bedMapEnd2; private: istream *_bedStream; diff --git a/src/utils/gzstream/COPYING.LIB b/src/utils/gzstream/COPYING.LIB old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/Makefile b/src/utils/gzstream/Makefile old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/README b/src/utils/gzstream/README old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.C b/src/utils/gzstream/gzstream.C old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.h b/src/utils/gzstream/gzstream.h old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/gzstream.o b/src/utils/gzstream/gzstream.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/test_gunzip.o b/src/utils/gzstream/test_gunzip.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/test_gzip.o b/src/utils/gzstream/test_gzip.o old mode 100644 new mode 100755 diff --git a/src/utils/gzstream/version b/src/utils/gzstream/version old mode 100644 new mode 100755 diff --git a/src/utils/version/version.h b/src/utils/version/version.h old mode 100644 new mode 100755 diff --git a/src/windowBed/windowBed.cpp b/src/windowBed/windowBed.cpp index 92938c6d66d2f185587bc8c8b1b8392edc2a7978..e1809f793d8ceb591395c78a9e81e319c88ad954 100755 --- a/src/windowBed/windowBed.cpp +++ b/src/windowBed/windowBed.cpp @@ -70,8 +70,8 @@ void BedWindow::FindWindowOverlaps(const BED &a, vector<BED> &hits) { // update the current feature's start and end // according to the slop requested (slop = 0 by default) - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; AddWindow(a, aFudgeStart, aFudgeEnd); /* @@ -118,8 +118,8 @@ bool BedWindow::FindOneOrMoreWindowOverlaps(const BED &a) { // update the current feature's start and end // according to the slop requested (slop = 0 by default) - int aFudgeStart = 0; - int aFudgeEnd; + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; AddWindow(a, aFudgeStart, aFudgeEnd); bool overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _matchOnStrand); @@ -133,21 +133,19 @@ void BedWindow::WindowIntersectBed() { // that we can easily compare "A" to it for overlaps _bedB->loadBedFileIntoMap(); - BED a; + BED a, nullBed; int lineNum = 0; // current input line number BedLineStatus bedStatus; vector<BED> hits; // vector of potential hits hits.reserve(100); _bedA->Open(); - // process each entry in A - bedStatus = _bedA->GetNextBed(a, lineNum); - while (bedStatus != BED_INVALID) { + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindWindowOverlaps(a, hits); hits.clear(); + a = nullBed; } - bedStatus = _bedA->GetNextBed(a, lineNum); } _bedA->Close(); } @@ -196,7 +194,7 @@ void BedWindow::WindowIntersectBam(string bamFile) { if (bam.IsSecondMate()) a.name += "/2"; a.score = ToString(bam.MapQuality); - a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; + a.strand = '+'; if (bam.IsReverseStrand()) a.strand = '-'; if (_bamOutput == true) { overlapsFound = FindOneOrMoreWindowOverlaps(a); @@ -224,7 +222,7 @@ void BedWindow::WindowIntersectBam(string bamFile) { } -void BedWindow::AddWindow(const BED &a, int &fudgeStart, int &fudgeEnd) { +void BedWindow::AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd) { // Does the user want to treat the windows based on strand? // If so, // if "+", then left is left and right is right diff --git a/src/windowBed/windowBed.h b/src/windowBed/windowBed.h index 4d9b3bbb61aa8633e9f8111aad2cd24cd8e9ebaf..b1af91110a57df61b730f24ccaa0b023a9ae03b5 100755 --- a/src/windowBed/windowBed.h +++ b/src/windowBed/windowBed.h @@ -60,7 +60,7 @@ private: void WindowIntersectBam(string bamFile); void FindWindowOverlaps(const BED &a, vector<BED> &hits); bool FindOneOrMoreWindowOverlaps(const BED &a); - void AddWindow(const BED &a, int &fudgeStart, int &fudgeEnd); + void AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd); }; #endif /* WINDOWBED_H */