diff --git a/docs/content/tools/cluster.rst b/docs/content/tools/cluster.rst index 0ab1fd7e0841660b31ad1af94ece0f84b20faec9..9f4f6f370f5d4951976d247a6f4f0fe618bf54be 100644 --- a/docs/content/tools/cluster.rst +++ b/docs/content/tools/cluster.rst @@ -19,7 +19,7 @@ a single interval file are combined. .. note:: ``bedtools cluster`` requires that you presort your data by chromosome and - then by start position (e.g., ``sort k1,1 -k2,2n in.bed > in.sorted.bed`` + then by start position (e.g., ``sort -k1,1 -k2,2n in.bed > in.sorted.bed`` for BED files). .. seealso:: diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index d9bfd055f0ac2f02a0bbbff17710f87788571eb9..469399450a06904b0628680371e958623e5d6af2 100644 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -17,14 +17,15 @@ Bed2Fa::Bed2Fa(bool useName, const string &dbFile, const string &bedFile, const string &fastaOutFile, bool useFasta, bool useStrand, - bool useBlocks) : + bool useBlocks, bool useFullHeader) : _useName(useName), _dbFile(dbFile), _bedFile(bedFile), _fastaOutFile(fastaOutFile), _useFasta(useFasta), _useStrand(useStrand), - _useBlocks(useBlocks) + _useBlocks(useBlocks), + _useFullHeader(useFullHeader) { _bed = new BedFile(_bedFile); @@ -110,7 +111,7 @@ void Bed2Fa::ReportDNA(const BED &bed, string &dna) { //****************************************************************************** void Bed2Fa::ExtractDNA() { - /* Make sure that we can oen all of the files successfully*/ + /* Make sure that we can open all of the files successfully*/ // open the fasta database for reading ifstream faDb(_dbFile.c_str(), ios::in); @@ -124,7 +125,7 @@ void Bed2Fa::ExtractDNA() { // open and memory-map genome file FastaReference *fr = new FastaReference; bool memmap = true; - fr->open(_dbFile, memmap); + fr->open(_dbFile, memmap, _useFullHeader); BED bed, nullBed; string sequence; diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h index 76fab656d396678cc84b7685e6ab6de3858f6908..a8ff65eea1495d8f71af79a159248f2446a60784 100644 --- a/src/fastaFromBed/fastaFromBed.h +++ b/src/fastaFromBed/fastaFromBed.h @@ -33,7 +33,7 @@ public: Bed2Fa(bool useName, const string &dbFile, const string &bedFile, const string &fastaOutFile, bool useFasta, bool useStrand, - bool useBlocks); + bool useBlocks, bool useFullHeader); // destructor ~Bed2Fa(void); @@ -52,6 +52,7 @@ private: bool _useStrand; // should the extracted sequence obey strandedness? bool _useBlocks; // should the extracted sequence obey BED blocks // (for example, exons?) + bool _useFullHeader; // instance of a bed file class. BedFile *_bed; diff --git a/src/fastaFromBed/fastaFromBedMain.cpp b/src/fastaFromBed/fastaFromBedMain.cpp index 77ba7d8d0714c7e9d6fb51c05e2b2c885af70bf6..9bceb3d9937b7d26ad5a005b2107e289de863d70 100644 --- a/src/fastaFromBed/fastaFromBedMain.cpp +++ b/src/fastaFromBed/fastaFromBedMain.cpp @@ -44,6 +44,7 @@ int fastafrombed_main(int argc, char* argv[]) { bool useFasta = true; bool useStrand = false; bool useBlocks = false; + bool useFullHeader = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -97,6 +98,9 @@ int fastafrombed_main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-s", 2, parameterLength)) { useStrand = true; } + else if(PARAMETER_CHECK("-fullHeader", 11, parameterLength)) { + useFullHeader = true; + } else { cerr << "*****ERROR: Unrecognized parameter: " << argv[i] @@ -115,7 +119,7 @@ int fastafrombed_main(int argc, char* argv[]) { Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta, useStrand, - useBlocks); + useBlocks, useFullHeader); delete b2f; } else { @@ -148,6 +152,9 @@ void fastafrombed_help(void) { << endl; cerr << "\t\tstrand, the sequence will be reverse complemented." << endl; cerr << "\t\t- By default, strand information is ignored." << endl << endl; + cerr << "\t-fullHeader\tUse full fasta header." << endl; + cerr << "\t\t- By default, only the word before the first space or tab " + << "is used." << endl << endl; // end the program here exit(1); diff --git a/src/intersectFile/intersectFile.cpp b/src/intersectFile/intersectFile.cpp index 7eb46d89f0a268a33b59612e93820c7ce1363024..5c78887425de2e8a7712aac3ba69e41db86a56ac 100644 --- a/src/intersectFile/intersectFile.cpp +++ b/src/intersectFile/intersectFile.cpp @@ -23,9 +23,12 @@ FileIntersect::FileIntersect(ContextIntersect *context) _blockMgr(NULL), _recordOutputMgr(NULL) { - _blockMgr = new BlockMgr(); - _blockMgr->setContext(context); _recordOutputMgr = new RecordOutputMgr(); + if (_context->getObeySplits()) { + _blockMgr = new BlockMgr(); + _blockMgr->setContext(context); + _recordOutputMgr->setSplitInfo(_blockMgr); + } } FileIntersect::~FileIntersect(void) { diff --git a/src/nucBed/nucBed.cpp b/src/nucBed/nucBed.cpp index 446f6a6b52faef275bb93fd4a79a479acc14bb84..9b515d277b938fd58430533b4aff8d1d6e5bd89d 100644 --- a/src/nucBed/nucBed.cpp +++ b/src/nucBed/nucBed.cpp @@ -15,16 +15,17 @@ NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern, - bool forceStrand, bool ignoreCase) + bool forceStrand, bool ignoreCase, bool useFullHeader) { - _dbFile = dbFile; - _bedFile = bedFile; - _printSeq = printSeq; - _hasPattern = hasPattern; - _pattern = pattern; - _forceStrand = forceStrand; - _ignoreCase = ignoreCase; + _dbFile = dbFile; + _bedFile = bedFile; + _printSeq = printSeq; + _hasPattern = hasPattern; + _pattern = pattern; + _forceStrand = forceStrand; + _ignoreCase = ignoreCase; + _useFullHeader = useFullHeader; _bed = new BedFile(_bedFile); // Compute the DNA content in each BED/GFF/VCF interval @@ -109,7 +110,7 @@ void NucBed::ProfileDNA() { // open and memory-map genome file FastaReference fr; bool memmap = true; - fr.open(_dbFile, memmap); + fr.open(_dbFile, memmap, _useFullHeader); bool headerReported = false; BED bed; diff --git a/src/nucBed/nucBed.h b/src/nucBed/nucBed.h index ba45af4f4a2a3e5d12a3465b375c6b27394222a1..dc82f3e24037ae6d35c8eb35ca51875e169c3c69 100644 --- a/src/nucBed/nucBed.h +++ b/src/nucBed/nucBed.h @@ -31,7 +31,7 @@ public: // constructor NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern, - bool forceStrand, bool ignoreCase); + bool forceStrand, bool ignoreCase, bool useFullHeader); // destructor ~NucBed(void); @@ -46,6 +46,7 @@ private: string _pattern; bool _forceStrand; bool _ignoreCase; + bool _useFullHeader; // instance of a bed file class. BedFile *_bed; diff --git a/src/nucBed/nucBedMain.cpp b/src/nucBed/nucBedMain.cpp index eff8bec36687e653c90398aba1e51e0850c57dd5..f8cbc35a8b4fe395a75be6fe33b172623ee9ef2d 100644 --- a/src/nucBed/nucBedMain.cpp +++ b/src/nucBed/nucBedMain.cpp @@ -35,12 +35,13 @@ int nuc_main(int argc, char* argv[]) { string pattern; // checks for existence of parameters - bool haveFastaDb = false; - bool haveBed = false; - bool printSeq = false; - bool hasPattern = false; - bool forceStrand = false; - bool ignoreCase = false; + bool haveFastaDb = false; + bool haveBed = false; + bool printSeq = false; + bool hasPattern = false; + bool forceStrand = false; + bool ignoreCase = false; + bool useFullHeader = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -90,6 +91,9 @@ int nuc_main(int argc, char* argv[]) { pattern = argv[i + 1]; i++; } + } + else if(PARAMETER_CHECK("-useFullHeader", 11, parameterLength)) { + useFullHeader = true; } else { cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; @@ -104,7 +108,8 @@ int nuc_main(int argc, char* argv[]) { if (!showHelp) { NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, - hasPattern, pattern, forceStrand, ignoreCase); + hasPattern, pattern, forceStrand, ignoreCase, + useFullHeader); delete nuc; } else { @@ -134,8 +139,12 @@ void nuc_help(void) { cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl; cerr << "\t\t\tis observed (case-sensitive)." << endl << endl; - cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl; - + cerr << "\t-C\tIgnore case when matching -pattern. By defaulty, case matters." << endl << endl; + + cerr << "\t-fullHeader\tUse full fasta header." << endl; + cerr << "\t\t- By default, only the word before the first space or tab " + << "is used." << endl << endl; + cerr << "Output format: " << endl; cerr << "\tThe following information will be reported after each BED entry:" << endl; cerr << "\t 1) %AT content" << endl; diff --git a/src/utils/Fasta/Fasta.cpp b/src/utils/Fasta/Fasta.cpp index bedf311d20439880e10083c59b22a15b0838c2ea..076f06d2a1b99352e00e5d7d45878ed0760dbbe7 100644 --- a/src/utils/Fasta/Fasta.cpp +++ b/src/utils/Fasta/Fasta.cpp @@ -8,12 +8,13 @@ #include "Fasta.h" -FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len) +FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len, bool useFullHeader) : name(name) , length(length) , offset(offset) , line_blen(line_blen) , line_len(line_len) + , useFullHeader(useFullHeader) {} FastaIndexEntry::FastaIndexEntry(void) // empty constructor @@ -30,16 +31,19 @@ void FastaIndexEntry::clear(void) // check if we have already recorded a real offset line_blen = NULL; line_len = NULL; + useFullHeader = false; } ostream& operator<<(ostream& output, const FastaIndexEntry& e) { // just write the first component of the name, for compliance with other tools - output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" << - e.line_blen << "\t" << e.line_len; + output << (e.useFullHeader? e.name : split(e.name, ' ').at(0)) + << "\t" << e.length << "\t" << e.offset << "\t" + << e.line_blen << "\t" << e.line_len; return output; // for multiple << operators. } -FastaIndex::FastaIndex(void) +FastaIndex::FastaIndex(bool useFullHeader) + : useFullHeader(useFullHeader) {} void FastaIndex::readIndexFile(string fname) { @@ -55,12 +59,14 @@ void FastaIndex::readIndexFile(string fname) { if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file // note that fields[0] is the sequence name char* end; - string name = split(fields[0], " \t").at(0); // key by first token of name + string name = useFullHeader ? fields[0] : + split(fields[0], " \t").at(0); // key by first token of name sequenceNames.push_back(name); this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()), - strtoll(fields[2].c_str(), &end, 10), - atoi(fields[3].c_str()), - atoi(fields[4].c_str())))); + strtoll(fields[2].c_str(), &end, 10), + atoi(fields[3].c_str()), + atoi(fields[4].c_str()), + useFullHeader))); } else { cerr << "Warning: malformed fasta index file " << fname << "does not have enough fields @ line " << linenum << endl; @@ -181,12 +187,13 @@ void FastaIndex::indexReference(string refname) { } void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) { - string name = split(entry.name, " \t").at(0); // key by first token of name + string name = useFullHeader? entry.name : + split(entry.name, " \t").at(0); // key by first token of name sequenceNames.push_back(name); this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length, - entry.offset, entry.line_blen, - entry.line_len))); - + entry.offset, entry.line_blen, + entry.line_len, + useFullHeader))); } void FastaIndex::writeIndexFile(string fname) { @@ -225,13 +232,15 @@ bool FastaIndex::chromFound(string name) { string FastaIndex::indexFileExtension() { return ".fai"; } -void FastaReference::open(string reffilename, bool usemmap) { +void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) { filename = reffilename; if (!(file = fopen(filename.c_str(), "r"))) { cerr << "could not open " << filename << endl; exit(1); } - index = new FastaIndex(); + if (useFullHeader) + usingfullheader = true; + index = new FastaIndex(useFullHeader); struct stat stFileInfo; string indexFileName = filename + index->indexFileExtension(); // if we can find an index file, use it diff --git a/src/utils/Fasta/Fasta.h b/src/utils/Fasta/Fasta.h index 68a8bf80c58379bffe88b9980876d6c35e117476..f9d28208d5d31f6d1092cc5245e10c7ae8da6617 100644 --- a/src/utils/Fasta/Fasta.h +++ b/src/utils/Fasta/Fasta.h @@ -29,7 +29,8 @@ using namespace std; class FastaIndexEntry { friend ostream& operator<<(ostream& output, const FastaIndexEntry& e); public: - FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len); + FastaIndexEntry(string name, int length, long long offset, + int line_blen, int line_len, bool useFullHeader); FastaIndexEntry(void); ~FastaIndexEntry(void); string name; // sequence name @@ -37,14 +38,16 @@ class FastaIndexEntry { long long offset; // bytes offset of sequence from start of file int line_blen; // line length in bytes, sequence characters int line_len; // line length including newline + bool useFullHeader; void clear(void); }; class FastaIndex : public map<string, FastaIndexEntry> { friend ostream& operator<<(ostream& output, FastaIndex& i); public: - FastaIndex(void); + FastaIndex(bool useFullHeader); ~FastaIndex(void); + bool useFullHeader; vector<string> sequenceNames; void indexReference(string refName); void readIndexFile(string fname); @@ -58,10 +61,12 @@ class FastaIndex : public map<string, FastaIndexEntry> { class FastaReference { public: - void open(string reffilename, bool usemmap = false); + void open(string reffilename, bool usemmap = false, + bool useFullHeader = false); bool usingmmap; string filename; - FastaReference(void) : usingmmap(false) { } + bool usingfullheader; + FastaReference(void) : usingmmap(false), usingfullheader(false) { } ~FastaReference(void); FILE* file; void* filemm; diff --git a/src/utils/FileRecordTools/FileRecordMgr.cpp b/src/utils/FileRecordTools/FileRecordMgr.cpp index 3230f75a0f34db7b73f71967ed8efc7ecb8ef481..16ebf006d9eac5054940b4147e18833f6e66ff1e 100644 --- a/src/utils/FileRecordTools/FileRecordMgr.cpp +++ b/src/utils/FileRecordTools/FileRecordMgr.cpp @@ -127,9 +127,11 @@ Record *FileRecordMgr::allocateAndGetNextRecord() _recordMgr->deleteRecord(record); return NULL; } + // In the rare case of Bam records where both the read and it's mate failed to map, - // Ignore the record. Delete and return null - if (!(record->isUnmapped() && record->isMateUnmapped())) { + // Ignore the record. Delete and return null. + + if (!(record->isUnmapped() )) { if (!record->coordsValid()) { cerr << "Error: Invalid record in file " << _filename << ". Record is " << endl << *record << endl; exit(1); diff --git a/src/utils/FileRecordTools/RecordOutputMgr.cpp b/src/utils/FileRecordTools/RecordOutputMgr.cpp index 57fc8ac4df72cea1f6f9184ef66d4d52627e3d2d..b24522ebc21163edc62d360f231e816c77326d59 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.cpp +++ b/src/utils/FileRecordTools/RecordOutputMgr.cpp @@ -29,10 +29,10 @@ RecordOutputMgr::RecordOutputMgr() : _context(NULL), _printable(true), _bamWriter(NULL), - _currBlockList(NULL), - _blockMgr(NULL) + _currBamBlockList(NULL), + _bamBlockMgr(NULL) { - _blockMgr = new BlockMgr(); + _bamBlockMgr = new BlockMgr(); } RecordOutputMgr::~RecordOutputMgr() @@ -45,8 +45,8 @@ RecordOutputMgr::~RecordOutputMgr() delete _bamWriter; _bamWriter = NULL; } - delete _blockMgr; - _blockMgr = NULL; + delete _bamBlockMgr; + _bamBlockMgr = NULL; } @@ -102,7 +102,7 @@ RecordOutputMgr::printBamType RecordOutputMgr::printBamRecord(RecordKeyList &key if (record->isUnmapped()) { record->printUnmapped(_outBuf); } else { - static_cast<const BamRecord *>(record)->print(_outBuf, _currBlockList); + static_cast<const BamRecord *>(record)->print(_outBuf, _currBamBlockList); } } return BAM_AS_BED; @@ -132,10 +132,10 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList) { if (keyList.getKey()->getType() == FileRecordTypeChecker::BAM_RECORD_TYPE) { RecordKeyList blockList(keyList.getKey()); bool deleteBlocks = false; - _blockMgr->getBlocks(blockList, deleteBlocks); + _bamBlockMgr->getBlocks(blockList, deleteBlocks); printRecord(keyList, &blockList); if (deleteBlocks) { - _blockMgr->deleteBlocks(blockList); + _bamBlockMgr->deleteBlocks(blockList); } return; } @@ -153,7 +153,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi checkForHeader(); } const_cast<Record *>(keyList.getKey())->undoZeroLength(); - _currBlockList = blockList; + _currBamBlockList = blockList; if (_context->getProgram() == ContextBase::INTERSECT) { if (_printable) { @@ -161,7 +161,7 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi if ((static_cast<ContextIntersect *>(_context))->getWriteAllOverlap()) { // -wao the user wants to force the reporting of 0 overlap if (printKeyAndTerminate(keyList)) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } tab(); @@ -173,34 +173,36 @@ void RecordOutputMgr::printRecord(RecordKeyList &keyList, RecordKeyList *blockLi } else if ((static_cast<ContextIntersect *>(_context))->getLeftJoin()) { if (printKeyAndTerminate(keyList)) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } tab(); null(false, true); newline(); if (needsFlush()) flush(); - _currBlockList = NULL; + _currBamBlockList = NULL; return; } } else { if (printBamRecord(keyList, true) == BAM_AS_BAM) { - _currBlockList = NULL; + _currBamBlockList = NULL; return; } + int hitIdx = 0; for (RecordKeyList::const_iterator_type iter = keyList.begin(); iter != keyList.end(); iter = keyList.next()) { - reportOverlapDetail(keyList.getKey(), iter->value()); + reportOverlapDetail(keyList.getKey(), iter->value(), hitIdx); + hitIdx++; } } } else { // not printable reportOverlapSummary(keyList); } - _currBlockList = NULL; + _currBamBlockList = NULL; } else if (_context->getProgram() == ContextBase::SAMPLE) { if (!printKeyAndTerminate(keyList)) { newline(); } - _currBlockList = NULL; + _currBamBlockList = NULL; return; } else if (_context->getProgram() == ContextBase::MAP) { if (!printKeyAndTerminate(keyList)) { @@ -226,7 +228,7 @@ void RecordOutputMgr::checkForHeader() { if (needsFlush()) flush(); } -void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord) +void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx) { //get the max start and min end as strings. const_cast<Record *>(hitRecord)->undoZeroLength(); @@ -281,12 +283,6 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record } } -// const QuickString &startStr = keyRecord->getStartPos() > hitRecord->getStartPos() ? keyRecord->getStartPosStr() : hitRecord->getStartPosStr(); -// const QuickString &endStr = keyRecord->getEndPos() < hitRecord->getEndPos() ? keyRecord->getEndPosStr() : hitRecord->getEndPosStr(); -// -// int maxStart = max(keyRecord->getStartPos(), hitRecord->getStartPos()); -// int minEnd = min(keyRecord->getEndPos(), hitRecord->getEndPos()); -// if (!(static_cast<ContextIntersect *>(_context))->getWriteA() && !(static_cast<ContextIntersect *>(_context))->getWriteB() && !(static_cast<ContextIntersect *>(_context))->getWriteOverlap() && !(static_cast<ContextIntersect *>(_context))->getLeftJoin()) { @@ -315,7 +311,12 @@ void RecordOutputMgr::reportOverlapDetail(const Record *keyRecord, const Record if (needsFlush()) flush(); } else if ((static_cast<ContextIntersect *>(_context))->getWriteOverlap()) { - int printOverlapBases = max(0, minEnd-maxStart); + int printOverlapBases = 0; + if (_context->getObeySplits()) { + printOverlapBases = _splitInfo->getOverlapBases(hitIdx); + } else { + printOverlapBases = minEnd - maxStart; + } printKey(keyRecord); tab(); hitRecord->print(_outBuf); @@ -416,7 +417,7 @@ void RecordOutputMgr::printKey(const Record *key, const QuickString & start, con if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { key->print(_outBuf, start, end); } else { - static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBlockList); + static_cast<const BamRecord *>(key)->print(_outBuf, start, end, _currBamBlockList); } } @@ -425,7 +426,7 @@ void RecordOutputMgr::printKey(const Record *key) if (key->getType() != FileRecordTypeChecker::BAM_RECORD_TYPE) { key->print(_outBuf); } else { - static_cast<const BamRecord *>(key)->print(_outBuf, _currBlockList); + static_cast<const BamRecord *>(key)->print(_outBuf, _currBamBlockList); } } diff --git a/src/utils/FileRecordTools/RecordOutputMgr.h b/src/utils/FileRecordTools/RecordOutputMgr.h index c6921561affe0a05c2cd9172ec4b8629fa958f66..69b977b7a9b9eeda866f09f5d13dcbeeb29a6a82 100644 --- a/src/utils/FileRecordTools/RecordOutputMgr.h +++ b/src/utils/FileRecordTools/RecordOutputMgr.h @@ -24,22 +24,27 @@ public: //The init method must be called after all the input files are open. bool init(ContextBase *context); -// void printHeader(const string &); void printRecord(const Record *record); void printRecord(RecordKeyList &keyList); // Added by ARQ void printRecord(const Record *record, const QuickString & value); + //where necessary, pass additional information about splits through the blockMgr. + void setSplitInfo(const BlockMgr *blockMgr) { _splitInfo = blockMgr; } + private: typedef enum { NOT_BAM, BAM_AS_BAM, BAM_AS_BED} printBamType; ContextBase *_context; bool _printable; BamTools::BamWriter *_bamWriter; - RecordKeyList *_currBlockList; + RecordKeyList *_currBamBlockList; QuickString _outBuf; - BlockMgr *_blockMgr; + + // + BlockMgr *_bamBlockMgr; + const BlockMgr *_splitInfo; //some helper functions to neaten the code. void tab() { _outBuf.append('\t'); } void newline() { _outBuf.append('\n'); } @@ -51,7 +56,7 @@ private: bool printKeyAndTerminate(RecordKeyList &keyList); printBamType printBamRecord(RecordKeyList &keyList, bool bamOutputOnly = false); void checkForHeader(); - void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord); + void reportOverlapDetail(const Record *keyRecord, const Record *hitRecord, int hitIdx = 0); void reportOverlapSummary(RecordKeyList &keyList); static const unsigned int MAX_OUTBUF_SIZE = 16384; //16 K diff --git a/src/utils/FileRecordTools/Records/BlockMgr.cpp b/src/utils/FileRecordTools/Records/BlockMgr.cpp index 54dbd78835177f3dc26660abce7478edc27df4dd..4aede83d0de63f3259cf81e66d20fa59b7c18f05 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.cpp +++ b/src/utils/FileRecordTools/Records/BlockMgr.cpp @@ -161,17 +161,23 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList { bool deleteKeyBlocks = false; if (keyList.empty()) { + //get all the blocks for the query record, put them in it's list. getBlocks(keyList, deleteKeyBlocks); } + _overlapBases.clear(); int keyBlocksSumLength = getTotalBlockLength(keyList); + //Loop through every database record the query intersected with for (RecordKeyList::const_iterator_type hitListIter = hitList.begin(); hitListIter != hitList.end(); hitListIter = hitList.next()) { RecordKeyList hitBlocks(hitListIter->value()); bool deleteHitBlocks = false; - getBlocks(hitBlocks, deleteHitBlocks); - int hitBlockSumLength = getTotalBlockLength(hitBlocks); + getBlocks(hitBlocks, deleteHitBlocks); //get all blocks for the hit record. + int hitBlockSumLength = getTotalBlockLength(hitBlocks); //get total lentgh of the bocks for the hitRecord. int totalHitOverlap = 0; bool hitHasOverlap = false; + + //loop through every block of the database record. for (RecordKeyList::const_iterator_type hitBlockIter = hitBlocks.begin(); hitBlockIter != hitBlocks.end(); hitBlockIter = hitBlocks.next()) { + //loop through every block of the query record. for (RecordKeyList::const_iterator_type keyListIter = keyList.begin(); keyListIter != keyList.end(); keyListIter = keyList.next()) { const Record *keyBlock = keyListIter->value(); const Record *hitBlock = hitBlockIter->value(); @@ -186,6 +192,7 @@ int BlockMgr::findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList } } + _overlapBases.push_back(totalHitOverlap); if (hitHasOverlap) { if ((float) totalHitOverlap / (float)keyBlocksSumLength > _context->getOverlapFraction()) { if (_context->getReciprocal() && diff --git a/src/utils/FileRecordTools/Records/BlockMgr.h b/src/utils/FileRecordTools/Records/BlockMgr.h index 5de642f8cac274e4020829763750805de117b65b..86ddfd7e2b5e1b3d7de129e10b18d87b0cb13866 100644 --- a/src/utils/FileRecordTools/Records/BlockMgr.h +++ b/src/utils/FileRecordTools/Records/BlockMgr.h @@ -36,17 +36,20 @@ public: // and checking that their total intersection meets any overlapFraction and reciprocal criteria compared to // the total block lengths of the hitList and keyList. All hits that pass will be in the resultList. // Return value is the number of hits in the result set. + int findBlockedOverlaps(RecordKeyList &keyList, RecordKeyList &hitList, RecordKeyList &resultList); //these are setting options for splitting BAM records void setBreakOnDeletionOps(bool val) { _breakOnDeletionOps = val; } void setBreakOnSkipOps(bool val) { _breakOnSkipOps = val; } + int getOverlapBases(int hitIdx) const { return _overlapBases[hitIdx]; } private: RecordMgr *_blockRecordsMgr; ContextIntersect *_context; bool _breakOnDeletionOps; bool _breakOnSkipOps; + vector<int> _overlapBases; // For now, all records will be split into Bed6 records. const static FileRecordTypeChecker::RECORD_TYPE _blockRecordsType = FileRecordTypeChecker::BED6_RECORD_TYPE; diff --git a/src/utils/FileRecordTools/Records/Record.cpp b/src/utils/FileRecordTools/Records/Record.cpp index 8e7d79111a8f599e9e4b213f9297a9b67e24a488..2beb4dca13cad05913be21e5870c66657a2dfdc7 100644 --- a/src/utils/FileRecordTools/Records/Record.cpp +++ b/src/utils/FileRecordTools/Records/Record.cpp @@ -195,14 +195,9 @@ ostream &operator << (ostream &out, const Record &record) const QuickString &Record::getField(int fieldNum) const { -// try { -// _column_vec.push_back(hits[i].fields.at(_column)); -// } -// catch(std::out_of_range& e) { - cerr << endl << "*****" << endl - << "*****ERROR: requested column " << fieldNum << - " , but record only has fields 1 - " << getNumFields() << ". Exiting." << endl - << endl << "*****" << endl; - exit(1); -// } + cerr << endl << "*****" << endl + << "*****ERROR: requested column " << fieldNum << + " , but record only has fields 1 - " << getNumFields() << ". Exiting." << endl + << endl << "*****" << endl; + exit(1); } diff --git a/src/utils/FileRecordTools/Records/recordsTar.tar.gz b/src/utils/FileRecordTools/Records/recordsTar.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..321a88e4935d73f7034cce0611f9191d27bfeaa2 Binary files /dev/null and b/src/utils/FileRecordTools/Records/recordsTar.tar.gz differ diff --git a/test/getfasta/t_fH.fa b/test/getfasta/t_fH.fa new file mode 100644 index 0000000000000000000000000000000000000000..d58c3d7e96bcb068fefa31eadbc338d7dba78a9b --- /dev/null +++ b/test/getfasta/t_fH.fa @@ -0,0 +1,6 @@ +>chr1 assembled by consortium X +aggggggggg +cggggggggg +tggggggggg +aggggggggg +cggggggggg diff --git a/test/getfasta/t_fH.fa.fai b/test/getfasta/t_fH.fa.fai new file mode 100644 index 0000000000000000000000000000000000000000..924e70dd6e310546078bbb1bd586b4cf227adfb7 --- /dev/null +++ b/test/getfasta/t_fH.fa.fai @@ -0,0 +1 @@ +chr1 assembled by consortium X 50 32 10 11 diff --git a/test/getfasta/test-getfasta.sh b/test/getfasta/test-getfasta.sh index a661b22dfc9659cac92e69e215a90529c70e303c..499c5724f5c7caca21e9e7a70968438740ee7a04 100644 --- a/test/getfasta/test-getfasta.sh +++ b/test/getfasta/test-getfasta.sh @@ -59,3 +59,12 @@ if [ "$SEQ" != "tag" ]; then else echo ok fi + +# test -fullHeader +echo " getfasta.t06...\c" +LINES=$(echo $'chr1 assembled by consortium X\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin -fo - | awk 'END{ print NR }') +if [ "$LINES" != "2" ]; then + echo fail +else + echo ok +fi diff --git a/test/intersect/d.bed b/test/intersect/d.bed new file mode 100644 index 0000000000000000000000000000000000000000..27061a4a23c160d7106c1eb20209d2830c1141fe --- /dev/null +++ b/test/intersect/d.bed @@ -0,0 +1 @@ +chr1 5 15 diff --git a/test/intersect/new_test-intersect.sh b/test/intersect/new_test-intersect.sh index 85c6a8bbbf77c8f3f2ff7600bf8f6c4cc01efb4b..8516c7cb3124956846e8fda2322777593f12eb77 100755 --- a/test/intersect/new_test-intersect.sh +++ b/test/intersect/new_test-intersect.sh @@ -223,8 +223,8 @@ rm obs exp ########################################################### -# Test intersection of bam file containing read where one -# mate is mapped and one is not. +# Test intersection of bam file containing read where query +# is mapped and mate is not. ############################################################ echo " intersect.new.t19...\c" echo \ @@ -234,8 +234,8 @@ check obs exp rm obs exp ########################################################### -# Test intersection of bam file containing read where one -# mate is mapped and one is not, with noHit (-v) option. +# Test intersection of bam file containing read where query +# is mapped and mate is not, with noHit (-v) option. ############################################################ echo " intersect.new.t20...\c" echo \ @@ -245,6 +245,30 @@ check obs exp rm obs exp +########################################################### +# Test intersection of bam file containing read where query +# is unmapped but mate is mapped. +############################################################ +echo " intersect.new.t20.b...\c" +touch exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test intersection of bam file containing read where one +# mate is mapped and one is not, with noHit (-v) option. +############################################################ +echo " intersect.new.t20.c...\c" +echo ". -1 -1 TTTACCTTT:4FSQ5P1:286:D2GA7ACXX:6:2316:20858:89646 0 . -1 -1 -1 0,0,0 0 . ." > exp +$BT intersect -a queryUnmappedMateMappedCoordsInvalid.bam -b j1.bed -bed -v > obs +check obs exp +rm obs exp + + + + + ########################################################### # Test intersection with -sorted, see that order specified # in genome file is enforced. diff --git a/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam b/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam new file mode 100644 index 0000000000000000000000000000000000000000..1ecaaae0cef02ede4312af6b87a55764112e3382 Binary files /dev/null and b/test/intersect/queryUnmappedMateMappedCoordsInvalid.bam differ diff --git a/test/intersect/test-intersect.sh b/test/intersect/test-intersect.sh index 40e2ec487651044a4c43baa0877e63547555eaa4..da0ba967e8530b2b6feb0f366c3acc6eb4e6ec5b 100644 --- a/test/intersect/test-intersect.sh +++ b/test/intersect/test-intersect.sh @@ -129,6 +129,7 @@ $BT intersect -a a.bed -b b.bed -wo -s > obs check obs exp rm obs exp + ########################################################### # Test with -wao (write all overlap) with -s ############################################################ @@ -255,6 +256,67 @@ rm obs exp +########################################################### +# Test three blocks with -split -wo only shows 5 overlap +# bases, not ten. +############################################################ +echo " intersect.t22.a...\c" +echo "chr1 0 50 three_blocks_match 0 + 0 0 0 3 10,10,10, 0,20,40, chr1 5 15 5" > exp +$BT intersect -a three_blocks_match.bed -b d.bed -split -wo > obs +check obs exp +rm obs exp + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.b...\c" +echo "chr1 0 50 three_blocks_match 255 + 0 50 0,0,0 3 10,10,10, 0,20,40, chr1 5 15 5" > exp +$BT intersect -a three_blocks_match.bam -b d.bed -split -wo -bed > obs +check obs exp +rm obs exp + + +########################################################### +# Test three blocks with -split -wo, and DB record also has +# blocks that somewhat intersect +############################################################ +echo " intersect.t22.c...\c" +echo "chr1 0 50 three_blocks_match 0 + 0 0 0 3 10,10,10, 0,20,40, chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35, 10" > exp +$BT intersect -a three_blocks_match.bed -b two_blocks_partial.bed -split -wo > obs +check obs exp +rm obs exp + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.d...\c" +echo "chr1 0 50 three_blocks_match 255 + 0 50 0,0,0 3 10,10,10, 0,20,40, chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35, 10" > exp +$BT intersect -a three_blocks_match.bam -b two_blocks_partial.bed -split -wo -bed > obs +check obs exp +rm obs exp + +########################################################### +# Test three blocks with -split -wo, and DB record also has +# blocks that do not intersect +############################################################ +echo " intersect.t22.e...\c" +touch exp +$BT intersect -a three_blocks_match.bed -b three_blocks_nomatch.bed -split -wo > obs +check obs exp +rm obs exp + + +########################################################### +# Same test but for BAM file +############################################################ +echo " intersect.t22.f...\c" +touch exp +$BT intersect -a three_blocks_match.bam -b three_blocks_nomatch.bed -split -wo -bed > obs +check obs exp +rm obs exp + + + ################################################################## # Test that only the mapped read is is found as an intersection ################################################################## diff --git a/test/intersect/three_blocks_match.bam b/test/intersect/three_blocks_match.bam new file mode 100644 index 0000000000000000000000000000000000000000..f67547b5d0b324171cc5148d9d49a09b7cb322ff Binary files /dev/null and b/test/intersect/three_blocks_match.bam differ diff --git a/test/intersect/two_blocks_partial.bam b/test/intersect/two_blocks_partial.bam new file mode 100644 index 0000000000000000000000000000000000000000..3a82822b49a0adffcdafe9b4b81aa7841db1cde1 Binary files /dev/null and b/test/intersect/two_blocks_partial.bam differ diff --git a/test/intersect/two_blocks_partial.bed b/test/intersect/two_blocks_partial.bed new file mode 100644 index 0000000000000000000000000000000000000000..8a83f351da64186c42d6b8945a8cab750a021601 --- /dev/null +++ b/test/intersect/two_blocks_partial.bed @@ -0,0 +1 @@ +chr1 0 45 three_blocks_match 0 + 0 0 0 2 5,10, 25,35,