From accf0a13ac7e752f708f5272d225032d85ced5e0 Mon Sep 17 00:00:00 2001 From: Timothee Flutre <timflutre@gmail.com> Date: Sat, 1 Feb 2014 12:08:30 +0100 Subject: [PATCH] add option -fullHeader for fasta files --- src/fastaFromBed/fastaFromBed.cpp | 9 ++++--- src/fastaFromBed/fastaFromBed.h | 3 ++- src/fastaFromBed/fastaFromBedMain.cpp | 9 ++++++- src/nucBed/nucBed.cpp | 19 +++++++------- src/nucBed/nucBed.h | 3 ++- src/nucBed/nucBedMain.cpp | 27 ++++++++++++------- src/utils/Fasta/Fasta.cpp | 37 +++++++++++++++++---------- src/utils/Fasta/Fasta.h | 13 +++++++--- test/getfasta/test-getfasta.sh | 9 +++++++ 9 files changed, 86 insertions(+), 43 deletions(-) diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp index d9bfd055..46939945 100644 --- a/src/fastaFromBed/fastaFromBed.cpp +++ b/src/fastaFromBed/fastaFromBed.cpp @@ -17,14 +17,15 @@ Bed2Fa::Bed2Fa(bool useName, const string &dbFile, const string &bedFile, const string &fastaOutFile, bool useFasta, bool useStrand, - bool useBlocks) : + bool useBlocks, bool useFullHeader) : _useName(useName), _dbFile(dbFile), _bedFile(bedFile), _fastaOutFile(fastaOutFile), _useFasta(useFasta), _useStrand(useStrand), - _useBlocks(useBlocks) + _useBlocks(useBlocks), + _useFullHeader(useFullHeader) { _bed = new BedFile(_bedFile); @@ -110,7 +111,7 @@ void Bed2Fa::ReportDNA(const BED &bed, string &dna) { //****************************************************************************** void Bed2Fa::ExtractDNA() { - /* Make sure that we can oen all of the files successfully*/ + /* Make sure that we can open all of the files successfully*/ // open the fasta database for reading ifstream faDb(_dbFile.c_str(), ios::in); @@ -124,7 +125,7 @@ void Bed2Fa::ExtractDNA() { // open and memory-map genome file FastaReference *fr = new FastaReference; bool memmap = true; - fr->open(_dbFile, memmap); + fr->open(_dbFile, memmap, _useFullHeader); BED bed, nullBed; string sequence; diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h index 76fab656..a8ff65ee 100644 --- a/src/fastaFromBed/fastaFromBed.h +++ b/src/fastaFromBed/fastaFromBed.h @@ -33,7 +33,7 @@ public: Bed2Fa(bool useName, const string &dbFile, const string &bedFile, const string &fastaOutFile, bool useFasta, bool useStrand, - bool useBlocks); + bool useBlocks, bool useFullHeader); // destructor ~Bed2Fa(void); @@ -52,6 +52,7 @@ private: bool _useStrand; // should the extracted sequence obey strandedness? bool _useBlocks; // should the extracted sequence obey BED blocks // (for example, exons?) + bool _useFullHeader; // instance of a bed file class. BedFile *_bed; diff --git a/src/fastaFromBed/fastaFromBedMain.cpp b/src/fastaFromBed/fastaFromBedMain.cpp index 77ba7d8d..9bceb3d9 100644 --- a/src/fastaFromBed/fastaFromBedMain.cpp +++ b/src/fastaFromBed/fastaFromBedMain.cpp @@ -44,6 +44,7 @@ int fastafrombed_main(int argc, char* argv[]) { bool useFasta = true; bool useStrand = false; bool useBlocks = false; + bool useFullHeader = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -97,6 +98,9 @@ int fastafrombed_main(int argc, char* argv[]) { else if(PARAMETER_CHECK("-s", 2, parameterLength)) { useStrand = true; } + else if(PARAMETER_CHECK("-fullHeader", 11, parameterLength)) { + useFullHeader = true; + } else { cerr << "*****ERROR: Unrecognized parameter: " << argv[i] @@ -115,7 +119,7 @@ int fastafrombed_main(int argc, char* argv[]) { Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, bedFile, fastaOutFile, useFasta, useStrand, - useBlocks); + useBlocks, useFullHeader); delete b2f; } else { @@ -148,6 +152,9 @@ void fastafrombed_help(void) { << endl; cerr << "\t\tstrand, the sequence will be reverse complemented." << endl; cerr << "\t\t- By default, strand information is ignored." << endl << endl; + cerr << "\t-fullHeader\tUse full fasta header." << endl; + cerr << "\t\t- By default, only the word before the first space or tab " + << "is used." << endl << endl; // end the program here exit(1); diff --git a/src/nucBed/nucBed.cpp b/src/nucBed/nucBed.cpp index 446f6a6b..9b515d27 100644 --- a/src/nucBed/nucBed.cpp +++ b/src/nucBed/nucBed.cpp @@ -15,16 +15,17 @@ NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern, - bool forceStrand, bool ignoreCase) + bool forceStrand, bool ignoreCase, bool useFullHeader) { - _dbFile = dbFile; - _bedFile = bedFile; - _printSeq = printSeq; - _hasPattern = hasPattern; - _pattern = pattern; - _forceStrand = forceStrand; - _ignoreCase = ignoreCase; + _dbFile = dbFile; + _bedFile = bedFile; + _printSeq = printSeq; + _hasPattern = hasPattern; + _pattern = pattern; + _forceStrand = forceStrand; + _ignoreCase = ignoreCase; + _useFullHeader = useFullHeader; _bed = new BedFile(_bedFile); // Compute the DNA content in each BED/GFF/VCF interval @@ -109,7 +110,7 @@ void NucBed::ProfileDNA() { // open and memory-map genome file FastaReference fr; bool memmap = true; - fr.open(_dbFile, memmap); + fr.open(_dbFile, memmap, _useFullHeader); bool headerReported = false; BED bed; diff --git a/src/nucBed/nucBed.h b/src/nucBed/nucBed.h index ba45af4f..dc82f3e2 100644 --- a/src/nucBed/nucBed.h +++ b/src/nucBed/nucBed.h @@ -31,7 +31,7 @@ public: // constructor NucBed(string &dbFile, string &bedFile, bool printSeq, bool hasPattern, const string &pattern, - bool forceStrand, bool ignoreCase); + bool forceStrand, bool ignoreCase, bool useFullHeader); // destructor ~NucBed(void); @@ -46,6 +46,7 @@ private: string _pattern; bool _forceStrand; bool _ignoreCase; + bool _useFullHeader; // instance of a bed file class. BedFile *_bed; diff --git a/src/nucBed/nucBedMain.cpp b/src/nucBed/nucBedMain.cpp index eff8bec3..f8cbc35a 100644 --- a/src/nucBed/nucBedMain.cpp +++ b/src/nucBed/nucBedMain.cpp @@ -35,12 +35,13 @@ int nuc_main(int argc, char* argv[]) { string pattern; // checks for existence of parameters - bool haveFastaDb = false; - bool haveBed = false; - bool printSeq = false; - bool hasPattern = false; - bool forceStrand = false; - bool ignoreCase = false; + bool haveFastaDb = false; + bool haveBed = false; + bool printSeq = false; + bool hasPattern = false; + bool forceStrand = false; + bool ignoreCase = false; + bool useFullHeader = false; // check to see if we should print out some help if(argc <= 1) showHelp = true; @@ -90,6 +91,9 @@ int nuc_main(int argc, char* argv[]) { pattern = argv[i + 1]; i++; } + } + else if(PARAMETER_CHECK("-useFullHeader", 11, parameterLength)) { + useFullHeader = true; } else { cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; @@ -104,7 +108,8 @@ int nuc_main(int argc, char* argv[]) { if (!showHelp) { NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, - hasPattern, pattern, forceStrand, ignoreCase); + hasPattern, pattern, forceStrand, ignoreCase, + useFullHeader); delete nuc; } else { @@ -134,8 +139,12 @@ void nuc_help(void) { cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl; cerr << "\t\t\tis observed (case-sensitive)." << endl << endl; - cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl; - + cerr << "\t-C\tIgnore case when matching -pattern. By defaulty, case matters." << endl << endl; + + cerr << "\t-fullHeader\tUse full fasta header." << endl; + cerr << "\t\t- By default, only the word before the first space or tab " + << "is used." << endl << endl; + cerr << "Output format: " << endl; cerr << "\tThe following information will be reported after each BED entry:" << endl; cerr << "\t 1) %AT content" << endl; diff --git a/src/utils/Fasta/Fasta.cpp b/src/utils/Fasta/Fasta.cpp index bedf311d..076f06d2 100644 --- a/src/utils/Fasta/Fasta.cpp +++ b/src/utils/Fasta/Fasta.cpp @@ -8,12 +8,13 @@ #include "Fasta.h" -FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len) +FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len, bool useFullHeader) : name(name) , length(length) , offset(offset) , line_blen(line_blen) , line_len(line_len) + , useFullHeader(useFullHeader) {} FastaIndexEntry::FastaIndexEntry(void) // empty constructor @@ -30,16 +31,19 @@ void FastaIndexEntry::clear(void) // check if we have already recorded a real offset line_blen = NULL; line_len = NULL; + useFullHeader = false; } ostream& operator<<(ostream& output, const FastaIndexEntry& e) { // just write the first component of the name, for compliance with other tools - output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" << - e.line_blen << "\t" << e.line_len; + output << (e.useFullHeader? e.name : split(e.name, ' ').at(0)) + << "\t" << e.length << "\t" << e.offset << "\t" + << e.line_blen << "\t" << e.line_len; return output; // for multiple << operators. } -FastaIndex::FastaIndex(void) +FastaIndex::FastaIndex(bool useFullHeader) + : useFullHeader(useFullHeader) {} void FastaIndex::readIndexFile(string fname) { @@ -55,12 +59,14 @@ void FastaIndex::readIndexFile(string fname) { if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file // note that fields[0] is the sequence name char* end; - string name = split(fields[0], " \t").at(0); // key by first token of name + string name = useFullHeader ? fields[0] : + split(fields[0], " \t").at(0); // key by first token of name sequenceNames.push_back(name); this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()), - strtoll(fields[2].c_str(), &end, 10), - atoi(fields[3].c_str()), - atoi(fields[4].c_str())))); + strtoll(fields[2].c_str(), &end, 10), + atoi(fields[3].c_str()), + atoi(fields[4].c_str()), + useFullHeader))); } else { cerr << "Warning: malformed fasta index file " << fname << "does not have enough fields @ line " << linenum << endl; @@ -181,12 +187,13 @@ void FastaIndex::indexReference(string refname) { } void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) { - string name = split(entry.name, " \t").at(0); // key by first token of name + string name = useFullHeader? entry.name : + split(entry.name, " \t").at(0); // key by first token of name sequenceNames.push_back(name); this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length, - entry.offset, entry.line_blen, - entry.line_len))); - + entry.offset, entry.line_blen, + entry.line_len, + useFullHeader))); } void FastaIndex::writeIndexFile(string fname) { @@ -225,13 +232,15 @@ bool FastaIndex::chromFound(string name) { string FastaIndex::indexFileExtension() { return ".fai"; } -void FastaReference::open(string reffilename, bool usemmap) { +void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) { filename = reffilename; if (!(file = fopen(filename.c_str(), "r"))) { cerr << "could not open " << filename << endl; exit(1); } - index = new FastaIndex(); + if (useFullHeader) + usingfullheader = true; + index = new FastaIndex(useFullHeader); struct stat stFileInfo; string indexFileName = filename + index->indexFileExtension(); // if we can find an index file, use it diff --git a/src/utils/Fasta/Fasta.h b/src/utils/Fasta/Fasta.h index 68a8bf80..f9d28208 100644 --- a/src/utils/Fasta/Fasta.h +++ b/src/utils/Fasta/Fasta.h @@ -29,7 +29,8 @@ using namespace std; class FastaIndexEntry { friend ostream& operator<<(ostream& output, const FastaIndexEntry& e); public: - FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len); + FastaIndexEntry(string name, int length, long long offset, + int line_blen, int line_len, bool useFullHeader); FastaIndexEntry(void); ~FastaIndexEntry(void); string name; // sequence name @@ -37,14 +38,16 @@ class FastaIndexEntry { long long offset; // bytes offset of sequence from start of file int line_blen; // line length in bytes, sequence characters int line_len; // line length including newline + bool useFullHeader; void clear(void); }; class FastaIndex : public map<string, FastaIndexEntry> { friend ostream& operator<<(ostream& output, FastaIndex& i); public: - FastaIndex(void); + FastaIndex(bool useFullHeader); ~FastaIndex(void); + bool useFullHeader; vector<string> sequenceNames; void indexReference(string refName); void readIndexFile(string fname); @@ -58,10 +61,12 @@ class FastaIndex : public map<string, FastaIndexEntry> { class FastaReference { public: - void open(string reffilename, bool usemmap = false); + void open(string reffilename, bool usemmap = false, + bool useFullHeader = false); bool usingmmap; string filename; - FastaReference(void) : usingmmap(false) { } + bool usingfullheader; + FastaReference(void) : usingmmap(false), usingfullheader(false) { } ~FastaReference(void); FILE* file; void* filemm; diff --git a/test/getfasta/test-getfasta.sh b/test/getfasta/test-getfasta.sh index a661b22d..499c5724 100644 --- a/test/getfasta/test-getfasta.sh +++ b/test/getfasta/test-getfasta.sh @@ -59,3 +59,12 @@ if [ "$SEQ" != "tag" ]; then else echo ok fi + +# test -fullHeader +echo " getfasta.t06...\c" +LINES=$(echo $'chr1 assembled by consortium X\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin -fo - | awk 'END{ print NR }') +if [ "$LINES" != "2" ]; then + echo fail +else + echo ok +fi -- GitLab