From accf0a13ac7e752f708f5272d225032d85ced5e0 Mon Sep 17 00:00:00 2001
From: Timothee Flutre <timflutre@gmail.com>
Date: Sat, 1 Feb 2014 12:08:30 +0100
Subject: [PATCH] add option -fullHeader for fasta files

---
 src/fastaFromBed/fastaFromBed.cpp     |  9 ++++---
 src/fastaFromBed/fastaFromBed.h       |  3 ++-
 src/fastaFromBed/fastaFromBedMain.cpp |  9 ++++++-
 src/nucBed/nucBed.cpp                 | 19 +++++++-------
 src/nucBed/nucBed.h                   |  3 ++-
 src/nucBed/nucBedMain.cpp             | 27 ++++++++++++-------
 src/utils/Fasta/Fasta.cpp             | 37 +++++++++++++++++----------
 src/utils/Fasta/Fasta.h               | 13 +++++++---
 test/getfasta/test-getfasta.sh        |  9 +++++++
 9 files changed, 86 insertions(+), 43 deletions(-)

diff --git a/src/fastaFromBed/fastaFromBed.cpp b/src/fastaFromBed/fastaFromBed.cpp
index d9bfd055..46939945 100644
--- a/src/fastaFromBed/fastaFromBed.cpp
+++ b/src/fastaFromBed/fastaFromBed.cpp
@@ -17,14 +17,15 @@
 Bed2Fa::Bed2Fa(bool useName, const string &dbFile, 
                const string &bedFile, const string &fastaOutFile, 
                bool useFasta, bool useStrand, 
-               bool useBlocks) :
+               bool useBlocks, bool useFullHeader) :
     _useName(useName),
     _dbFile(dbFile),
     _bedFile(bedFile),
     _fastaOutFile(fastaOutFile),
     _useFasta(useFasta),
     _useStrand(useStrand),
-    _useBlocks(useBlocks)
+    _useBlocks(useBlocks),
+    _useFullHeader(useFullHeader)
 {
     _bed = new BedFile(_bedFile);
 
@@ -110,7 +111,7 @@ void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
 //******************************************************************************
 void Bed2Fa::ExtractDNA() {
 
-    /* Make sure that we can oen all of the files successfully*/
+    /* Make sure that we can open all of the files successfully*/
 
     // open the fasta database for reading
     ifstream faDb(_dbFile.c_str(), ios::in);
@@ -124,7 +125,7 @@ void Bed2Fa::ExtractDNA() {
     // open and memory-map genome file
     FastaReference *fr = new FastaReference;
     bool memmap = true;
-    fr->open(_dbFile, memmap);
+    fr->open(_dbFile, memmap, _useFullHeader);
 
     BED bed, nullBed;
     string sequence;
diff --git a/src/fastaFromBed/fastaFromBed.h b/src/fastaFromBed/fastaFromBed.h
index 76fab656..a8ff65ee 100644
--- a/src/fastaFromBed/fastaFromBed.h
+++ b/src/fastaFromBed/fastaFromBed.h
@@ -33,7 +33,7 @@ public:
     Bed2Fa(bool useName, const string &dbFile, 
            const string &bedFile, const string &fastaOutFile, 
            bool useFasta, bool useStrand, 
-           bool useBlocks);
+           bool useBlocks, bool useFullHeader);
 
     // destructor
     ~Bed2Fa(void);
@@ -52,6 +52,7 @@ private:
     bool _useStrand;    // should the extracted sequence obey strandedness?
     bool _useBlocks;    // should the extracted sequence obey BED blocks
                         // (for example, exons?)
+    bool _useFullHeader;
 
     // instance of a bed file class.
     BedFile  *_bed;
diff --git a/src/fastaFromBed/fastaFromBedMain.cpp b/src/fastaFromBed/fastaFromBedMain.cpp
index 77ba7d8d..9bceb3d9 100644
--- a/src/fastaFromBed/fastaFromBedMain.cpp
+++ b/src/fastaFromBed/fastaFromBedMain.cpp
@@ -44,6 +44,7 @@ int fastafrombed_main(int argc, char* argv[]) {
     bool useFasta = true;
     bool useStrand = false;
     bool useBlocks = false;
+    bool useFullHeader = false;
 
     // check to see if we should print out some help
     if(argc <= 1) showHelp = true;
@@ -97,6 +98,9 @@ int fastafrombed_main(int argc, char* argv[]) {
         else if(PARAMETER_CHECK("-s", 2, parameterLength)) {
             useStrand = true;
         }
+        else if(PARAMETER_CHECK("-fullHeader", 11, parameterLength)) {
+            useFullHeader = true;
+        }
         else {
             cerr << "*****ERROR: Unrecognized parameter: " 
                  << argv[i] 
@@ -115,7 +119,7 @@ int fastafrombed_main(int argc, char* argv[]) {
         Bed2Fa *b2f = new Bed2Fa(useNameOnly, fastaDbFile, 
                                  bedFile, fastaOutFile, 
                                  useFasta, useStrand, 
-                                 useBlocks);
+                                 useBlocks, useFullHeader);
         delete b2f;
     }
     else {
@@ -148,6 +152,9 @@ void fastafrombed_help(void) {
          << endl;
     cerr << "\t\tstrand, the sequence will be reverse complemented." << endl;
     cerr << "\t\t- By default, strand information is ignored." << endl << endl;
+    cerr << "\t-fullHeader\tUse full fasta header." << endl;
+    cerr << "\t\t- By default, only the word before the first space or tab "
+	 << "is used." << endl << endl;
 
     // end the program here
     exit(1);
diff --git a/src/nucBed/nucBed.cpp b/src/nucBed/nucBed.cpp
index 446f6a6b..9b515d27 100644
--- a/src/nucBed/nucBed.cpp
+++ b/src/nucBed/nucBed.cpp
@@ -15,16 +15,17 @@
 
 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, 
                bool hasPattern, const string &pattern, 
-               bool forceStrand, bool ignoreCase) 
+               bool forceStrand, bool ignoreCase, bool useFullHeader) 
 {
 
-    _dbFile       = dbFile;
-    _bedFile      = bedFile;
-    _printSeq     = printSeq;
-    _hasPattern   = hasPattern;
-    _pattern      = pattern;
-    _forceStrand  = forceStrand;
-    _ignoreCase   = ignoreCase;
+    _dbFile        = dbFile;
+    _bedFile       = bedFile;
+    _printSeq      = printSeq;
+    _hasPattern    = hasPattern;
+    _pattern       = pattern;
+    _forceStrand   = forceStrand;
+    _ignoreCase    = ignoreCase;
+    _useFullHeader = useFullHeader;
     _bed = new BedFile(_bedFile);
 
     // Compute the DNA content in each BED/GFF/VCF interval
@@ -109,7 +110,7 @@ void NucBed::ProfileDNA() {
     // open and memory-map genome file
     FastaReference fr;
     bool memmap = true;    
-    fr.open(_dbFile, memmap);
+    fr.open(_dbFile, memmap, _useFullHeader);
 
     bool headerReported = false;
     BED bed;
diff --git a/src/nucBed/nucBed.h b/src/nucBed/nucBed.h
index ba45af4f..dc82f3e2 100644
--- a/src/nucBed/nucBed.h
+++ b/src/nucBed/nucBed.h
@@ -31,7 +31,7 @@ public:
     // constructor
     NucBed(string &dbFile, string &bedFile, bool printSeq, 
            bool hasPattern, const string &pattern,
-           bool forceStrand, bool ignoreCase);
+           bool forceStrand, bool ignoreCase, bool useFullHeader);
     // destructor
     ~NucBed(void);
 
@@ -46,6 +46,7 @@ private:
     string _pattern;
     bool _forceStrand;
     bool _ignoreCase;
+    bool _useFullHeader;
 
     // instance of a bed file class.
     BedFile  *_bed;
diff --git a/src/nucBed/nucBedMain.cpp b/src/nucBed/nucBedMain.cpp
index eff8bec3..f8cbc35a 100644
--- a/src/nucBed/nucBedMain.cpp
+++ b/src/nucBed/nucBedMain.cpp
@@ -35,12 +35,13 @@ int nuc_main(int argc, char* argv[]) {
     string pattern;
 
     // checks for existence of parameters
-    bool haveFastaDb = false;
-    bool haveBed     = false;
-    bool printSeq    = false;
-    bool hasPattern  = false;
-    bool forceStrand = false;
-    bool ignoreCase  = false;
+    bool haveFastaDb   = false;
+    bool haveBed       = false;
+    bool printSeq      = false;
+    bool hasPattern    = false;
+    bool forceStrand   = false;
+    bool ignoreCase    = false;
+    bool useFullHeader = false;
 
     // check to see if we should print out some help
     if(argc <= 1) showHelp = true;
@@ -90,6 +91,9 @@ int nuc_main(int argc, char* argv[]) {
                 pattern = argv[i + 1];
                 i++;
             }
+	}
+        else if(PARAMETER_CHECK("-useFullHeader", 11, parameterLength)) {
+            useFullHeader = true;
         }
         else {
             cerr << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
@@ -104,7 +108,8 @@ int nuc_main(int argc, char* argv[]) {
     if (!showHelp) {
 
         NucBed *nuc = new NucBed(fastaDbFile, bedFile, printSeq, 
-                                 hasPattern, pattern, forceStrand, ignoreCase);
+                                 hasPattern, pattern, forceStrand, ignoreCase,
+				 useFullHeader);
         delete nuc;
     }
     else {
@@ -134,8 +139,12 @@ void nuc_help(void) {
     cerr << "\t-pattern\tReport the number of times a user-defined sequence" << endl;
     cerr << "\t\t\tis observed (case-sensitive)." << endl << endl;    
 
-    cerr << "\t-C\tIgore case when matching -pattern. By defaulty, case matters." << endl << endl;
-    
+    cerr << "\t-C\tIgnore case when matching -pattern. By defaulty, case matters." << endl << endl;
+
+    cerr << "\t-fullHeader\tUse full fasta header." << endl;
+    cerr << "\t\t- By default, only the word before the first space or tab "
+	 << "is used." << endl << endl;
+
     cerr << "Output format: " << endl;
     cerr << "\tThe following information will be reported after each BED entry:" << endl;
     cerr << "\t    1) %AT content" << endl;
diff --git a/src/utils/Fasta/Fasta.cpp b/src/utils/Fasta/Fasta.cpp
index bedf311d..076f06d2 100644
--- a/src/utils/Fasta/Fasta.cpp
+++ b/src/utils/Fasta/Fasta.cpp
@@ -8,12 +8,13 @@
 
 #include "Fasta.h"
 
-FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
+FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len, bool useFullHeader)
     : name(name)
     , length(length)
     , offset(offset)
     , line_blen(line_blen)
     , line_len(line_len)
+    , useFullHeader(useFullHeader)
 {}
 
 FastaIndexEntry::FastaIndexEntry(void) // empty constructor
@@ -30,16 +31,19 @@ void FastaIndexEntry::clear(void)
                   // check if we have already recorded a real offset
     line_blen = NULL;
     line_len  = NULL;
+    useFullHeader = false;
 }
 
 ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
     // just write the first component of the name, for compliance with other tools
-    output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
-        e.line_blen << "\t" << e.line_len;
+  output << (e.useFullHeader? e.name : split(e.name, ' ').at(0))
+	   << "\t" << e.length << "\t" << e.offset << "\t"
+	   << e.line_blen << "\t" << e.line_len;
     return output;  // for multiple << operators.
 }
 
-FastaIndex::FastaIndex(void) 
+FastaIndex::FastaIndex(bool useFullHeader) 
+  : useFullHeader(useFullHeader)
 {}
 
 void FastaIndex::readIndexFile(string fname) {
@@ -55,12 +59,14 @@ void FastaIndex::readIndexFile(string fname) {
             if (fields.size() == 5) {  // if we don't get enough fields then there is a problem with the file
                 // note that fields[0] is the sequence name
                 char* end;
-                string name = split(fields[0], " \t").at(0);  // key by first token of name
+                string name = useFullHeader ? fields[0] :
+		  split(fields[0], " \t").at(0);  // key by first token of name
                 sequenceNames.push_back(name);
                 this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
-                                                    strtoll(fields[2].c_str(), &end, 10),
-                                                    atoi(fields[3].c_str()),
-                                                    atoi(fields[4].c_str()))));
+							     strtoll(fields[2].c_str(), &end, 10),
+							     atoi(fields[3].c_str()),
+							     atoi(fields[4].c_str()),
+							     useFullHeader)));
             } else {
                 cerr << "Warning: malformed fasta index file " << fname << 
                     "does not have enough fields @ line " << linenum << endl;
@@ -181,12 +187,13 @@ void FastaIndex::indexReference(string refname) {
 }
 
 void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
-    string name = split(entry.name, " \t").at(0);  // key by first token of name
+    string name = useFullHeader? entry.name :
+      split(entry.name, " \t").at(0);  // key by first token of name
     sequenceNames.push_back(name);
     this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
-                        entry.offset, entry.line_blen,
-                        entry.line_len)));
-
+						 entry.offset, entry.line_blen,
+						 entry.line_len,
+						 useFullHeader)));
 }
 
 void FastaIndex::writeIndexFile(string fname) {
@@ -225,13 +232,15 @@ bool FastaIndex::chromFound(string name) {
 
 string FastaIndex::indexFileExtension() { return ".fai"; }
 
-void FastaReference::open(string reffilename, bool usemmap) {
+void FastaReference::open(string reffilename, bool usemmap, bool useFullHeader) {
     filename = reffilename;
     if (!(file = fopen(filename.c_str(), "r"))) {
         cerr << "could not open " << filename << endl;
         exit(1);
     }
-    index = new FastaIndex();
+    if (useFullHeader)
+      usingfullheader = true;
+    index = new FastaIndex(useFullHeader);
     struct stat stFileInfo; 
     string indexFileName = filename + index->indexFileExtension(); 
     // if we can find an index file, use it
diff --git a/src/utils/Fasta/Fasta.h b/src/utils/Fasta/Fasta.h
index 68a8bf80..f9d28208 100644
--- a/src/utils/Fasta/Fasta.h
+++ b/src/utils/Fasta/Fasta.h
@@ -29,7 +29,8 @@ using namespace std;
 class FastaIndexEntry {
     friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
     public:
-        FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
+        FastaIndexEntry(string name, int length, long long offset,
+			int line_blen, int line_len, bool useFullHeader);
         FastaIndexEntry(void);
         ~FastaIndexEntry(void);
         string name;  // sequence name
@@ -37,14 +38,16 @@ class FastaIndexEntry {
         long long offset;  // bytes offset of sequence from start of file
         int line_blen;  // line length in bytes, sequence characters
         int line_len;  // line length including newline
+	bool useFullHeader;
         void clear(void);
 };
 
 class FastaIndex : public map<string, FastaIndexEntry> {
     friend ostream& operator<<(ostream& output, FastaIndex& i);
     public:
-        FastaIndex(void);
+        FastaIndex(bool useFullHeader);
         ~FastaIndex(void);
+	bool useFullHeader;
         vector<string> sequenceNames;
         void indexReference(string refName);
         void readIndexFile(string fname);
@@ -58,10 +61,12 @@ class FastaIndex : public map<string, FastaIndexEntry> {
 
 class FastaReference {
     public:
-        void open(string reffilename, bool usemmap = false);
+        void open(string reffilename, bool usemmap = false,
+		  bool useFullHeader = false);
         bool usingmmap;
         string filename;
-        FastaReference(void) : usingmmap(false) { }
+	bool usingfullheader;
+        FastaReference(void) : usingmmap(false), usingfullheader(false) { }
         ~FastaReference(void);
         FILE* file;
         void* filemm;
diff --git a/test/getfasta/test-getfasta.sh b/test/getfasta/test-getfasta.sh
index a661b22d..499c5724 100644
--- a/test/getfasta/test-getfasta.sh
+++ b/test/getfasta/test-getfasta.sh
@@ -59,3 +59,12 @@ if [ "$SEQ" != "tag" ]; then
 else
     echo ok
 fi
+
+# test -fullHeader
+echo "    getfasta.t06...\c"
+LINES=$(echo $'chr1 assembled by consortium X\t1\t10' | $BT getfasta -fullHeader -fi t_fH.fa -bed stdin -fo - | awk 'END{ print NR }')
+if [ "$LINES" != "2" ]; then
+    echo fail
+else
+    echo ok
+fi
-- 
GitLab