The output of fastaFromBed is now in the order of the input -bed file.

3318a01a · Aaron · 72fc4f98 · 3318a01a · 3318a01a · 3318a01a
Commit 3318a01a authored 14 years ago by Aaron
--- a/src/fastaFromBed/Fasta.cpp
+++ b/src/fastaFromBed/Fasta.cpp
+// ***************************************************************************
+// FastaIndex.cpp (c) 2010 Erik Garrison <erik.garrison@bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 9 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#include "Fasta.h"
+
+FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
+    : name(name)
+    , length(length)
+    , offset(offset)
+    , line_blen(line_blen)
+    , line_len(line_len)
+{}
+
+FastaIndexEntry::FastaIndexEntry(void) // empty constructor
+{ clear(); }
+
+FastaIndexEntry::~FastaIndexEntry(void)
+{}
+
+void FastaIndexEntry::clear(void)
+{
+    name = "";
+    length = NULL;
+    offset = -1;  // no real offset will ever be below 0, so this allows us to
+                  // check if we have already recorded a real offset
+    line_blen = NULL;
+    line_len = NULL;
+}
+
+ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
+    // just write the first component of the name, for compliance with other tools
+    output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
+        e.line_blen << "\t" << e.line_len;
+    return output;  // for multiple << operators.
+}
+
+FastaIndex::FastaIndex(void) 
+{}
+
+void FastaIndex::readIndexFile(string fname) {
+    string line;
+    long long linenum = 0;
+    indexFile.open(fname.c_str(), ifstream::in);
+    if (indexFile.is_open()) {
+        while (getline (indexFile, line)) {
+            ++linenum;
+            // the fai format defined in samtools is tab-delimited, every line being:
+            // fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len
+            vector<string> fields = split(line, '\t');
+            if (fields.size() == 5) {  // if we don't get enough fields then there is a problem with the file
+                // note that fields[0] is the sequence name
+                char* end;
+                string name = split(fields[0], " \t").at(0);  // key by first token of name
+                sequenceNames.push_back(name);
+                this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
+                                                    strtoll(fields[2].c_str(), &end, 10),
+                                                    atoi(fields[3].c_str()),
+                                                    atoi(fields[4].c_str()))));
+            } else {
+                cerr << "Warning: malformed fasta index file " << fname << 
+                    "does not have enough fields @ line " << linenum << endl;
+                cerr << line << endl;
+                exit(1);
+            }
+        }
+    } else {
+        cerr << "could not open index file " << fname << endl;
+        exit(1);
+    }
+}
+
+// for consistency this should be a class method
+bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); }
+
+ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
+    vector<FastaIndexEntry> sortedIndex;
+    for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it)
+    {
+        sortedIndex.push_back(fastaIndex[*it]);
+    }
+    sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare);
+    for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
+        output << *fit << endl;
+    }
+    return output;
+}
+
+void FastaIndex::indexReference(string refname) {
+    // overview:
+    //  for line in the reference fasta file
+    //  track byte offset from the start of the file
+    //  if line is a fasta header, take the name and dump the last sequnece to the index
+    //  if line is a sequence, add it to the current sequence
+    //cerr << "indexing fasta reference " << refname << endl;
+    string line;
+    FastaIndexEntry entry;  // an entry buffer used in processing
+    entry.clear();
+    int line_length = 0;
+    long long offset = 0;  // byte offset from start of file
+    long long line_number = 0; // current line number
+    bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
+                                        // this will be used to raise an error
+                                        // if we have a line length change at
+                                        // any line other than the last line in
+                                        // the sequence
+    bool emptyLine = false;  // flag to catch empty lines, which we allow for
+                             // index generation only on the last line of the sequence
+    ifstream refFile;
+    refFile.open(refname.c_str());
+    if (refFile.is_open()) {
+        while (getline(refFile, line)) {
+            ++line_number;
+            line_length = line.length();
+            if (line[0] == ';') {
+                // fasta comment, skip
+            } else if (line[0] == '+') {
+                // fastq quality header
+                getline(refFile, line);
+                line_length = line.length();
+                offset += line_length + 1;
+                // get and don't handle the quality line
+                getline(refFile, line);
+                line_length = line.length();
+            } else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
+                // if we aren't on the first entry, push the last sequence into the index
+                if (entry.name != "") {
+                    mismatchedLineLengths = false; // reset line length error tracker for every new sequence
+                    emptyLine = false;
+                    flushEntryToIndex(entry);
+                    entry.clear();
+                }
+                entry.name = line.substr(1, line_length - 1);
+            } else { // we assume we have found a sequence line
+                if (entry.offset == -1) // NB initially the offset is -1
+                    entry.offset = offset;
+                entry.length += line_length;
+                if (entry.line_len) {
+                    //entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
+                    if (mismatchedLineLengths || emptyLine) {
+                        if (line_length == 0) {
+                            emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+                        } else {
+                            if (emptyLine) {
+                                cerr << "ERROR: embedded newline";
+                            } else {
+                                cerr << "ERROR: mismatched line lengths";
+                            }
+                            cerr << " at line " << line_number << " within sequence " << entry.name <<
+                                endl << "File not suitable for fasta index generation." << endl;
+                            exit(1);
+                        }
+                    }
+                    // this flag is set here and checked on the next line
+                    // because we may have reached the end of the sequence, in
+                    // which case a mismatched line length is OK
+                    if (entry.line_len != line_length + 1) {
+                        mismatchedLineLengths = true;
+                        if (line_length == 0) {
+                            emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
+                        }
+                    }
+                } else {
+                    entry.line_len = line_length + 1; // first line
+                }
+                entry.line_blen = entry.line_len - 1;
+            }
+            offset += line_length + 1;
+        }
+        // we've hit the end of the fasta file!
+        // flush the last entry
+        flushEntryToIndex(entry);
+    } else {
+        cerr << "could not open reference file " << refname << " for indexing!" << endl;
+        exit(1);
+    }
+}
+
+void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
+    string name = split(entry.name, " \t").at(0);  // key by first token of name
+    sequenceNames.push_back(name);
+    this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
+                        entry.offset, entry.line_blen,
+                        entry.line_len)));
+
+}
+
+void FastaIndex::writeIndexFile(string fname) {
+    //cerr << "writing fasta index file " << fname << endl;
+    ofstream file;
+    file.open(fname.c_str()); 
+    if (file.is_open()) {
+        file << *this;
+    } else { 
+        cerr << "could not open index file " << fname << " for writing!" << endl;
+        exit(1);
+    }
+}
+
+FastaIndex::~FastaIndex(void) {
+    indexFile.close();
+}
+
+FastaIndexEntry FastaIndex::entry(string name) {
+    FastaIndex::iterator e = this->find(name);
+    if (e == this->end()) {
+        cerr << "unable to find FASTA index entry for '" << name << "'" << endl;
+        exit(1);
+    } else {
+        return e->second;
+    }
+}
+
+string FastaIndex::indexFileExtension() { return ".fai"; }
+
+void FastaReference::open(string reffilename, bool usemmap) {
+    filename = reffilename;
+    if (!(file = fopen(filename.c_str(), "r"))) {
+        cerr << "could not open " << filename << endl;
+        exit(1);
+    }
+    index = new FastaIndex();
+    struct stat stFileInfo; 
+    string indexFileName = filename + index->indexFileExtension(); 
+    // if we can find an index file, use it
+    if(stat(indexFileName.c_str(), &stFileInfo) == 0) { 
+        index->readIndexFile(indexFileName);
+    } else { // otherwise, read the reference and generate the index file in the cwd
+        cerr << "index file " << indexFileName << " not found, generating..." << endl;
+        index->indexReference(filename);
+        index->writeIndexFile(indexFileName);
+    }
+    if (usemmap) {
+        usingmmap = true;
+        int fd = fileno(file);
+        struct stat sb;
+        if (fstat(fd, &sb) == -1)
+            cerr << "could not stat file" << filename << endl;
+        filesize = sb.st_size;
+        // map the whole file
+        filemm = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
+    }
+}
+
+FastaReference::~FastaReference(void) {
+    fclose(file);
+    if (usingmmap) {
+        munmap(filemm, filesize);
+    }
+    delete index;
+}
+
+string FastaReference::getSequence(string seqname) {
+    FastaIndexEntry entry = index->entry(seqname);
+    int newlines_in_sequence = entry.length / entry.line_blen;
+    int seqlen = newlines_in_sequence  + entry.length;
+    char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+    if (usingmmap) {
+        memcpy(seq, (char*) filemm + entry.offset, seqlen);
+    } else {
+        fseek64(file, entry.offset, SEEK_SET);
+        fread(seq, sizeof(char), seqlen, file);
+    }
+    seq[seqlen] = '\0';
+    char* pbegin = seq;
+    char* pend = seq + (seqlen/sizeof(char));
+    pend = remove(pbegin, pend, '\n');
+    pend = remove(pbegin, pend, '\0');
+    string s = seq;
+    free(seq);
+    s.resize((pend - pbegin)/sizeof(char));
+    return s;
+}
+
+// TODO cleanup; odd function.  use a map
+string FastaReference::sequenceNameStartingWith(string seqnameStart) {
+    try {
+        return (*index)[seqnameStart].name;
+    } catch (exception& e) {
+        cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl;
+        exit(1);
+    }
+}
+
+string FastaReference::getSubSequence(string seqname, int start, int length) {
+    FastaIndexEntry entry = index->entry(seqname);
+    if (start < 0 || length < 1) {
+        cerr << "Error: cannot construct subsequence with negative offset or length < 1" << endl;
+        exit(1);
+    }
+    // we have to handle newlines
+    // approach: count newlines before start
+    //           count newlines by end of read
+    //             subtracting newlines before start find count of embedded newlines
+    int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
+    int newlines_by_end = (start + length - 1) / entry.line_blen;
+    int newlines_inside = newlines_by_end - newlines_before;
+    int seqlen = length + newlines_inside;
+    char* seq = (char*) calloc (seqlen + 1, sizeof(char));
+    if (usingmmap) {
+        memcpy(seq, (char*) filemm + entry.offset + newlines_before + start, seqlen);
+    } else {
+        fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
+        fread(seq, sizeof(char), (off_t) seqlen, file);
+    }
+    seq[seqlen] = '\0';
+    char* pbegin = seq;
+    char* pend = seq + (seqlen/sizeof(char));
+    pend = remove(pbegin, pend, '\n');
+    pend = remove(pbegin, pend, '\0');
+    string s = seq;
+    free(seq);
+    s.resize((pend - pbegin)/sizeof(char));
+    return s;
+}
+
+long unsigned int FastaReference::sequenceLength(string seqname) {
+    FastaIndexEntry entry = index->entry(seqname);
+    return entry.length;
+}
+
--- a/src/fastaFromBed/Fasta.h
+++ b/src/fastaFromBed/Fasta.h
+// ***************************************************************************
+// FastaIndex.h (c) 2010 Erik Garrison <erik.garrison@bc.edu>
+// Marth Lab, Department of Biology, Boston College
+// All rights reserved.
+// ---------------------------------------------------------------------------
+// Last modified: 5 February 2010 (EG)
+// ---------------------------------------------------------------------------
+
+#ifndef _FASTA_H
+#define _FASTA_H
+
+#include <map>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdint.h>
+#include <stdio.h>
+#include <algorithm>
+#include "LargeFileSupport.h"
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "split.h"
+#include <stdlib.h>
+#include <ctype.h>
+#include <unistd.h>
+
+using namespace std;
+
+class FastaIndexEntry {
+    friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
+    public:
+        FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
+        FastaIndexEntry(void);
+        ~FastaIndexEntry(void);
+        string name;  // sequence name
+        int length;  // length of sequence
+        long long offset;  // bytes offset of sequence from start of file
+        int line_blen;  // line length in bytes, sequence characters
+        int line_len;  // line length including newline
+        void clear(void);
+};
+
+class FastaIndex : public map<string, FastaIndexEntry> {
+    friend ostream& operator<<(ostream& output, FastaIndex& i);
+    public:
+        FastaIndex(void);
+        ~FastaIndex(void);
+        vector<string> sequenceNames;
+        void indexReference(string refName);
+        void readIndexFile(string fname);
+        void writeIndexFile(string fname);
+        ifstream indexFile;
+        FastaIndexEntry entry(string key);
+        void flushEntryToIndex(FastaIndexEntry& entry);
+        string indexFileExtension(void);
+};
+
+class FastaReference {
+    public:
+        void open(string reffilename, bool usemmap = false);
+        bool usingmmap;
+        string filename;
+        FastaReference(void) : usingmmap(false) { }
+        ~FastaReference(void);
+        FILE* file;
+        void* filemm;
+        size_t filesize;
+        FastaIndex* index;
+        vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
+        string getSequence(string seqname);
+        // potentially useful for performance, investigate
+        // void getSequence(string seqname, string& sequence);
+        string getSubSequence(string seqname, int start, int length);
+        string sequenceNameStartingWith(string seqnameStart);
+        long unsigned int sequenceLength(string seqname);
+};
+
+#endif
--- a/src/fastaFromBed/LargeFileSupport.h
+++ b/src/fastaFromBed/LargeFileSupport.h
+#pragma once
+
+#define _FILE_OFFSET_BITS 64
+
+#ifdef WIN32
+#define ftell64(a)     _ftelli64(a)
+#define fseek64(a,b,c) _fseeki64(a,b,c)
+typedef __int64_t off_type;
+#else
+#define ftell64(a)     ftello(a)
+#define fseek64(a,b,c) fseeko(a,b,c)
+typedef off_t off_type;
+#endif
--- a/src/fastaFromBed/Makefile
+++ b/src/fastaFromBed/Makefile
@@ -10,7 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/sequenceUtilities/ -I$
 # ----------------------------------
 # define our source and object files
 # ----------------------------------
-SOURCES= fastaFromBedMain.cpp fastaFromBed.cpp
+SOURCES= fastaFromBedMain.cpp fastaFromBed.cpp Fasta.cpp split.cpp
 OBJECTS= $(SOURCES:.cpp=.o)
 _EXT_OBJECTS=bedFile.o sequenceUtils.o lineFileUtilities.o gzstream.o fileType.o
 EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))

--- a/src/fastaFromBed/fastaFromBed.cpp
+++ b/src/fastaFromBed/fastaFromBed.cpp
@@ -57,38 +57,32 @@ Bed2Fa::~Bed2Fa(void) {
 //******************************************************************************
 // ReportDNA
 //******************************************************************************
-void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &currChrom) {
-
-    if ( (bed.start <= currDNA.size()) && (bed.end <= currDNA.size()) ) {
-
-        string dna = currDNA.substr(bed.start, ((bed.end - bed.start)));
-        // revcomp if necessary.  Thanks to Thomas Doktor.
-        if ((_useStrand == true) && (bed.strand == "-"))
-            reverseComplement(dna);
-
-        if (!(_useName)) {
-            if (_useFasta == true) {
-                if (_useStrand == true)
-                    *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end   << "(" << bed.strand << ")" << endl << dna << endl;
-                else
-                    *_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
-            }
-            else {
-                if (_useStrand == true)
-                    *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
-                else
-                    *_faOut << currChrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
-            }
+void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
+
+    // revcomp if necessary.  Thanks to Thomas Doktor.
+    if ((_useStrand == true) && (bed.strand == "-"))
+        reverseComplement(dna);
+
+    if (!(_useName)) {
+        if (_useFasta == true) {
+            if (_useStrand == true)
+                *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end   << "(" << bed.strand << ")" << endl << dna << endl;
+            else
+                *_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
        }
        else {
-            if (_useFasta == true)
-                *_faOut << ">" << bed.name << endl << dna << endl;
+            if (_useStrand == true)
+                *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
            else
-                *_faOut << bed.name << "\t" << dna << endl;
+                *_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
        }
    }
-    else cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond "
-        << currChrom << " size (" << currDNA.size() << " bp).  Skipping." << endl;
+    else {
+        if (_useFasta == true)
+            *_faOut << ">" << bed.name << endl << dna << endl;
+        else
+            *_faOut << bed.name << "\t" << dna << endl;
+    }
 }


@@ -107,44 +101,26 @@ void Bed2Fa::ExtractDNA() {
        exit (1);
    }

-    // load the BED file into an unbinned map.
-    _bed->loadBedFileIntoMapNoBin();
-
-    //Read the fastaDb chromosome by chromosome
-    string fastaDbLine;
-    string currChrom;
-    string currDNA = "";
-    currDNA.reserve(500000000);
-
-    while (getline(faDb,fastaDbLine)) {
-        if (fastaDbLine.find(">",0) != 0 ) {
-            currDNA += fastaDbLine;
-        }
-        else {
-            if (currDNA.size() > 0) {
-
-                vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin();
-                vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end();
-                // loop through each BED entry for this chrom and print the sequence
-                for (; bedItr != bedEnd; ++bedItr) {
-                    ReportDNA(*bedItr, currDNA, currChrom);
-                }
-                currDNA = "";
-            }
-            currChrom = fastaDbLine.substr(1, fastaDbLine.find_first_of(" ")-1);
-        }
-    }
-
-    // process the last chromosome in the fasta file.
-    if (currDNA.size() > 0) {
-        vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin();
-        vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end();
-        // loop through each BED entry for this chrom and print the sequence
-        for (; bedItr != bedEnd; ++bedItr) {
-            ReportDNA(*bedItr, currDNA, currChrom);
+    // open and memory-map genome file
+    FastaReference fr;
+    bool memmap = true;
+    fr.open(_dbFile, memmap);
+
+    BED bed, nullBed;
+    int lineNum = 0;
+    BedLineStatus bedStatus;
+    string sequence;
+
+    _bed->Open();
+    while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            int length = bed.end - bed.start;
+            sequence = fr.getSubSequence(bed.chrom, bed.start, length);
+            ReportDNA(bed, sequence);
+            bed = nullBed;
        }
-        currDNA = "";
    }
+    _bed->Close();
 }



--- a/src/fastaFromBed/fastaFromBed.h
+++ b/src/fastaFromBed/fastaFromBed.h
@@ -14,6 +14,7 @@

 #include "bedFile.h"
 #include "sequenceUtils.h"
+#include "Fasta.h"
 #include <vector>
 #include <iostream>
 #include <fstream>
@@ -35,7 +36,7 @@ public:
    ~Bed2Fa(void);

    void ExtractDNA();
-    void ReportDNA(const BED &bed, const string &currDNA, const string &currChrom);
+    void ReportDNA(const BED &bed, string &dna);


 private:

--- a/src/fastaFromBed/split.cpp
+++ b/src/fastaFromBed/split.cpp
+#include "split.h"
+
+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
+    std::stringstream ss(s);
+    std::string item;
+    while(std::getline(ss, item, delim)) {
+        elems.push_back(item);
+    }
+    return elems;
+}
+
+std::vector<std::string> split(const std::string &s, char delim) {
+    std::vector<std::string> elems;
+    return split(s, delim, elems);
+}
+
+std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
+    char* tok;
+    char cchars [s.size()+1];
+    char* cstr = &cchars[0];
+    strcpy(cstr, s.c_str());
+    tok = strtok(cstr, delims.c_str());
+    while (tok != NULL) {
+        elems.push_back(tok);
+        tok = strtok(NULL, delims.c_str());
+    }
+    return elems;
+}
+
+std::vector<std::string> split(const std::string &s, const std::string& delims) {
+    std::vector<std::string> elems;
+    return split(s, delims, elems);
+}
--- a/src/fastaFromBed/split.h
+++ b/src/fastaFromBed/split.h
+#ifndef __SPLIT_H
+#define __SPLIT_H
+
+// functions to split a string by a specific delimiter
+#include <string>
+#include <vector>
+#include <sstream>
+#include <string.h>
+
+// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
+
+// split a string on a single delimiter character (delim)
+std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, char delim);
+
+// split a string on any character found in the string of delimiters (delims)
+std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
+std::vector<std::string>  split(const std::string &s, const std::string& delims);
+
+#endif
--- a/src/fastaFromBed/split.o
+++ b/src/fastaFromBed/split.o