Skip to content
Snippets Groups Projects
Commit 3318a01a authored by Aaron's avatar Aaron
Browse files

The output of fastaFromBed is now in the order of the input -bed file.

parent 72fc4f98
No related branches found
No related tags found
No related merge requests found
// ***************************************************************************
// FastaIndex.cpp (c) 2010 Erik Garrison <erik.garrison@bc.edu>
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 9 February 2010 (EG)
// ---------------------------------------------------------------------------
#include "Fasta.h"
FastaIndexEntry::FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len)
: name(name)
, length(length)
, offset(offset)
, line_blen(line_blen)
, line_len(line_len)
{}
FastaIndexEntry::FastaIndexEntry(void) // empty constructor
{ clear(); }
FastaIndexEntry::~FastaIndexEntry(void)
{}
void FastaIndexEntry::clear(void)
{
name = "";
length = NULL;
offset = -1; // no real offset will ever be below 0, so this allows us to
// check if we have already recorded a real offset
line_blen = NULL;
line_len = NULL;
}
ostream& operator<<(ostream& output, const FastaIndexEntry& e) {
// just write the first component of the name, for compliance with other tools
output << split(e.name, ' ').at(0) << "\t" << e.length << "\t" << e.offset << "\t" <<
e.line_blen << "\t" << e.line_len;
return output; // for multiple << operators.
}
FastaIndex::FastaIndex(void)
{}
void FastaIndex::readIndexFile(string fname) {
string line;
long long linenum = 0;
indexFile.open(fname.c_str(), ifstream::in);
if (indexFile.is_open()) {
while (getline (indexFile, line)) {
++linenum;
// the fai format defined in samtools is tab-delimited, every line being:
// fai->name[i], (int)x.len, (long long)x.offset, (int)x.line_blen, (int)x.line_len
vector<string> fields = split(line, '\t');
if (fields.size() == 5) { // if we don't get enough fields then there is a problem with the file
// note that fields[0] is the sequence name
char* end;
string name = split(fields[0], " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(fields[0], atoi(fields[1].c_str()),
strtoll(fields[2].c_str(), &end, 10),
atoi(fields[3].c_str()),
atoi(fields[4].c_str()))));
} else {
cerr << "Warning: malformed fasta index file " << fname <<
"does not have enough fields @ line " << linenum << endl;
cerr << line << endl;
exit(1);
}
}
} else {
cerr << "could not open index file " << fname << endl;
exit(1);
}
}
// for consistency this should be a class method
bool fastaIndexEntryCompare ( FastaIndexEntry a, FastaIndexEntry b) { return (a.offset<b.offset); }
ostream& operator<<(ostream& output, FastaIndex& fastaIndex) {
vector<FastaIndexEntry> sortedIndex;
for(vector<string>::const_iterator it = fastaIndex.sequenceNames.begin(); it != fastaIndex.sequenceNames.end(); ++it)
{
sortedIndex.push_back(fastaIndex[*it]);
}
sort(sortedIndex.begin(), sortedIndex.end(), fastaIndexEntryCompare);
for( vector<FastaIndexEntry>::iterator fit = sortedIndex.begin(); fit != sortedIndex.end(); ++fit) {
output << *fit << endl;
}
return output;
}
void FastaIndex::indexReference(string refname) {
// overview:
// for line in the reference fasta file
// track byte offset from the start of the file
// if line is a fasta header, take the name and dump the last sequnece to the index
// if line is a sequence, add it to the current sequence
//cerr << "indexing fasta reference " << refname << endl;
string line;
FastaIndexEntry entry; // an entry buffer used in processing
entry.clear();
int line_length = 0;
long long offset = 0; // byte offset from start of file
long long line_number = 0; // current line number
bool mismatchedLineLengths = false; // flag to indicate if our line length changes mid-file
// this will be used to raise an error
// if we have a line length change at
// any line other than the last line in
// the sequence
bool emptyLine = false; // flag to catch empty lines, which we allow for
// index generation only on the last line of the sequence
ifstream refFile;
refFile.open(refname.c_str());
if (refFile.is_open()) {
while (getline(refFile, line)) {
++line_number;
line_length = line.length();
if (line[0] == ';') {
// fasta comment, skip
} else if (line[0] == '+') {
// fastq quality header
getline(refFile, line);
line_length = line.length();
offset += line_length + 1;
// get and don't handle the quality line
getline(refFile, line);
line_length = line.length();
} else if (line[0] == '>' || line[0] == '@') { // fasta /fastq header
// if we aren't on the first entry, push the last sequence into the index
if (entry.name != "") {
mismatchedLineLengths = false; // reset line length error tracker for every new sequence
emptyLine = false;
flushEntryToIndex(entry);
entry.clear();
}
entry.name = line.substr(1, line_length - 1);
} else { // we assume we have found a sequence line
if (entry.offset == -1) // NB initially the offset is -1
entry.offset = offset;
entry.length += line_length;
if (entry.line_len) {
//entry.line_len = entry.line_len ? entry.line_len : line_length + 1;
if (mismatchedLineLengths || emptyLine) {
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
} else {
if (emptyLine) {
cerr << "ERROR: embedded newline";
} else {
cerr << "ERROR: mismatched line lengths";
}
cerr << " at line " << line_number << " within sequence " << entry.name <<
endl << "File not suitable for fasta index generation." << endl;
exit(1);
}
}
// this flag is set here and checked on the next line
// because we may have reached the end of the sequence, in
// which case a mismatched line length is OK
if (entry.line_len != line_length + 1) {
mismatchedLineLengths = true;
if (line_length == 0) {
emptyLine = true; // flag empty lines, raise error only if this is embedded in the sequence
}
}
} else {
entry.line_len = line_length + 1; // first line
}
entry.line_blen = entry.line_len - 1;
}
offset += line_length + 1;
}
// we've hit the end of the fasta file!
// flush the last entry
flushEntryToIndex(entry);
} else {
cerr << "could not open reference file " << refname << " for indexing!" << endl;
exit(1);
}
}
void FastaIndex::flushEntryToIndex(FastaIndexEntry& entry) {
string name = split(entry.name, " \t").at(0); // key by first token of name
sequenceNames.push_back(name);
this->insert(make_pair(name, FastaIndexEntry(entry.name, entry.length,
entry.offset, entry.line_blen,
entry.line_len)));
}
void FastaIndex::writeIndexFile(string fname) {
//cerr << "writing fasta index file " << fname << endl;
ofstream file;
file.open(fname.c_str());
if (file.is_open()) {
file << *this;
} else {
cerr << "could not open index file " << fname << " for writing!" << endl;
exit(1);
}
}
FastaIndex::~FastaIndex(void) {
indexFile.close();
}
FastaIndexEntry FastaIndex::entry(string name) {
FastaIndex::iterator e = this->find(name);
if (e == this->end()) {
cerr << "unable to find FASTA index entry for '" << name << "'" << endl;
exit(1);
} else {
return e->second;
}
}
string FastaIndex::indexFileExtension() { return ".fai"; }
void FastaReference::open(string reffilename, bool usemmap) {
filename = reffilename;
if (!(file = fopen(filename.c_str(), "r"))) {
cerr << "could not open " << filename << endl;
exit(1);
}
index = new FastaIndex();
struct stat stFileInfo;
string indexFileName = filename + index->indexFileExtension();
// if we can find an index file, use it
if(stat(indexFileName.c_str(), &stFileInfo) == 0) {
index->readIndexFile(indexFileName);
} else { // otherwise, read the reference and generate the index file in the cwd
cerr << "index file " << indexFileName << " not found, generating..." << endl;
index->indexReference(filename);
index->writeIndexFile(indexFileName);
}
if (usemmap) {
usingmmap = true;
int fd = fileno(file);
struct stat sb;
if (fstat(fd, &sb) == -1)
cerr << "could not stat file" << filename << endl;
filesize = sb.st_size;
// map the whole file
filemm = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0);
}
}
FastaReference::~FastaReference(void) {
fclose(file);
if (usingmmap) {
munmap(filemm, filesize);
}
delete index;
}
string FastaReference::getSequence(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
int newlines_in_sequence = entry.length / entry.line_blen;
int seqlen = newlines_in_sequence + entry.length;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
if (usingmmap) {
memcpy(seq, (char*) filemm + entry.offset, seqlen);
} else {
fseek64(file, entry.offset, SEEK_SET);
fread(seq, sizeof(char), seqlen, file);
}
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
string s = seq;
free(seq);
s.resize((pend - pbegin)/sizeof(char));
return s;
}
// TODO cleanup; odd function. use a map
string FastaReference::sequenceNameStartingWith(string seqnameStart) {
try {
return (*index)[seqnameStart].name;
} catch (exception& e) {
cerr << e.what() << ": unable to find index entry for " << seqnameStart << endl;
exit(1);
}
}
string FastaReference::getSubSequence(string seqname, int start, int length) {
FastaIndexEntry entry = index->entry(seqname);
if (start < 0 || length < 1) {
cerr << "Error: cannot construct subsequence with negative offset or length < 1" << endl;
exit(1);
}
// we have to handle newlines
// approach: count newlines before start
// count newlines by end of read
// subtracting newlines before start find count of embedded newlines
int newlines_before = start > 0 ? (start - 1) / entry.line_blen : 0;
int newlines_by_end = (start + length - 1) / entry.line_blen;
int newlines_inside = newlines_by_end - newlines_before;
int seqlen = length + newlines_inside;
char* seq = (char*) calloc (seqlen + 1, sizeof(char));
if (usingmmap) {
memcpy(seq, (char*) filemm + entry.offset + newlines_before + start, seqlen);
} else {
fseek64(file, (off_t) (entry.offset + newlines_before + start), SEEK_SET);
fread(seq, sizeof(char), (off_t) seqlen, file);
}
seq[seqlen] = '\0';
char* pbegin = seq;
char* pend = seq + (seqlen/sizeof(char));
pend = remove(pbegin, pend, '\n');
pend = remove(pbegin, pend, '\0');
string s = seq;
free(seq);
s.resize((pend - pbegin)/sizeof(char));
return s;
}
long unsigned int FastaReference::sequenceLength(string seqname) {
FastaIndexEntry entry = index->entry(seqname);
return entry.length;
}
// ***************************************************************************
// FastaIndex.h (c) 2010 Erik Garrison <erik.garrison@bc.edu>
// Marth Lab, Department of Biology, Boston College
// All rights reserved.
// ---------------------------------------------------------------------------
// Last modified: 5 February 2010 (EG)
// ---------------------------------------------------------------------------
#ifndef _FASTA_H
#define _FASTA_H
#include <map>
#include <iostream>
#include <fstream>
#include <vector>
#include <stdint.h>
#include <stdio.h>
#include <algorithm>
#include "LargeFileSupport.h"
#include <sys/stat.h>
#include <sys/mman.h>
#include "split.h"
#include <stdlib.h>
#include <ctype.h>
#include <unistd.h>
using namespace std;
class FastaIndexEntry {
friend ostream& operator<<(ostream& output, const FastaIndexEntry& e);
public:
FastaIndexEntry(string name, int length, long long offset, int line_blen, int line_len);
FastaIndexEntry(void);
~FastaIndexEntry(void);
string name; // sequence name
int length; // length of sequence
long long offset; // bytes offset of sequence from start of file
int line_blen; // line length in bytes, sequence characters
int line_len; // line length including newline
void clear(void);
};
class FastaIndex : public map<string, FastaIndexEntry> {
friend ostream& operator<<(ostream& output, FastaIndex& i);
public:
FastaIndex(void);
~FastaIndex(void);
vector<string> sequenceNames;
void indexReference(string refName);
void readIndexFile(string fname);
void writeIndexFile(string fname);
ifstream indexFile;
FastaIndexEntry entry(string key);
void flushEntryToIndex(FastaIndexEntry& entry);
string indexFileExtension(void);
};
class FastaReference {
public:
void open(string reffilename, bool usemmap = false);
bool usingmmap;
string filename;
FastaReference(void) : usingmmap(false) { }
~FastaReference(void);
FILE* file;
void* filemm;
size_t filesize;
FastaIndex* index;
vector<FastaIndexEntry> findSequencesStartingWith(string seqnameStart);
string getSequence(string seqname);
// potentially useful for performance, investigate
// void getSequence(string seqname, string& sequence);
string getSubSequence(string seqname, int start, int length);
string sequenceNameStartingWith(string seqnameStart);
long unsigned int sequenceLength(string seqname);
};
#endif
#pragma once
#define _FILE_OFFSET_BITS 64
#ifdef WIN32
#define ftell64(a) _ftelli64(a)
#define fseek64(a,b,c) _fseeki64(a,b,c)
typedef __int64_t off_type;
#else
#define ftell64(a) ftello(a)
#define fseek64(a,b,c) fseeko(a,b,c)
typedef off_t off_type;
#endif
......@@ -10,7 +10,7 @@ INCLUDES = -I$(UTILITIES_DIR)/bedFile/ -I$(UTILITIES_DIR)/sequenceUtilities/ -I$
# ----------------------------------
# define our source and object files
# ----------------------------------
SOURCES= fastaFromBedMain.cpp fastaFromBed.cpp
SOURCES= fastaFromBedMain.cpp fastaFromBed.cpp Fasta.cpp split.cpp
OBJECTS= $(SOURCES:.cpp=.o)
_EXT_OBJECTS=bedFile.o sequenceUtils.o lineFileUtilities.o gzstream.o fileType.o
EXT_OBJECTS=$(patsubst %,$(OBJ_DIR)/%,$(_EXT_OBJECTS))
......
......@@ -57,38 +57,32 @@ Bed2Fa::~Bed2Fa(void) {
//******************************************************************************
// ReportDNA
//******************************************************************************
void Bed2Fa::ReportDNA(const BED &bed, const string &currDNA, const string &currChrom) {
if ( (bed.start <= currDNA.size()) && (bed.end <= currDNA.size()) ) {
string dna = currDNA.substr(bed.start, ((bed.end - bed.start)));
// revcomp if necessary. Thanks to Thomas Doktor.
if ((_useStrand == true) && (bed.strand == "-"))
reverseComplement(dna);
if (!(_useName)) {
if (_useFasta == true) {
if (_useStrand == true)
*_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl;
else
*_faOut << ">" << currChrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
}
else {
if (_useStrand == true)
*_faOut << currChrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
else
*_faOut << currChrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
}
void Bed2Fa::ReportDNA(const BED &bed, string &dna) {
// revcomp if necessary. Thanks to Thomas Doktor.
if ((_useStrand == true) && (bed.strand == "-"))
reverseComplement(dna);
if (!(_useName)) {
if (_useFasta == true) {
if (_useStrand == true)
*_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << endl << dna << endl;
else
*_faOut << ">" << bed.chrom << ":" << bed.start << "-" << bed.end << endl << dna << endl;
}
else {
if (_useFasta == true)
*_faOut << ">" << bed.name << endl << dna << endl;
if (_useStrand == true)
*_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "(" << bed.strand << ")" << "\t" << dna << endl;
else
*_faOut << bed.name << "\t" << dna << endl;
*_faOut << bed.chrom << ":" << bed.start << "-" << bed.end << "\t" << dna << endl;
}
}
else cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond "
<< currChrom << " size (" << currDNA.size() << " bp). Skipping." << endl;
else {
if (_useFasta == true)
*_faOut << ">" << bed.name << endl << dna << endl;
else
*_faOut << bed.name << "\t" << dna << endl;
}
}
......@@ -107,44 +101,26 @@ void Bed2Fa::ExtractDNA() {
exit (1);
}
// load the BED file into an unbinned map.
_bed->loadBedFileIntoMapNoBin();
//Read the fastaDb chromosome by chromosome
string fastaDbLine;
string currChrom;
string currDNA = "";
currDNA.reserve(500000000);
while (getline(faDb,fastaDbLine)) {
if (fastaDbLine.find(">",0) != 0 ) {
currDNA += fastaDbLine;
}
else {
if (currDNA.size() > 0) {
vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin();
vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end();
// loop through each BED entry for this chrom and print the sequence
for (; bedItr != bedEnd; ++bedItr) {
ReportDNA(*bedItr, currDNA, currChrom);
}
currDNA = "";
}
currChrom = fastaDbLine.substr(1, fastaDbLine.find_first_of(" ")-1);
}
}
// process the last chromosome in the fasta file.
if (currDNA.size() > 0) {
vector<BED>::const_iterator bedItr = _bed->bedMapNoBin[currChrom].begin();
vector<BED>::const_iterator bedEnd = _bed->bedMapNoBin[currChrom].end();
// loop through each BED entry for this chrom and print the sequence
for (; bedItr != bedEnd; ++bedItr) {
ReportDNA(*bedItr, currDNA, currChrom);
// open and memory-map genome file
FastaReference fr;
bool memmap = true;
fr.open(_dbFile, memmap);
BED bed, nullBed;
int lineNum = 0;
BedLineStatus bedStatus;
string sequence;
_bed->Open();
while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
if (bedStatus == BED_VALID) {
int length = bed.end - bed.start;
sequence = fr.getSubSequence(bed.chrom, bed.start, length);
ReportDNA(bed, sequence);
bed = nullBed;
}
currDNA = "";
}
_bed->Close();
}
......
......@@ -14,6 +14,7 @@
#include "bedFile.h"
#include "sequenceUtils.h"
#include "Fasta.h"
#include <vector>
#include <iostream>
#include <fstream>
......@@ -35,7 +36,7 @@ public:
~Bed2Fa(void);
void ExtractDNA();
void ReportDNA(const BED &bed, const string &currDNA, const string &currChrom);
void ReportDNA(const BED &bed, string &dna);
private:
......
#include "split.h"
std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
std::stringstream ss(s);
std::string item;
while(std::getline(ss, item, delim)) {
elems.push_back(item);
}
return elems;
}
std::vector<std::string> split(const std::string &s, char delim) {
std::vector<std::string> elems;
return split(s, delim, elems);
}
std::vector<std::string> &split(const std::string &s, const std::string& delims, std::vector<std::string> &elems) {
char* tok;
char cchars [s.size()+1];
char* cstr = &cchars[0];
strcpy(cstr, s.c_str());
tok = strtok(cstr, delims.c_str());
while (tok != NULL) {
elems.push_back(tok);
tok = strtok(NULL, delims.c_str());
}
return elems;
}
std::vector<std::string> split(const std::string &s, const std::string& delims) {
std::vector<std::string> elems;
return split(s, delims, elems);
}
#ifndef __SPLIT_H
#define __SPLIT_H
// functions to split a string by a specific delimiter
#include <string>
#include <vector>
#include <sstream>
#include <string.h>
// thanks to Evan Teran, http://stackoverflow.com/questions/236129/how-to-split-a-string/236803#236803
// split a string on a single delimiter character (delim)
std::vector<std::string>& split(const std::string &s, char delim, std::vector<std::string> &elems);
std::vector<std::string> split(const std::string &s, char delim);
// split a string on any character found in the string of delimiters (delims)
std::vector<std::string>& split(const std::string &s, const std::string& delims, std::vector<std::string> &elems);
std::vector<std::string> split(const std::string &s, const std::string& delims);
#endif
File added
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment