// 
//  bedFilePE.cpp
//  BEDTools
//  
//  Created by Aaron Quinlan Spring 2009.
//  Copyright 2009 Aaron Quinlan. All rights reserved.
//
//  Summary:  Contains common functions for finding BED overlaps.
//
//  Acknowledgments: Much of the code herein is taken from Jim Kent's
//                   BED processing code.  I am grateful for his elegant
//					 genome binning algorithm and therefore use it extensively.


#include "bedFilePE.h"


// Constructor
BedFilePE::BedFilePE(string &bedFile) {
	this->bedFile = bedFile;
}

// Destructor
BedFilePE::~BedFilePE(void) {
}

void BedFilePE::Open(void) {
	if (bedFile == "stdin") {
		_bedStream = &cin;
	}
	else {
		size_t foundPos;
	  	foundPos = bedFile.find_last_of(".gz");
		// is this a GZIPPED BED file?
		if (foundPos == bedFile.size() - 1) {
			igzstream beds(bedFile.c_str(), ios::in);
			if ( !beds ) {
				cerr << "Error: The requested bedpe file (" << bedFile << ") could not be opened. Exiting!" << endl;
				exit (1);
			}
			else {
				// if so, close it (this was just a test)
				beds.close();		
				// now set a pointer to the stream so that we
				// can read the file later on.
				// Thank God for Josuttis, p. 631!
				_bedStream = new igzstream(bedFile.c_str(), ios::in);
			}
		}  
		// not GZIPPED.
		else {
		
			ifstream beds(bedFile.c_str(), ios::in);
			// can we open the file?
			if ( !beds ) {
				cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl;
				exit (1);
			}
			else {
				// if so, close it (this was just a test)
				beds.close();		
				// now set a pointer to the stream so that we
				// can read the file later on.
				// Thank God for Josuttis, p. 631!
				_bedStream = new ifstream(bedFile.c_str(), ios::in);
			}
		}
	}
}


// Close the BEDPE file
void BedFilePE::Close(void) {
	if (bedFile != "stdin") delete _bedStream;
}


BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) {

	// make sure there are still lines to process.
	// if so, tokenize, validate and return the BEDPE entry.
	if (_bedStream->good()) {
		string bedPELine;
		vector<string> bedPEFields;
		bedPEFields.reserve(10);
		
		// parse the bedStream pointer
		getline(*_bedStream, bedPELine);
		lineNum++;

		// split into a string vector.
		Tokenize(bedPELine,bedPEFields);
		
		// load the BEDPE struct as long as it's a valid BEDPE entry.
		return parseLine(bedpe, bedPEFields, lineNum);
	}
	// default if file is closed or EOF
	return BED_INVALID;
}


/*
	reportBedPETab
	
	Writes the _original_ BED entry for A.
	Works for BEDPE only.
*/
void BedFilePE::reportBedPETab(const BEDPE &a) {

	if (this->bedType == 6) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2);
	}
	else if (this->bedType == 7) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str());
	}	
	else if (this->bedType == 8) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str());
	}
	else if (this->bedType == 10) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
	}
	else if (this->bedType > 10) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
											
		vector<string>::const_iterator othIt = a.otherFields.begin(); 
		vector<string>::const_iterator othEnd = a.otherFields.end(); 
		for ( ; othIt != othEnd; ++othIt) {
			printf("\t%s", othIt->c_str());
		}
		printf("\t");
	}
}



/*
	reportBedPENewLine
	
	Writes the _original_ BED entry for A.
	Works for BEDPE only.
*/
void BedFilePE::reportBedPENewLine(const BEDPE &a) {

	if (this->bedType == 6) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2);
	}
	else if (this->bedType == 7) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str());
	}	
	else if (this->bedType == 8) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str());
	}
	else if (this->bedType == 10) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
	}
	else if (this->bedType > 10) {
		printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1,
		 									a.chrom2.c_str(), a.start2, a.end2,
											a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
											
		vector<string>::const_iterator othIt  = a.otherFields.begin(); 
		vector<string>::const_iterator othEnd = a.otherFields.end(); 
		for ( ; othIt != othEnd; ++othIt) {
			printf("\t%s", othIt->c_str());
		}
		printf("\n");
	}
}


BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) {

	// bail out if we have a blank line
	if (lineVector.empty()) 
		return BED_BLANK;

	if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
		// we need at least 6 columns
		if (lineVector.size() >= 6) {
			if (parseBedPELine(bedpe, lineVector, lineNum) == true)
				return BED_VALID;
			else return BED_INVALID;
		}
		else  {
			cerr << "It looks as though you have less than 6 columns.  Are you sure your files are tab-delimited?" << endl;
			exit(1);
		}
	}
	else {
		lineNum--;
		return BED_HEADER;	
	}

	// default
	return BED_INVALID;
}


bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) {

	if ((lineNum == 1) && (lineVector.size() >= 6)) {

		this->bedType = lineVector.size();

		if (this->bedType == 6) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			return true;
		}
		else if (this->bedType == 7) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			return true;
		}	
		else if (this->bedType == 8) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();
			return true;
		}	
		else if (this->bedType == 10) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();

			bed.strand1 = lineVector[8];
			bed.strand2 = lineVector[9];

			return true;
		}
		else if (this->bedType > 10) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();

			bed.strand1 = lineVector[8];
			bed.strand2 = lineVector[9];
			
			for (unsigned int i = 10; i < lineVector.size(); ++i) {
				bed.otherFields.push_back(lineVector[i]); 
			}
			return true;
		}
		else {
			cerr << "Unexpected number of fields: " << lineNum << ".  Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields.  Exiting..." << endl;
			exit(1);
		}
		
		if (bed.start1 > bed.end1) {
			cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
			return false;
		}
		else if (bed.start2 > bed.end2) {
			cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
			return false;
		}
		else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) {
			cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl;
			return false;
		}
	}
	else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) {

		if (this->bedType == 6) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			return true;
		}
		else if (this->bedType == 7) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			return true;
		}	
		else if (this->bedType == 8) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();
			return true;
		}	
		else if (this->bedType == 10) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();

			bed.strand1 = lineVector[8];
			bed.strand2 = lineVector[9];

			return true;
		}
		else if (this->bedType > 10) {
			bed.chrom1 = lineVector[0];
			bed.start1 = atoi(lineVector[1].c_str());
			bed.end1 = atoi(lineVector[2].c_str());
						
			bed.chrom2 = lineVector[3];
			bed.start2 = atoi(lineVector[4].c_str());
			bed.end2 = atoi(lineVector[5].c_str());

			bed.name = lineVector[6];
			bed.score = lineVector[7].c_str();

			bed.strand1 = lineVector[8];
			bed.strand2 = lineVector[9];
			
			for (unsigned int i = 10; i < lineVector.size(); ++i) {
				bed.otherFields.push_back(lineVector[i]); 
			}
			return true;
		}
		else {
			cerr << "Unexpected number of fields: " << lineNum << ".  Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields.  Exiting..." << endl;
			exit(1);
		}

		if (bed.start1 > bed.end1) {
			cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
			return false;
		}
		else if (bed.start2 > bed.end2) {
			cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
			return false;
		}
		else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) {
			cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl;
			return false;
		}
	}
	else if (lineVector.size() == 1) {
		cerr << "Only one BED field detected: " << lineNum << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
		exit(1);		
	}
	else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) {
		cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ".  Exiting..." << endl;
		exit(1);
	}
	else if ((lineVector.size() < 6) && (lineVector.size() != 0)) {
		cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ".  Exiting..." << endl;
		exit(1);
	}
	return false;
}


/*
	Adapted from kent source "binKeeperFind"
*/
void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string strand, vector<BEDCOV> &hits, bool forceStrand) {

	int startBin, endBin;
	startBin = (start >> _binFirstShift);
	endBin = ((end-1) >> _binFirstShift);

	// loop through each bin "level" in the binning hierarchy
	for (int i = 0; i < _binLevels; ++i) {
		
		// loop through each bin at this level of the hierarchy
		int offset = _binOffsetsExtended[i];
		for (int j = (startBin+offset); j <= (endBin+offset); ++j)  {
			
			// loop through each feature in this chrom/bin and see if it overlaps
			// with the feature that was passed in.  if so, add the feature to 
			// the list of hits.
			vector<BEDCOV>::const_iterator bedItr;
			vector<BEDCOV>::const_iterator bedEnd;
			if (bEnd == 1) {
				bedItr = bedMapEnd1[chrom][j].begin();
				bedEnd = bedMapEnd1[chrom][j].end();
			}
			else if (bEnd == 2) {
				bedItr = bedMapEnd2[chrom][j].begin();
				bedEnd = bedMapEnd2[chrom][j].end();				
			}
			else {
				cerr << "Unexpected end of B requested" << endl;
			}
			for (; bedItr != bedEnd; ++bedItr) {
				
				// skip the hit if not on the same strand (and we care)
				if (forceStrand && (strand != bedItr->strand)) {
					continue;
				}
				else if (overlaps(bedItr->start, bedItr->end, start, end) > 0) {
					hits.push_back(*bedItr);	// it's a hit, add it.
				}
							
			}
		}
		startBin >>= _binNextShift;
		endBin >>= _binNextShift;
	}
}


void BedFilePE::loadBedPEFileIntoMap() {

	int lineNum = 0;
	int bin1, bin2;
	BedLineStatus bedStatus;
	BEDPE bedpeEntry, nullBedPE;

	Open();	
	bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);	
	while (bedStatus != BED_INVALID) {
		
		if (bedStatus == BED_VALID) {
			BEDCOV bedEntry1, bedEntry2;
			// separate the BEDPE entry into separate
			// BED entries
			splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2);

			// load end1 into a UCSC bin map
			bin1 = getBin(bedEntry1.start, bedEntry1.end);
			this->bedMapEnd1[bedEntry1.chrom][bin1].push_back(bedEntry1);	
		
			// load end2 into a UCSC bin map
			bin2 = getBin(bedEntry2.start, bedEntry2.end);
			this->bedMapEnd2[bedEntry2.chrom][bin2].push_back(bedEntry2);

			bedpeEntry = nullBedPE;
		}
		bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
	}
	Close();
}


void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, BEDCOV &bedEntry1, BEDCOV &bedEntry2) {
	
	/* 
	   Split the BEDPE entry into separate BED entries
	
	   NOTE: I am using a trick here where I store
	   the lineNum of the BEDPE from the original file
	   in the "count" column.  This allows me to later
	   resolve whether the hits found on both ends of BEDPE A
	   came from the same entry in BEDPE B.  Tracking by "name"
	   alone with fail when there are multiple mappings for a given
	   read-pair.
	*/
	
	bedEntry1.chrom           = bedpeEntry.chrom1;
	bedEntry1.start           = bedpeEntry.start1;
	bedEntry1.end             = bedpeEntry.end1;
	bedEntry1.name            = bedpeEntry.name;
	bedEntry1.score           = bedpeEntry.score;
	bedEntry1.strand          = bedpeEntry.strand1;
	bedEntry1.count           = lineNum;
	bedEntry1.minOverlapStart = INT_MAX;
	
	bedEntry2.chrom           = bedpeEntry.chrom2;
	bedEntry2.start           = bedpeEntry.start2;
	bedEntry2.end             = bedpeEntry.end2;
	bedEntry2.name            = bedpeEntry.name;
	bedEntry2.score           = bedpeEntry.score;
	bedEntry2.strand          = bedpeEntry.strand2;		
	bedEntry2.count           = lineNum;
	bedEntry2.minOverlapStart = INT_MAX;
}