Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,1167 @@ +/***************************************************************************** + bedFile.h + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licensed under the GNU General Public License 2.0 license. +******************************************************************************/ +#ifndef BEDFILE_H +#define BEDFILE_H + +// "local" includes +#include "gzstream.h" +#include "lineFileUtilities.h" +#include "fileType.h" + +// standard includes +#include <vector> +#include <map> +#include <set> +#include <string> +#include <iostream> +#include <fstream> +#include <sstream> +#include <cstring> +#include <algorithm> +#include <limits.h> +#include <stdint.h> +#include <cstdio> +//#include <tr1/unordered_map> // Experimental. +using namespace std; + + +//************************************************* +// Data type tydedef +//************************************************* +typedef uint32_t CHRPOS; +typedef uint16_t BINLEVEL; +typedef uint32_t BIN; +typedef uint16_t USHORT; +typedef uint32_t UINT; + +//************************************************* +// Genome binning constants +//************************************************* + +const BIN _numBins = 37450; +const BINLEVEL _binLevels = 7; + +// bins range in size from 16kb to 512Mb +// Bin 0 spans 512Mbp, # Level 1 +// Bins 1-8 span 64Mbp, # Level 2 +// Bins 9-72 span 8Mbp, # Level 3 +// Bins 73-584 span 1Mbp # Level 4 +// Bins 585-4680 span 128Kbp # Level 5 +// Bins 4681-37449 span 16Kbp # Level 6 +const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; +//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; + +const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ +const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ + + +//************************************************* +// Common data structures +//************************************************* + +struct DEPTH { + UINT starts; + UINT ends; +}; + + +/* + Structure for regular BED records +*/ +struct BED { + + // Regular BED fields + string chrom; + CHRPOS start; + CHRPOS end; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; + + // experimental fields for the FJOIN approach. + bool zeroLength; + bool added; + bool finished; + // list of hits from another file. + vector<BED> overlaps; + +public: + // constructors + + // Null + BED() + : chrom(""), + start(0), + end(0), + name(""), + score(""), + strand(""), + otherFields(), + zeroLength(false), + added(false), + finished(false), + overlaps() + {} + + // BED3 + BED(string chrom, CHRPOS start, CHRPOS end) + : chrom(chrom), + start(start), + end(end), + name(""), + score(""), + strand(""), + otherFields(), + zeroLength(false), + added(false), + finished(false), + overlaps() + {} + + // BED4 + BED(string chrom, CHRPOS start, CHRPOS end, string strand) + : chrom(chrom), + start(start), + end(end), + name(""), + score(""), + strand(strand), + otherFields(), + zeroLength(false), + added(false), + finished(false), + overlaps() + {} + + // BED6 + BED(string chrom, CHRPOS start, CHRPOS end, string name, + string score, string strand) + : chrom(chrom), + start(start), + end(end), + name(name), + score(score), + strand(strand), + otherFields(), + zeroLength(false), + added(false), + finished(false), + overlaps() + {} + + // BEDALL + BED(string chrom, CHRPOS start, CHRPOS end, string name, + string score, string strand, vector<string> otherFields) + : chrom(chrom), + start(start), + end(end), + name(name), + score(score), + strand(strand), + otherFields(otherFields), + zeroLength(false), + added(false), + finished(false), + overlaps() + {} + + int size() { + return end-start; + } + +}; // BED + + +/* + Structure for each end of a paired BED record + mate points to the other end. +*/ +struct MATE { + BED bed; + int lineNum; + MATE *mate; +}; + + +/* + Structure for regular BED COVERAGE records +*/ +struct BEDCOV { + + string chrom; + + // Regular BED fields + CHRPOS start; + CHRPOS end; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; + + // flag a zero-length feature + bool zeroLength; + + // Additional fields specific to computing coverage + map<unsigned int, DEPTH> depthMap; + unsigned int count; + CHRPOS minOverlapStart; + + + public: + // constructors + // Null + BEDCOV() + : chrom(""), + start(0), + end(0), + name(""), + score(""), + strand(""), + otherFields(), + zeroLength(false), + depthMap(), + count(0), + minOverlapStart(0) + {} +}; + + +/* + Structure for BED COVERAGE records having lists of + multiple coverages +*/ +struct BEDCOVLIST { + + // Regular BED fields + string chrom; + CHRPOS start; + CHRPOS end; + string name; + string score; + string strand; + + // Add'l fields for BED12 and/or custom BED annotations + vector<string> otherFields; + + // flag a zero-length feature + bool zeroLength; + + // Additional fields specific to computing coverage + vector< map<unsigned int, DEPTH> > depthMapList; + vector<unsigned int> counts; + vector<CHRPOS> minOverlapStarts; + + + public: + // constructors + // Null + BEDCOVLIST() + : chrom(""), + start(0), + end(0), + name(""), + score(""), + strand(""), + otherFields(), + zeroLength(false), + depthMapList(), + counts(0), + minOverlapStarts(0) + {} +}; + + +// enum to flag the state of a given line in a BED file. +enum BedLineStatus +{ + BED_INVALID = -1, + BED_HEADER = 0, + BED_BLANK = 1, + BED_VALID = 2 +}; + +// enum to indicate the type of file we are dealing with +enum FileType +{ + BED_FILETYPE, + GFF_FILETYPE, + VCF_FILETYPE +}; + +//************************************************* +// Data structure typedefs +//************************************************* +typedef vector<BED> bedVector; +typedef vector<BEDCOV> bedCovVector; +typedef vector<MATE> mateVector; +typedef vector<BEDCOVLIST> bedCovListVector; + +typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; +typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; +typedef map<BIN, mateVector, std::less<BIN> > binsToMates; +typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; + +typedef map<string, binsToBeds, std::less<string> > masterBedMap; +typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; +typedef map<string, binsToMates, std::less<string> > masterMateMap; +typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; +typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; + + +// EXPERIMENTAL - wait for TR1 +// typedef vector<BED> bedVector; +// typedef vector<BEDCOV> bedCovVector; +// +// typedef tr1::unordered_map<BIN, bedVector> binsToBeds; +// typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs; +// +// typedef tr1::unordered_map<string, binsToBeds> masterBedMap; +// typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap; +// typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin; + + + +// return the genome "bin" for a feature with this start and end +inline +BIN getBin(CHRPOS start, CHRPOS end) { + --end; + start >>= _binFirstShift; + end >>= _binFirstShift; + + for (register short i = 0; i < _binLevels; ++i) { + if (start == end) return _binOffsetsExtended[i] + start; + start >>= _binNextShift; + end >>= _binNextShift; + } + cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; + return 0; +} + +/**************************************************** +// isInteger(s): Tests if string s is a valid integer +*****************************************************/ +inline bool isInteger(const std::string& s) { + int len = s.length(); + for (int i = 0; i < len; i++) { + if (!std::isdigit(s[i])) return false; + } + return true; +} + + +// return the amount of overlap between two features. Negative if none and the the +// number of negative bases is the distance between the two. +inline +int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { + return min(aE, bE) - max(aS, bS); +} + +// is A after (to the right of) B? +inline +bool after(const BED &a, const BED &b) { + return (a.start >= b.end); +} + + +// Ancillary functions +void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); + + +// BED Sorting Methods +bool sortByChrom(const BED &a, const BED &b); +bool sortByStart(const BED &a, const BED &b); +bool sortBySizeAsc(const BED &a, const BED &b); +bool sortBySizeDesc(const BED &a, const BED &b); +bool sortByScoreAsc(const BED &a, const BED &b); +bool sortByScoreDesc(const BED &a, const BED &b); +bool byChromThenStart(BED const &a, BED const &b); + + + +//************************************************ +// BedFile Class methods and elements +//************************************************ +class BedFile { + +public: + + // Constructor + BedFile(string &); + + // Destructor + ~BedFile(void); + + // Open a BED file for reading (creates an istream pointer) + void Open(void); + + // Rewind the pointer back to the beginning of the file + void Rewind(void); + + // Jump to a specific byte in the file + void Seek(unsigned long offset); + + bool Empty(); + + // Close an opened BED file. + void Close(void); + + // Get the next BED entry in an opened BED file. + BedLineStatus GetNextBed (BED &bed, int &lineNum, bool forceSorted = false); + + // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file + // NOTE: assumes input file is sorted by chrom then start + bool GetNextMergedBed(BED &merged_bed, int &lineNum); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs + void loadBedFileIntoMap(); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs + void loadBedCovFileIntoMap(); + + // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs + void loadBedCovListFileIntoMap(); + + // load a BED file into a map keyed by chrom. value is vector of BEDs + void loadBedFileIntoMapNoBin(); + + // Given a chrom, start, end and strand for a single feature, + // search for all overlapping features in another BED file. + // Searches through each relevant genome bin on the same chromosome + // as the single feature. Note: Adapted from kent source "binKeeperFind" + void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool sameStrand, bool diffStrand); + + // return true if at least one overlap was found. otherwise, return false. + bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool sameStrand, bool diffStrand, float overlapFraction = 0.0); + + // return true if at least one __reciprocal__ overlap was found. otherwise, return false. + bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, + bool sameStrand, bool diffStrand, float overlapFraction = 0.0); + + // Given a chrom, start, end and strand for a single feature, + // increment a the number of hits for each feature in B file + // that the feature overlaps + void countHits(const BED &a, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false); + + // same as above, but has special logic that processes a set of + // BED "blocks" from a single entry so as to avoid over-counting + // each "block" of a single BAM/BED12 as distinct coverage. That is, + // if one read has four block, we only want to count the coverage as + // coming from one read, not four. + void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false); + + // Given a chrom, start, end and strand for a single feature, + // increment a the number of hits for each feature in B file + // that the feature overlaps + void countListHits(const BED &a, int index, bool sameStrand, bool diffStrand); + + // the bedfile with which this instance is associated + string bedFile; + unsigned int bedType; // 3-6, 12 for BED + // 9 for GFF + bool isZeroBased; + + // Main data structires used by BEDTools + masterBedCovMap bedCovMap; + masterBedCovListMap bedCovListMap; + masterBedMap bedMap; + masterBedMapNoBin bedMapNoBin; + +private: + + // data + bool _isGff; + bool _isVcf; + bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF) + FileType _fileType; // what is the file type? (BED? GFF? VCF?) + istream *_bedStream; + string _bedLine; + vector<string> _bedFields; + int _merged_start; + int _merged_end; + string _merged_chrom; + int _prev_start; + string _prev_chrom; + + void setZeroBased(bool zeroBased); + void setGff (bool isGff); + void setVcf (bool isVcf); + void setFileType (FileType type); + void setBedType (int colNums); + + /****************************************************** + Private definitions to circumvent linker issues with + templated member functions. + *******************************************************/ + + /* + parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { + + //char *p2End, *p3End, *p4End, *p5End; + //long l2, l3, l4, l5; + unsigned int numFields = lineVector.size(); + + // bail out if we have a blank line + if (numFields == 0) { + return BED_BLANK; + } + + if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { + + if (numFields >= 3) { + // line parsing for all lines after the first non-header line + if (_typeIsKnown == true) { + switch(_fileType) { + case BED_FILETYPE: + if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + case VCF_FILETYPE: + if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + case GFF_FILETYPE: + if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + default: + printf("ERROR: file type encountered. Exiting\n"); + exit(1); + } + } + // line parsing for first non-header line: figure out file contents + else { + // it's BED format if columns 2 and 3 are integers + if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { + setGff(false); + setZeroBased(true); + setFileType(BED_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + } + // it's VCF, assuming the second column is numeric and there are at least 8 fields. + else if (isInteger(lineVector[1]) && numFields >= 8) { + setGff(false); + setVcf(true); + setZeroBased(false); + setFileType(VCF_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + } + // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. + else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { + setGff(true); + setZeroBased(false); + setFileType(GFF_FILETYPE); + setBedType(numFields); // we now expect numFields columns in each line + if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; + } + else { + cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << + "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; + exit(1); + } + } + } + else { + cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; + exit(1); + } + } + else { + lineNum--; + return BED_HEADER; + } + // default + return BED_INVALID; + } + + + /* + parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { + + // process as long as the number of fields in this + // line matches what we expect for this file. + if (numFields == this->bedType) { + bed.chrom = lineVector[0]; + int i; + i = atoi(lineVector[1].c_str()); + if (i<0) { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + bed.start = (CHRPOS)i; + i = atoi(lineVector[2].c_str()); + if (i<0) { + cerr << "Error: malformed BED entry at line " << lineNum << ". End Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + bed.end = (CHRPOS)i; + + // handle starts == end (e.g., insertions in reference genome) + if (bed.start == bed.end) { + bed.start--; + bed.end++; + bed.zeroLength = true; + } + + if (this->bedType == 4) { + bed.name = lineVector[3]; + } + else if (this->bedType == 5) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + } + else if (this->bedType == 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5]; + } + else if (this->bedType > 6) { + bed.name = lineVector[3]; + bed.score = lineVector[4]; + bed.strand = lineVector[5]; + for (unsigned int i = 6; i < lineVector.size(); ++i) { + bed.otherFields.push_back(lineVector[i]); + } + } + else if (this->bedType != 3) { + cerr << "Error: unexpected number of fields at line: " << lineNum + << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + + // sanity checks. + if (bed.start <= bed.end) { + return true; + } + else { + cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + } + else if (numFields == 1) { + cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((numFields < 3) && (numFields != 0)) { + cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + + /* + parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { + if (numFields == this->bedType) { + bed.chrom = lineVector[0]; + bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based + bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. + bed.strand = "+"; + // construct the name from the ref and alt alleles. + // if it's an annotated variant, add the rsId as well. + bed.name = lineVector[3] + "/" + lineVector[4]; + if (lineVector[2] != ".") { + bed.name += "_" + lineVector[2]; + } + + if (this->bedType > 2) { + for (unsigned int i = 2; i < numFields; ++i) + bed.otherFields.push_back(lineVector[i]); + } + + if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { + return true; + } + else if (bed.start > bed.end) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + else if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; + exit(1); + } + } + else if (numFields == 1) { + cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((numFields < 2) && (numFields != 0)) { + cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + + + /* + parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) + */ + template <typename T> + inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { + if (numFields == this->bedType) { + if (this->bedType >= 8 && _isGff) { + bed.chrom = lineVector[0]; + if (isInteger(lineVector[3])) + bed.start = atoi(lineVector[3].c_str()); + if (isInteger(lineVector[4])) + bed.end = atoi(lineVector[4].c_str()); + bed.name = lineVector[2]; + bed.score = lineVector[5]; + bed.strand = lineVector[6].c_str(); + bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED + bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED + // handle the optional 9th field. + if (this->bedType == 9) + bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED + bed.start--; + } + else { + cerr << "Error: unexpected number of fields at line: " << lineNum << + ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl; + exit(1); + } + if (bed.start > bed.end) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; + exit(1); + } + if ( (bed.start < 0) || (bed.end < 0) ) { + cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; + exit(1); + } + return true; + } + else if (numFields == 1) { + cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; + exit(1); + } + else if ((numFields != this->bedType) && (numFields != 0)) { + cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; + exit(1); + } + else if ((numFields < 8) && (numFields != 0)) { + cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; + exit(1); + } + return false; + } + + +public: + + /* + reportBedTab + + Writes the _original_ BED entry with a TAB + at the end of the line. + Works for BED3 - BED6. + */ + template <typename T> + inline void reportBedTab(const T &bed) { + + // if it is azeroLength feature, we need to + // correct the start and end coords to what they were + // in the original file + CHRPOS start = bed.start; + CHRPOS end = bed.end; + if (bed.zeroLength) { + if (_isGff == false) + start++; + end--; + } + + // BED + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + // GFF + else if (_isGff == true) { + // "GFF-8" + if (this->bedType == 8) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str()); + } + // "GFF-9" + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + } + + + + /* + reportBedNewLine + + Writes the _original_ BED entry with a NEWLINE + at the end of the line. + Works for BED3 - BED6. + */ + template <typename T> + inline void reportBedNewLine(const T &bed) { + + // if it is azeroLength feature, we need to + // correct the start and end coords to what they were + // in the original file + CHRPOS start = bed.start; + CHRPOS end = bed.end; + if (bed.zeroLength) { + if (_isGff == false) + start++; + end--; + } + + //BED + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\n"); + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + printf("\n"); + } + // GFF + else if (_isGff == true) { + // "GFF-8" + if (this->bedType == 8) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str()); + } + // "GFF-9" + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + } + + + + /* + reportBedRangeNewLine + + Writes a custom start->end for a BED entry + with a NEWLINE at the end of the line. + + Works for BED3 - BED6. + */ + template <typename T> + inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { + + // if it is azeroLength feature, we need to + // correct the start and end coords to what they were + // in the original file + if (bed.zeroLength) { + start = bed.start + 1; + end = bed.end - 1; + } + // BED + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + } + // GFF + else if (_isGff == true) { + // "GFF-8" + if (this->bedType == 8) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str()); + } + // "GFF-9" + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + } + + + + /* + reportBedRangeTab + + Writes a custom start->end for a BED entry + with a TAB at the end of the line. + + Works for BED3 - BED6. + */ + template <typename T> + inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { + + // if it is azeroLength feature, we need to + // correct the start and end coords to what they were + // in the original file + if (bed.zeroLength) { + start = bed.start + 1; + end = bed.end - 1; + } + // BED + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); + } + else if (this->bedType == 4) { + printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); + } + else if (this->bedType == 5) { + printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str()); + } + else if (this->bedType == 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + } + else if (this->bedType > 6) { + printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), + bed.score.c_str(), bed.strand.c_str()); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("\t%s", othIt->c_str()); + } + printf("\n"); + } + } + // VCF + else if (_isGff == false && _isVcf == true) { + printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); + + vector<string>::const_iterator othIt = bed.otherFields.begin(); + vector<string>::const_iterator othEnd = bed.otherFields.end(); + for ( ; othIt != othEnd; ++othIt) { + printf("%s\t", othIt->c_str()); + } + printf("\n"); + } + // GFF + else if (_isGff == true) { + // "GFF-9" + if (this->bedType == 8) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str()); + } + // "GFF-8" + else if (this->bedType == 9) { + printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), + bed.name.c_str(), start+1, end, + bed.score.c_str(), bed.strand.c_str(), + bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); + } + } + } + + + /* + reportNullBedTab + */ + void reportNullBedTab() { + + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\t"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\t"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\t"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t.\t"); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf(".\t"); + } + } + } + else if (_isGff == true && _isVcf == false) { + if (this->bedType == 8) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t"); + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); + } + } + } + + + /* + reportNullBedTab + */ + void reportNullBedNewLine() { + + if (_isGff == false && _isVcf == false) { + if (this->bedType == 3) { + printf (".\t-1\t-1\n"); + } + else if (this->bedType == 4) { + printf (".\t-1\t-1\t.\n"); + } + else if (this->bedType == 5) { + printf (".\t-1\t-1\t.\t-1\n"); + } + else if (this->bedType == 6) { + printf (".\t-1\t-1\t.\t-1\t.\n"); + } + else if (this->bedType > 6) { + printf (".\t-1\t-1\t.\t-1\t."); + for (unsigned int i = 6; i < this->bedType; ++i) { + printf("\t."); + } + printf("\n"); + } + } + else if (_isGff == true && _isVcf == false) { + if (this->bedType == 8) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n"); + } + else if (this->bedType == 9) { + printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); + } + } + } + + +}; + +#endif /* BEDFILE_H */