Mercurial > repos > aaronquinlan > multi_intersect

diff BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h @ 0:dfcd8b6c1bda
Uploaded
author: aaronquinlan
date: Thu, 03 Nov 2011 10:25:04 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,1167 @@
+/*****************************************************************************
+  bedFile.h
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licensed under the GNU General Public License 2.0 license.
+******************************************************************************/
+#ifndef BEDFILE_H
+#define BEDFILE_H
+
+// "local" includes
+#include "gzstream.h"
+#include "lineFileUtilities.h"
+#include "fileType.h"
+
+// standard includes
+#include <vector>
+#include <map>
+#include <set>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <cstring>
+#include <algorithm>
+#include <limits.h>
+#include <stdint.h>
+#include <cstdio>
+//#include <tr1/unordered_map>  // Experimental.
+using namespace std;
+
+
+//*************************************************
+// Data type tydedef
+//*************************************************
+typedef uint32_t CHRPOS;
+typedef uint16_t BINLEVEL;
+typedef uint32_t BIN;
+typedef uint16_t USHORT;
+typedef uint32_t UINT;
+
+//*************************************************
+// Genome binning constants
+//*************************************************
+
+const BIN      _numBins   = 37450;
+const BINLEVEL _binLevels = 7;
+
+// bins range in size from 16kb to 512Mb
+// Bin  0          spans 512Mbp,   # Level 1
+// Bins 1-8        span 64Mbp,     # Level 2
+// Bins 9-72       span 8Mbp,      # Level 3
+// Bins 73-584     span 1Mbp       # Level 4
+// Bins 585-4680   span 128Kbp     # Level 5
+// Bins 4681-37449 span 16Kbp      # Level 6
+const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
+//const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
+
+const USHORT _binFirstShift = 14;       /* How much to shift to get to finest bin. */
+const USHORT _binNextShift  = 3;        /* How much to shift to get to next larger bin. */
+
+
+//*************************************************
+// Common data structures
+//*************************************************
+
+struct DEPTH {
+    UINT starts;
+    UINT ends;
+};
+
+
+/*
+    Structure for regular BED records
+*/
+struct BED {
+
+    // Regular BED fields
+    string chrom;
+    CHRPOS start;
+    CHRPOS end;
+    string name;
+    string score;
+    string strand;
+
+    // Add'l fields for BED12 and/or custom BED annotations
+    vector<string> otherFields;
+
+    // experimental fields for the FJOIN approach.
+    bool   zeroLength;
+    bool   added;
+    bool   finished;
+    // list of hits from another file.
+    vector<BED> overlaps;
+
+public:
+    // constructors
+
+    // Null
+    BED()
+    : chrom(""),
+      start(0),
+      end(0),
+      name(""),
+      score(""),
+      strand(""),
+      otherFields(),
+      zeroLength(false),
+      added(false),
+      finished(false),
+      overlaps()
+    {}
+
+    // BED3
+    BED(string chrom, CHRPOS start, CHRPOS end)
+    : chrom(chrom),
+      start(start),
+      end(end),
+      name(""),
+      score(""),
+      strand(""),
+      otherFields(),
+      zeroLength(false),
+      added(false),
+      finished(false),
+      overlaps()
+    {}
+
+    // BED4
+    BED(string chrom, CHRPOS start, CHRPOS end, string strand)
+    : chrom(chrom),
+      start(start),
+      end(end),
+      name(""),
+      score(""),
+      strand(strand),
+      otherFields(),
+      zeroLength(false),
+      added(false),
+      finished(false),
+      overlaps()
+    {}
+
+    // BED6
+    BED(string chrom, CHRPOS start, CHRPOS end, string name,
+        string score, string strand)
+    : chrom(chrom),
+      start(start),
+      end(end),
+      name(name),
+      score(score),
+      strand(strand),
+      otherFields(),
+      zeroLength(false),
+      added(false),
+      finished(false),
+      overlaps()
+    {}
+
+    // BEDALL
+    BED(string chrom, CHRPOS start, CHRPOS end, string name,
+        string score, string strand, vector<string> otherFields)
+    : chrom(chrom),
+      start(start),
+      end(end),
+      name(name),
+      score(score),
+      strand(strand),
+      otherFields(otherFields),
+      zeroLength(false),
+      added(false),
+      finished(false),
+      overlaps()
+    {}
+    
+    int size() {
+        return end-start;
+    }
+
+}; // BED
+
+
+/*
+    Structure for each end of a paired BED record
+    mate points to the other end.
+*/
+struct MATE {
+    BED bed;
+    int lineNum;
+    MATE *mate;
+};
+
+
+/*
+    Structure for regular BED COVERAGE records
+*/
+struct BEDCOV {
+
+    string chrom;
+    
+    // Regular BED fields
+    CHRPOS start;
+    CHRPOS end;
+    string name;
+    string score;
+    string strand;
+
+    // Add'l fields for BED12 and/or custom BED annotations
+    vector<string> otherFields;
+
+    // flag a zero-length feature
+    bool   zeroLength;
+    
+    // Additional fields specific to computing coverage
+    map<unsigned int, DEPTH> depthMap;
+    unsigned int count;
+    CHRPOS minOverlapStart;
+    
+    
+    public:
+    // constructors
+    // Null
+    BEDCOV()
+    : chrom(""),
+      start(0),
+      end(0),
+      name(""),
+      score(""),
+      strand(""),
+      otherFields(),
+      zeroLength(false),
+      depthMap(),
+      count(0),
+      minOverlapStart(0)
+    {}
+};
+
+
+/*
+    Structure for BED COVERAGE records having lists of
+    multiple coverages
+*/
+struct BEDCOVLIST {
+
+    // Regular BED fields
+    string chrom;
+    CHRPOS start;
+    CHRPOS end;
+    string name;
+    string score;
+    string strand;
+
+    // Add'l fields for BED12 and/or custom BED annotations
+    vector<string> otherFields;
+    
+    // flag a zero-length feature
+    bool   zeroLength;
+
+    // Additional fields specific to computing coverage
+    vector< map<unsigned int, DEPTH> > depthMapList;
+    vector<unsigned int> counts;
+    vector<CHRPOS> minOverlapStarts;
+    
+    
+    public:
+    // constructors
+    // Null
+    BEDCOVLIST()
+    : chrom(""),
+      start(0),
+      end(0),
+      name(""),
+      score(""),
+      strand(""),
+      otherFields(),
+      zeroLength(false),
+      depthMapList(),
+      counts(0),
+      minOverlapStarts(0)
+    {}
+};
+
+
+// enum to flag the state of a given line in a BED file.
+enum BedLineStatus
+{
+    BED_INVALID = -1,
+    BED_HEADER  = 0,
+    BED_BLANK   = 1,
+    BED_VALID   = 2
+};
+
+// enum to indicate the type of file we are dealing with
+enum FileType
+{
+    BED_FILETYPE,
+    GFF_FILETYPE,
+    VCF_FILETYPE
+};
+
+//*************************************************
+// Data structure typedefs
+//*************************************************
+typedef vector<BED>    bedVector;
+typedef vector<BEDCOV> bedCovVector;
+typedef vector<MATE> mateVector;
+typedef vector<BEDCOVLIST> bedCovListVector;
+
+typedef map<BIN, bedVector,    std::less<BIN> > binsToBeds;
+typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs;
+typedef map<BIN, mateVector, std::less<BIN> > binsToMates;
+typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists;
+
+typedef map<string, binsToBeds, std::less<string> >    masterBedMap;
+typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap;
+typedef map<string, binsToMates, std::less<string> > masterMateMap;
+typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap;
+typedef map<string, bedVector, std::less<string> >     masterBedMapNoBin;
+
+
+// EXPERIMENTAL - wait for TR1
+// typedef vector<BED>    bedVector;
+// typedef vector<BEDCOV> bedCovVector;
+//
+// typedef tr1::unordered_map<BIN, bedVector> binsToBeds;
+// typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs;
+//
+// typedef tr1::unordered_map<string, binsToBeds>    masterBedMap;
+// typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap;
+// typedef tr1::unordered_map<string, bedVector>     masterBedMapNoBin;
+
+
+
+// return the genome "bin" for a feature with this start and end
+inline
+BIN getBin(CHRPOS start, CHRPOS end) {
+    --end;
+    start >>= _binFirstShift;
+    end   >>= _binFirstShift;
+
+    for (register short i = 0; i < _binLevels; ++i) {
+        if (start == end) return _binOffsetsExtended[i] + start;
+        start >>= _binNextShift;
+        end   >>= _binNextShift;
+    }
+    cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl;
+    return 0;
+}
+
+/****************************************************
+// isInteger(s): Tests if string s is a valid integer
+*****************************************************/
+inline bool isInteger(const std::string& s) {
+    int len = s.length();
+    for (int i = 0; i < len; i++) {
+        if (!std::isdigit(s[i])) return false;
+    }
+    return true;
+}
+
+
+// return the amount of overlap between two features.  Negative if none and the the
+// number of negative bases is the distance between the two.
+inline
+int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
+    return min(aE, bE) - max(aS, bS);
+}
+
+// is A after (to the right of) B?
+inline
+bool after(const BED &a, const BED &b) {
+    return (a.start >= b.end);
+}
+
+
+// Ancillary functions
+void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks);
+
+
+// BED Sorting Methods
+bool sortByChrom(const BED &a, const BED &b);
+bool sortByStart(const BED &a, const BED &b);
+bool sortBySizeAsc(const BED &a, const BED &b);
+bool sortBySizeDesc(const BED &a, const BED &b);
+bool sortByScoreAsc(const BED &a, const BED &b);
+bool sortByScoreDesc(const BED &a, const BED &b);
+bool byChromThenStart(BED const &a, BED const &b);
+
+
+
+//************************************************
+// BedFile Class methods and elements
+//************************************************
+class BedFile {
+
+public:
+
+    // Constructor
+    BedFile(string &);
+
+    // Destructor
+    ~BedFile(void);
+
+    // Open a BED file for reading (creates an istream pointer)
+    void Open(void);
+    
+    // Rewind the pointer back to the beginning of the file
+    void Rewind(void);
+
+    // Jump to a specific byte in the file
+    void Seek(unsigned long offset);
+    
+    bool Empty();
+
+    // Close an opened BED file.
+    void Close(void);
+
+    // Get the next BED entry in an opened BED file.
+    BedLineStatus GetNextBed (BED &bed, int &lineNum, bool forceSorted = false);
+    
+    // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file
+    // NOTE: assumes input file is sorted by chrom then start
+    bool GetNextMergedBed(BED &merged_bed, int &lineNum);
+
+    // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs
+    void loadBedFileIntoMap();
+
+    // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs
+    void loadBedCovFileIntoMap();
+
+    // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs
+    void loadBedCovListFileIntoMap();
+
+    // load a BED file into a map keyed by chrom. value is vector of BEDs
+    void loadBedFileIntoMapNoBin();
+
+    // Given a chrom, start, end and strand for a single feature,
+    // search for all overlapping features in another BED file.
+    // Searches through each relevant genome bin on the same chromosome
+    // as the single feature. Note: Adapted from kent source "binKeeperFind"
+    void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool sameStrand, bool diffStrand);
+
+    // return true if at least one overlap was found.  otherwise, return false.
+    bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
+                                        bool sameStrand, bool diffStrand, float overlapFraction = 0.0);
+
+    // return true if at least one __reciprocal__ overlap was found.  otherwise, return false.
+    bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
+                                                    bool sameStrand, bool diffStrand, float overlapFraction = 0.0);
+
+    // Given a chrom, start, end and strand for a single feature,
+    // increment a the number of hits for each feature in B file
+    // that the feature overlaps
+    void countHits(const BED &a, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false);
+
+    // same as above, but has special logic that processes a set of
+    // BED "blocks" from a single entry so as to avoid over-counting
+    // each "block" of a single BAM/BED12 as distinct coverage.  That is,
+    // if one read has four block, we only want to count the coverage as
+    // coming from one read, not four.
+    void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false);
+
+    // Given a chrom, start, end and strand for a single feature,
+    // increment a the number of hits for each feature in B file
+    // that the feature overlaps
+    void countListHits(const BED &a, int index, bool sameStrand, bool diffStrand);
+
+    // the bedfile with which this instance is associated
+    string bedFile;
+    unsigned int bedType;  // 3-6, 12 for BED
+                           // 9 for GFF
+    bool isZeroBased;
+
+    // Main data structires used by BEDTools
+    masterBedCovMap      bedCovMap;
+    masterBedCovListMap  bedCovListMap;
+    masterBedMap         bedMap;
+    masterBedMapNoBin    bedMapNoBin;
+
+private:
+
+    // data
+    bool _isGff;
+    bool _isVcf;
+    bool _typeIsKnown;        // do we know the type?   (i.e., BED, GFF, VCF)
+    FileType   _fileType;     // what is the file type? (BED? GFF? VCF?)
+    istream   *_bedStream;
+    string _bedLine;
+    vector<string> _bedFields;
+    int _merged_start;
+    int _merged_end;
+    string _merged_chrom;
+    int _prev_start;
+    string _prev_chrom;
+
+    void setZeroBased(bool zeroBased);
+    void setGff (bool isGff);
+    void setVcf (bool isVcf);
+    void setFileType (FileType type);
+    void setBedType (int colNums);
+
+    /******************************************************
+    Private definitions to circumvent linker issues with
+    templated member functions.
+    *******************************************************/
+
+    /*
+        parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
+    */
+    template <typename T>
+    inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) {
+
+        //char *p2End, *p3End, *p4End, *p5End;
+        //long l2, l3, l4, l5;
+        unsigned int numFields = lineVector.size();
+
+        // bail out if we have a blank line
+        if (numFields == 0) {
+            return BED_BLANK;
+        }
+
+        if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
+
+            if (numFields >= 3) {
+                // line parsing for all lines after the first non-header line
+                if (_typeIsKnown == true) {
+                    switch(_fileType) {
+                        case BED_FILETYPE:
+                            if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                        case VCF_FILETYPE:
+                            if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                        case GFF_FILETYPE:
+                            if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                        default:
+                            printf("ERROR: file type encountered. Exiting\n");
+                            exit(1);
+                    }
+                }
+                // line parsing for first non-header line: figure out file contents
+                else {
+                    // it's BED format if columns 2 and 3 are integers
+                    if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
+                        setGff(false);
+                        setZeroBased(true);
+                        setFileType(BED_FILETYPE);
+                        setBedType(numFields);       // we now expect numFields columns in each line
+                        if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                    }
+                    // it's VCF, assuming the second column is numeric and there are at least 8 fields.
+                    else if (isInteger(lineVector[1]) && numFields >= 8) {
+                        setGff(false);
+                        setVcf(true);
+                        setZeroBased(false);
+                        setFileType(VCF_FILETYPE);
+                        setBedType(numFields);       // we now expect numFields columns in each line
+                        if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                    }
+                    // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total.
+                    else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
+                        setGff(true);
+                        setZeroBased(false);
+                        setFileType(GFF_FILETYPE);
+                        setBedType(numFields);       // we now expect numFields columns in each line
+                        if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
+                    }
+                    else {
+                        cerr << "Unexpected file format.  Please use tab-delimited BED, GFF, or VCF. " <<
+                                "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl;
+                        exit(1);
+                    }
+                }
+            }
+            else {
+                cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ".  Are you sure your files are tab-delimited?" << endl;
+                exit(1);
+            }
+        }
+        else {
+            lineNum--;
+            return BED_HEADER;
+        }
+        // default
+        return BED_INVALID;
+    }
+
+
+    /*
+        parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
+    */
+    template <typename T>
+    inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
+
+        // process as long as the number of fields in this
+        // line matches what we expect for this file.
+        if (numFields == this->bedType) {
+            bed.chrom = lineVector[0];
+            int i;
+            i = atoi(lineVector[1].c_str());
+            if (i<0) {
+                 cerr << "Error: malformed BED entry at line " << lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl;
+                 exit(1);
+            }
+            bed.start = (CHRPOS)i;
+            i = atoi(lineVector[2].c_str());
+            if (i<0) {
+                cerr << "Error: malformed BED entry at line " << lineNum << ". End Coordinate detected that is < 0. Exiting." << endl;
+                exit(1);
+            }
+            bed.end = (CHRPOS)i;
+            
+            // handle starts == end (e.g., insertions in reference genome)
+            if (bed.start == bed.end) {
+                bed.start--;
+                bed.end++;
+                bed.zeroLength = true;
+            }
+            
+            if (this->bedType == 4) {
+                bed.name = lineVector[3];
+            }
+            else if (this->bedType == 5) {
+                bed.name = lineVector[3];
+                bed.score = lineVector[4];
+            }
+            else if (this->bedType == 6) {
+                bed.name = lineVector[3];
+                bed.score = lineVector[4];
+                bed.strand = lineVector[5];
+            }
+            else if (this->bedType > 6) {
+                bed.name = lineVector[3];
+                bed.score = lineVector[4];
+                bed.strand = lineVector[5];
+                for (unsigned int i = 6; i < lineVector.size(); ++i) {
+                    bed.otherFields.push_back(lineVector[i]);
+                }
+            }
+            else if (this->bedType != 3) {
+                cerr << "Error: unexpected number of fields at line: " << lineNum
+                     << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
+                exit(1);
+            }
+
+            // sanity checks.
+            if (bed.start <= bed.end) {
+                return true;
+            }
+            else {
+                cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
+                exit(1);
+            }
+        }
+        else if (numFields == 1) {
+            cerr << "Only one BED field detected: " << lineNum << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields != this->bedType) && (numFields != 0)) {
+            cerr << "Differing number of BED fields encountered at line: " << lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields < 3) && (numFields != 0)) {
+            cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        return false;
+    }
+
+
+    /*
+        parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
+    */
+    template <typename T>
+    inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
+        if (numFields == this->bedType) {
+            bed.chrom  = lineVector[0];
+            bed.start  = atoi(lineVector[1].c_str()) - 1;  // VCF is one-based
+            bed.end    = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele.
+            bed.strand = "+";
+            // construct the name from the ref and alt alleles.
+            // if it's an annotated variant, add the rsId as well.
+            bed.name   = lineVector[3] + "/" + lineVector[4];
+            if (lineVector[2] != ".") {
+                bed.name += "_" + lineVector[2];
+            }
+
+            if (this->bedType > 2) {
+                for (unsigned int i = 2; i < numFields; ++i)
+                    bed.otherFields.push_back(lineVector[i]);
+            }
+
+            if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) {
+                return true;
+            }
+            else if (bed.start > bed.end) {
+                cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
+                exit(1);
+            }
+            else if ( (bed.start < 0) || (bed.end < 0) ) {
+                cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl;
+                exit(1);
+            }
+        }
+        else if (numFields == 1) {
+            cerr << "Only one VCF field detected: " << lineNum << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields != this->bedType) && (numFields != 0)) {
+            cerr << "Differing number of VCF fields encountered at line: " << lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields < 2) && (numFields != 0)) {
+            cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        return false;
+    }
+
+
+
+    /*
+        parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
+    */
+    template <typename T>
+    inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
+        if (numFields == this->bedType) {
+            if (this->bedType >= 8 && _isGff) {
+                bed.chrom = lineVector[0];
+                if (isInteger(lineVector[3]))
+                    bed.start  = atoi(lineVector[3].c_str());
+                if (isInteger(lineVector[4]))
+                    bed.end  = atoi(lineVector[4].c_str());
+                bed.name   = lineVector[2];
+                bed.score  = lineVector[5];
+                bed.strand = lineVector[6].c_str();
+                bed.otherFields.push_back(lineVector[1]);  // add GFF "source". unused in BED
+                bed.otherFields.push_back(lineVector[7]);  // add GFF "fname". unused in BED
+                // handle the optional 9th field.
+                if (this->bedType == 9)
+                    bed.otherFields.push_back(lineVector[8]);  // add GFF "group". unused in BED
+                bed.start--;
+            }
+            else {
+                cerr << "Error: unexpected number of fields at line: " << lineNum <<
+                        ".  Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields.  Exiting..." << endl;
+                exit(1);
+            }
+            if (bed.start > bed.end) {
+                cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
+                exit(1);
+            }
+            if ( (bed.start < 0) || (bed.end < 0) ) {
+                cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl;
+                exit(1);
+            }
+            return true;
+        }
+        else if (numFields == 1) {
+            cerr << "Only one GFF field detected: " << lineNum << ".  Verify that your files are TAB-delimited.  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields != this->bedType) && (numFields != 0)) {
+            cerr << "Differing number of GFF fields encountered at line: " << lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        else if ((numFields < 8) && (numFields != 0)) {
+            cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ".  Exiting..." << endl;
+            exit(1);
+        }
+        return false;
+    }
+
+
+public:
+
+    /*
+        reportBedTab
+
+        Writes the _original_ BED entry with a TAB
+        at the end of the line.
+        Works for BED3 - BED6.
+    */
+    template <typename T>
+    inline void reportBedTab(const T &bed) {
+        
+        // if it is azeroLength feature, we need to
+        // correct the start and end coords to what they were
+        // in the original file
+        CHRPOS start = bed.start;
+        CHRPOS end   = bed.end;
+        if (bed.zeroLength) {
+            if (_isGff == false)
+                start++;
+            end--;
+        }
+        
+        // BED
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
+            }
+            else if (this->bedType == 4) {
+                printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
+            }
+            else if (this->bedType == 5) {
+                printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                bed.score.c_str());
+            }
+            else if (this->bedType == 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+            }
+            else if (this->bedType > 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+
+                vector<string>::const_iterator othIt = bed.otherFields.begin();
+                vector<string>::const_iterator othEnd = bed.otherFields.end();
+                for ( ; othIt != othEnd; ++othIt) {
+                    printf("%s\t", othIt->c_str());
+                }
+            }
+        }
+        // VCF
+        else if (_isGff == false && _isVcf == true) {
+            printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
+
+            vector<string>::const_iterator othIt = bed.otherFields.begin();
+            vector<string>::const_iterator othEnd = bed.otherFields.end();
+            for ( ; othIt != othEnd; ++othIt) {
+                printf("%s\t", othIt->c_str());
+            }
+        }
+        // GFF
+        else if (_isGff == true) {
+            // "GFF-8"
+            if (this->bedType == 8) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                                 bed.name.c_str(), start+1, end,
+                                                                 bed.score.c_str(), bed.strand.c_str(),
+                                                                 bed.otherFields[1].c_str());
+            }
+            // "GFF-9"
+            else if (this->bedType == 9) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                                 bed.name.c_str(), start+1, end,
+                                                                 bed.score.c_str(), bed.strand.c_str(),
+                                                                 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
+            }
+        }
+    }
+
+
+
+    /*
+        reportBedNewLine
+
+        Writes the _original_ BED entry with a NEWLINE
+        at the end of the line.
+        Works for BED3 - BED6.
+    */
+    template <typename T>
+    inline void reportBedNewLine(const T &bed) {
+        
+        // if it is azeroLength feature, we need to
+        // correct the start and end coords to what they were
+        // in the original file
+        CHRPOS start = bed.start;
+        CHRPOS end   = bed.end;
+        if (bed.zeroLength) {
+            if (_isGff == false)
+                start++;
+            end--;
+        }
+        
+        //BED
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
+            }
+            else if (this->bedType == 4) {
+                printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
+            }
+            else if (this->bedType == 5) {
+                printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                bed.score.c_str());
+            }
+            else if (this->bedType == 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+            }
+            else if (this->bedType > 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+
+                vector<string>::const_iterator othIt = bed.otherFields.begin();
+                vector<string>::const_iterator othEnd = bed.otherFields.end();
+                for ( ; othIt != othEnd; ++othIt) {
+                    printf("\t%s", othIt->c_str());
+                }
+                printf("\n");
+            }
+        }
+        // VCF
+        else if (_isGff == false && _isVcf == true) {
+            printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
+
+            vector<string>::const_iterator othIt = bed.otherFields.begin();
+            vector<string>::const_iterator othEnd = bed.otherFields.end();
+            for ( ; othIt != othEnd; ++othIt) {
+                printf("%s\t", othIt->c_str());
+            }
+            printf("\n");
+        }
+        // GFF
+        else if (_isGff == true) {
+            // "GFF-8"
+            if (this->bedType == 8) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                                 bed.name.c_str(), start+1, end,
+                                                                 bed.score.c_str(), bed.strand.c_str(),
+                                                                 bed.otherFields[1].c_str());
+            }
+            // "GFF-9"
+            else if (this->bedType == 9) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                                 bed.name.c_str(), start+1, end,
+                                                                 bed.score.c_str(), bed.strand.c_str(),
+                                                                 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
+            }
+        }
+    }
+
+
+
+    /*
+        reportBedRangeNewLine
+
+        Writes a custom start->end for a BED entry
+        with a NEWLINE at the end of the line.
+
+        Works for BED3 - BED6.
+    */
+    template <typename T>
+    inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
+        
+        // if it is azeroLength feature, we need to
+        // correct the start and end coords to what they were
+        // in the original file
+        if (bed.zeroLength) {
+            start = bed.start + 1;
+            end   = bed.end - 1;
+        }
+        // BED
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
+            }
+            else if (this->bedType == 4) {
+                printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
+            }
+            else if (this->bedType == 5) {
+                printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                bed.score.c_str());
+            }
+            else if (this->bedType == 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+            }
+            else if (this->bedType > 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+
+                vector<string>::const_iterator othIt = bed.otherFields.begin();
+                vector<string>::const_iterator othEnd = bed.otherFields.end();
+                for ( ; othIt != othEnd; ++othIt) {
+                    printf("%s\t", othIt->c_str());
+                }
+            }
+        }
+        // VCF
+        else if (_isGff == false && _isVcf == true) {
+            printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
+
+            vector<string>::const_iterator othIt = bed.otherFields.begin();
+            vector<string>::const_iterator othEnd = bed.otherFields.end();
+            for ( ; othIt != othEnd; ++othIt) {
+                printf("%s\t", othIt->c_str());
+            }
+        }
+        // GFF
+        else if (_isGff == true) {
+            // "GFF-8"
+            if (this->bedType == 8) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                             bed.name.c_str(), start+1, end,
+                                                             bed.score.c_str(), bed.strand.c_str(),
+                                                             bed.otherFields[1].c_str());
+            }
+            // "GFF-9"
+            else if (this->bedType == 9) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                             bed.name.c_str(), start+1, end,
+                                                             bed.score.c_str(), bed.strand.c_str(),
+                                                             bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
+            }
+        }
+    }
+
+
+
+    /*
+        reportBedRangeTab
+
+        Writes a custom start->end for a BED entry
+        with a TAB at the end of the line.
+
+        Works for BED3 - BED6.
+    */
+    template <typename T>
+    inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) {
+        
+        // if it is azeroLength feature, we need to
+        // correct the start and end coords to what they were
+        // in the original file
+        if (bed.zeroLength) {
+            start = bed.start + 1;
+            end   = bed.end - 1;
+        }
+        // BED
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
+            }
+            else if (this->bedType == 4) {
+                printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
+            }
+            else if (this->bedType == 5) {
+                printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                bed.score.c_str());
+            }
+            else if (this->bedType == 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+            }
+            else if (this->bedType > 6) {
+                printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
+                                                    bed.score.c_str(), bed.strand.c_str());
+
+                vector<string>::const_iterator othIt = bed.otherFields.begin();
+                vector<string>::const_iterator othEnd = bed.otherFields.end();
+                for ( ; othIt != othEnd; ++othIt) {
+                    printf("\t%s", othIt->c_str());
+                }
+                printf("\n");
+            }
+        }
+        // VCF
+        else if (_isGff == false && _isVcf == true) {
+            printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
+
+            vector<string>::const_iterator othIt = bed.otherFields.begin();
+            vector<string>::const_iterator othEnd = bed.otherFields.end();
+            for ( ; othIt != othEnd; ++othIt) {
+                printf("%s\t", othIt->c_str());
+            }
+            printf("\n");
+        }
+        // GFF
+        else if (_isGff == true) {
+            // "GFF-9"
+            if (this->bedType == 8) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                             bed.name.c_str(), start+1, end,
+                                                             bed.score.c_str(), bed.strand.c_str(),
+                                                             bed.otherFields[1].c_str());
+            }
+            // "GFF-8"
+            else if (this->bedType == 9) {
+                printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
+                                                             bed.name.c_str(), start+1, end,
+                                                             bed.score.c_str(), bed.strand.c_str(),
+                                                             bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
+            }
+        }
+    }
+
+
+    /*
+        reportNullBedTab
+    */
+    void reportNullBedTab() {
+
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf (".\t-1\t-1\t");
+            }
+            else if (this->bedType == 4) {
+                printf (".\t-1\t-1\t.\t");
+            }
+            else if (this->bedType == 5) {
+                printf (".\t-1\t-1\t.\t-1\t");
+            }
+            else if (this->bedType == 6) {
+                printf (".\t-1\t-1\t.\t-1\t.\t");
+            }
+            else if (this->bedType > 6) {
+                printf (".\t-1\t-1\t.\t-1\t.\t");
+                for (unsigned int i = 6; i < this->bedType; ++i) {
+                    printf(".\t");
+                }
+            }
+        }
+        else if (_isGff == true && _isVcf == false) {
+            if (this->bedType == 8) {
+                printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t");
+            }
+            else if (this->bedType == 9) {
+                printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
+            }
+        }
+    }
+
+
+    /*
+        reportNullBedTab
+    */
+    void reportNullBedNewLine() {
+
+        if (_isGff == false && _isVcf == false) {
+            if (this->bedType == 3) {
+                printf (".\t-1\t-1\n");
+            }
+            else if (this->bedType == 4) {
+                printf (".\t-1\t-1\t.\n");
+            }
+            else if (this->bedType == 5) {
+                printf (".\t-1\t-1\t.\t-1\n");
+            }
+            else if (this->bedType == 6) {
+                printf (".\t-1\t-1\t.\t-1\t.\n");
+            }
+            else if (this->bedType > 6) {
+                printf (".\t-1\t-1\t.\t-1\t.");
+                for (unsigned int i = 6; i < this->bedType; ++i) {
+                    printf("\t.");
+                }
+                printf("\n");
+            }
+        }
+        else if (_isGff == true && _isVcf == false) {
+            if (this->bedType == 8) {
+                printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n");
+            }
+            else if (this->bedType == 9) {
+                printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
+            }
+        }
+    }
+
+
+};
+
+#endif /* BEDFILE_H */
author	aaronquinlan
date	Thu, 03 Nov 2011 10:25:04 -0400
parents
children