comparison BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h.orig @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 bedFile.h
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #ifndef BEDFILE_H
13 #define BEDFILE_H
14
15 // "local" includes
16 #include "gzstream.h"
17 #include "lineFileUtilities.h"
18 #include "fileType.h"
19
20 // standard includes
21 #include <vector>
22 #include <map>
23 #include <set>
24 #include <string>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <cstring>
29 #include <algorithm>
30 #include <limits.h>
31 #include <stdint.h>
32 #include <cstdio>
33 //#include <tr1/unordered_map> // Experimental.
34 using namespace std;
35
36
37 //*************************************************
38 // Data type tydedef
39 //*************************************************
40 typedef uint32_t CHRPOS;
41 typedef uint16_t BINLEVEL;
42 typedef uint32_t BIN;
43 typedef uint16_t USHORT;
44 typedef uint32_t UINT;
45
46 //*************************************************
47 // Genome binning constants
48 //*************************************************
49
50 const BIN _numBins = 37450;
51 const BINLEVEL _binLevels = 7;
52
53 // bins range in size from 16kb to 512Mb
54 // Bin 0 spans 512Mbp, # Level 1
55 // Bins 1-8 span 64Mbp, # Level 2
56 // Bins 9-72 span 8Mbp, # Level 3
57 // Bins 73-584 span 1Mbp # Level 4
58 // Bins 585-4680 span 128Kbp # Level 5
59 // Bins 4681-37449 span 16Kbp # Level 6
60 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
61 //const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
62
63 const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */
64 const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */
65
66
67 //*************************************************
68 // Common data structures
69 //*************************************************
70
71 struct DEPTH {
72 UINT starts;
73 UINT ends;
74 };
75
76
77 /*
78 Structure for regular BED records
79 */
80 struct BED {
81
82 // Regular BED fields
83 string chrom;
84 CHRPOS start;
85 CHRPOS end;
86 string name;
87 string score;
88 string strand;
89
90 // Add'l fields for BED12 and/or custom BED annotations
91 vector<string> otherFields;
92
93 // experimental fields for the FJOIN approach.
94 bool zeroLength;
95 bool added;
96 bool finished;
97 // list of hits from another file.
98 vector<BED> overlaps;
99
100 public:
101 // constructors
102
103 // Null
104 BED()
105 : chrom(""),
106 start(0),
107 end(0),
108 name(""),
109 score(""),
110 strand(""),
111 otherFields(),
112 zeroLength(false),
113 added(false),
114 finished(false),
115 overlaps()
116 {}
117
118 // BED3
119 BED(string chrom, CHRPOS start, CHRPOS end)
120 : chrom(chrom),
121 start(start),
122 end(end),
123 name(""),
124 score(""),
125 strand(""),
126 otherFields(),
127 zeroLength(false),
128 added(false),
129 finished(false),
130 overlaps()
131 {}
132
133 // BED4
134 BED(string chrom, CHRPOS start, CHRPOS end, string strand)
135 : chrom(chrom),
136 start(start),
137 end(end),
138 name(""),
139 score(""),
140 strand(strand),
141 otherFields(),
142 zeroLength(false),
143 added(false),
144 finished(false),
145 overlaps()
146 {}
147
148 // BED6
149 BED(string chrom, CHRPOS start, CHRPOS end, string name,
150 string score, string strand)
151 : chrom(chrom),
152 start(start),
153 end(end),
154 name(name),
155 score(score),
156 strand(strand),
157 otherFields(),
158 zeroLength(false),
159 added(false),
160 finished(false),
161 overlaps()
162 {}
163
164 // BEDALL
165 BED(string chrom, CHRPOS start, CHRPOS end, string name,
166 string score, string strand, vector<string> otherFields)
167 : chrom(chrom),
168 start(start),
169 end(end),
170 name(name),
171 score(score),
172 strand(strand),
173 otherFields(otherFields),
174 zeroLength(false),
175 added(false),
176 finished(false),
177 overlaps()
178 {}
179
180 int size() {
181 return end-start;
182 }
183
184 }; // BED
185
186
187 /*
188 Structure for each end of a paired BED record
189 mate points to the other end.
190 */
191 struct MATE {
192 BED bed;
193 int lineNum;
194 MATE *mate;
195 };
196
197
198 /*
199 Structure for regular BED COVERAGE records
200 */
201 struct BEDCOV {
202
203 string chrom;
204
205 // Regular BED fields
206 CHRPOS start;
207 CHRPOS end;
208 string name;
209 string score;
210 string strand;
211
212 // Add'l fields for BED12 and/or custom BED annotations
213 vector<string> otherFields;
214
215 // flag a zero-length feature
216 bool zeroLength;
217
218 // Additional fields specific to computing coverage
219 map<unsigned int, DEPTH> depthMap;
220 unsigned int count;
221 CHRPOS minOverlapStart;
222
223
224 public:
225 // constructors
226 // Null
227 BEDCOV()
228 : chrom(""),
229 start(0),
230 end(0),
231 name(""),
232 score(""),
233 strand(""),
234 otherFields(),
235 zeroLength(false),
236 depthMap(),
237 count(0),
238 minOverlapStart(0)
239 {}
240 };
241
242
243 /*
244 Structure for BED COVERAGE records having lists of
245 multiple coverages
246 */
247 struct BEDCOVLIST {
248
249 // Regular BED fields
250 string chrom;
251 CHRPOS start;
252 CHRPOS end;
253 string name;
254 string score;
255 string strand;
256
257 // Add'l fields for BED12 and/or custom BED annotations
258 vector<string> otherFields;
259
260 // flag a zero-length feature
261 bool zeroLength;
262
263 // Additional fields specific to computing coverage
264 vector< map<unsigned int, DEPTH> > depthMapList;
265 vector<unsigned int> counts;
266 vector<CHRPOS> minOverlapStarts;
267
268
269 public:
270 // constructors
271 // Null
272 BEDCOVLIST()
273 : chrom(""),
274 start(0),
275 end(0),
276 name(""),
277 score(""),
278 strand(""),
279 otherFields(),
280 zeroLength(false),
281 depthMapList(),
282 counts(0),
283 minOverlapStarts(0)
284 {}
285 };
286
287
288 // enum to flag the state of a given line in a BED file.
289 enum BedLineStatus
290 {
291 BED_INVALID = -1,
292 BED_HEADER = 0,
293 BED_BLANK = 1,
294 BED_VALID = 2
295 };
296
297 // enum to indicate the type of file we are dealing with
298 enum FileType
299 {
300 BED_FILETYPE,
301 GFF_FILETYPE,
302 VCF_FILETYPE
303 };
304
305 //*************************************************
306 // Data structure typedefs
307 //*************************************************
308 typedef vector<BED> bedVector;
309 typedef vector<BEDCOV> bedCovVector;
310 typedef vector<MATE> mateVector;
311 typedef vector<BEDCOVLIST> bedCovListVector;
312
313 typedef map<BIN, bedVector, std::less<BIN> > binsToBeds;
314 typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs;
315 typedef map<BIN, mateVector, std::less<BIN> > binsToMates;
316 typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists;
317
318 typedef map<string, binsToBeds, std::less<string> > masterBedMap;
319 typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap;
320 typedef map<string, binsToMates, std::less<string> > masterMateMap;
321 typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap;
322 typedef map<string, bedVector, std::less<string> > masterBedMapNoBin;
323
324
325 // EXPERIMENTAL - wait for TR1
326 // typedef vector<BED> bedVector;
327 // typedef vector<BEDCOV> bedCovVector;
328 //
329 // typedef tr1::unordered_map<BIN, bedVector> binsToBeds;
330 // typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs;
331 //
332 // typedef tr1::unordered_map<string, binsToBeds> masterBedMap;
333 // typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap;
334 // typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin;
335
336
337
338 // return the genome "bin" for a feature with this start and end
339 inline
340 BIN getBin(CHRPOS start, CHRPOS end) {
341 --end;
342 start >>= _binFirstShift;
343 end >>= _binFirstShift;
344
345 for (register short i = 0; i < _binLevels; ++i) {
346 if (start == end) return _binOffsetsExtended[i] + start;
347 start >>= _binNextShift;
348 end >>= _binNextShift;
349 }
350 cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl;
351 return 0;
352 }
353
354 /****************************************************
355 // isInteger(s): Tests if string s is a valid integer
356 *****************************************************/
357 inline bool isInteger(const std::string& s) {
358 int len = s.length();
359 for (int i = 0; i < len; i++) {
360 if (!std::isdigit(s[i])) return false;
361 }
362 return true;
363 }
364
365
366 // return the amount of overlap between two features. Negative if none and the the
367 // number of negative bases is the distance between the two.
368 inline
369 int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
370 return min(aE, bE) - max(aS, bS);
371 }
372
373
374 // Ancillary functions
375 void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks);
376
377
378 // BED Sorting Methods
379 bool sortByChrom(const BED &a, const BED &b);
380 bool sortByStart(const BED &a, const BED &b);
381 bool sortBySizeAsc(const BED &a, const BED &b);
382 bool sortBySizeDesc(const BED &a, const BED &b);
383 bool sortByScoreAsc(const BED &a, const BED &b);
384 bool sortByScoreDesc(const BED &a, const BED &b);
385 bool byChromThenStart(BED const &a, BED const &b);
386
387
388
389 //************************************************
390 // BedFile Class methods and elements
391 //************************************************
392 class BedFile {
393
394 public:
395
396 // Constructor
397 BedFile(string &);
398
399 // Destructor
400 ~BedFile(void);
401
402 // Open a BED file for reading (creates an istream pointer)
403 void Open(void);
404
405 // Close an opened BED file.
406 void Close(void);
407
408 // Get the next BED entry in an opened BED file.
409 BedLineStatus GetNextBed (BED &bed, int &lineNum);
410
411 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs
412 void loadBedFileIntoMap();
413
414 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs
415 void loadBedCovFileIntoMap();
416
417 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs
418 void loadBedCovListFileIntoMap();
419
420 // load a BED file into a map keyed by chrom. value is vector of BEDs
421 void loadBedFileIntoMapNoBin();
422
423 // Given a chrom, start, end and strand for a single feature,
424 // search for all overlapping features in another BED file.
425 // Searches through each relevant genome bin on the same chromosome
426 // as the single feature. Note: Adapted from kent source "binKeeperFind"
427 void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand);
428
429 // return true if at least one overlap was found. otherwise, return false.
430 bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
431 bool forceStrand, float overlapFraction = 0.0);
432
433 // return true if at least one __reciprocal__ overlap was found. otherwise, return false.
434 bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
435 bool forceStrand, float overlapFraction = 0.0);
436
437 // Given a chrom, start, end and strand for a single feature,
438 // increment a the number of hits for each feature in B file
439 // that the feature overlaps
440 void countHits(const BED &a, bool forceStrand = false, bool countsOnly = false);
441
442 // same as above, but has special logic that processes a set of
443 // BED "blocks" from a single entry so as to avoid over-counting
444 // each "block" of a single BAM/BED12 as distinct coverage. That is,
445 // if one read has four block, we only want to count the coverage as
446 // coming from one read, not four.
447 void countSplitHits(const vector<BED> &bedBlock, bool forceStrand = false, bool countsOnly = false);
448
449 // Given a chrom, start, end and strand for a single feature,
450 // increment a the number of hits for each feature in B file
451 // that the feature overlaps
452 void countListHits(const BED &a, int index, bool forceStrand);
453
454 // the bedfile with which this instance is associated
455 string bedFile;
456 unsigned int bedType; // 3-6, 12 for BED
457 // 9 for GFF
458 bool isZeroBased;
459
460 // Main data structires used by BEDTools
461 masterBedCovMap bedCovMap;
462 masterBedCovListMap bedCovListMap;
463 masterBedMap bedMap;
464 masterBedMapNoBin bedMapNoBin;
465
466 private:
467
468 // data
469 bool _isGff;
470 bool _isVcf;
471 bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF)
472 FileType _fileType; // what is the file type? (BED? GFF? VCF?)
473 istream *_bedStream;
474 string _bedLine;
475 vector<string> _bedFields;
476
477 void setZeroBased(bool zeroBased);
478 void setGff (bool isGff);
479 void setVcf (bool isVcf);
480 void setFileType (FileType type);
481 void setBedType (int colNums);
482
483 /******************************************************
484 Private definitions to circumvent linker issues with
485 templated member functions.
486 *******************************************************/
487
488 /*
489 parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
490 */
491 template <typename T>
492 inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) {
493
494 //char *p2End, *p3End, *p4End, *p5End;
495 //long l2, l3, l4, l5;
496 unsigned int numFields = lineVector.size();
497
498 // bail out if we have a blank line
499 if (numFields == 0) {
500 return BED_BLANK;
501 }
502
503 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
504
505 if (numFields >= 3) {
506 // line parsing for all lines after the first non-header line
507 if (_typeIsKnown == true) {
508 switch(_fileType) {
509 case BED_FILETYPE:
510 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
511 case VCF_FILETYPE:
512 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
513 case GFF_FILETYPE:
514 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
515 default:
516 printf("ERROR: file type encountered. Exiting\n");
517 exit(1);
518 }
519 }
520 // line parsing for first non-header line: figure out file contents
521 else {
522 // it's BED format if columns 2 and 3 are integers
523 if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
524 setGff(false);
525 setZeroBased(true);
526 setFileType(BED_FILETYPE);
527 setBedType(numFields); // we now expect numFields columns in each line
528 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
529 }
530 // it's VCF, assuming the second column is numeric and there are at least 8 fields.
531 else if (isInteger(lineVector[1]) && numFields >= 8) {
532 setGff(false);
533 setVcf(true);
534 setZeroBased(false);
535 setFileType(VCF_FILETYPE);
536 setBedType(numFields); // we now expect numFields columns in each line
537 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
538 }
539 // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total.
540 else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
541 setGff(true);
542 setZeroBased(false);
543 setFileType(GFF_FILETYPE);
544 setBedType(numFields); // we now expect numFields columns in each line
545 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
546 }
547 else {
548 cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " <<
549 "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl;
550 exit(1);
551 }
552 }
553 }
554 else {
555 cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl;
556 exit(1);
557 }
558 }
559 else {
560 lineNum--;
561 return BED_HEADER;
562 }
563 // default
564 return BED_INVALID;
565 }
566
567
568 /*
569 parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
570 */
571 template <typename T>
572 inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
573
574 // process as long as the number of fields in this
575 // line matches what we expect for this file.
576 if (numFields == this->bedType) {
577 bed.chrom = lineVector[0];
578 bed.start = atoi(lineVector[1].c_str());
579 bed.end = atoi(lineVector[2].c_str());
580
581 // handle starts == end (e.g., insertions in reference genome)
582 if (bed.start == bed.end) {
583 bed.start--;
584 bed.end++;
585 bed.zeroLength = true;
586 }
587
588 if (this->bedType == 4) {
589 bed.name = lineVector[3];
590 }
591 else if (this->bedType == 5) {
592 bed.name = lineVector[3];
593 bed.score = lineVector[4];
594 }
595 else if (this->bedType == 6) {
596 bed.name = lineVector[3];
597 bed.score = lineVector[4];
598 bed.strand = lineVector[5];
599 }
600 else if (this->bedType > 6) {
601 bed.name = lineVector[3];
602 bed.score = lineVector[4];
603 bed.strand = lineVector[5];
604 for (unsigned int i = 6; i < lineVector.size(); ++i) {
605 bed.otherFields.push_back(lineVector[i]);
606 }
607 }
608 else if (this->bedType != 3) {
609 cerr << "Error: unexpected number of fields at line: " << lineNum
610 << ". Verify that your files are TAB-delimited. Exiting..." << endl;
611 exit(1);
612 }
613
614 // sanity checks.
615 if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) {
616 return true;
617 }
618 else if (bed.start > bed.end) {
619 cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
620 exit(1);
621 }
622 else if ( (bed.start < 0) || (bed.end < 0) ) {
623 cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl;
624 exit(1);
625 }
626 }
627 else if (numFields == 1) {
628 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
629 exit(1);
630 }
631 else if ((numFields != this->bedType) && (numFields != 0)) {
632 cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl;
633 exit(1);
634 }
635 else if ((numFields < 3) && (numFields != 0)) {
636 cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl;
637 exit(1);
638 }
639 return false;
640 }
641
642
643 /*
644 parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
645 */
646 template <typename T>
647 inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
648 if (numFields == this->bedType) {
649 bed.chrom = lineVector[0];
650 bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based
651 bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele.
652 bed.strand = "+";
653 // construct the name from the ref and alt alleles.
654 // if it's an annotated variant, add the rsId as well.
655 bed.name = lineVector[3] + "/" + lineVector[4];
656 if (lineVector[2] != ".") {
657 bed.name += "_" + lineVector[2];
658 }
659
660 if (this->bedType > 2) {
661 for (unsigned int i = 2; i < numFields; ++i)
662 bed.otherFields.push_back(lineVector[i]);
663 }
664
665 if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) {
666 return true;
667 }
668 else if (bed.start > bed.end) {
669 cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
670 exit(1);
671 }
672 else if ( (bed.start < 0) || (bed.end < 0) ) {
673 cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl;
674 exit(1);
675 }
676 }
677 else if (numFields == 1) {
678 cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
679 exit(1);
680 }
681 else if ((numFields != this->bedType) && (numFields != 0)) {
682 cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl;
683 exit(1);
684 }
685 else if ((numFields < 2) && (numFields != 0)) {
686 cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl;
687 exit(1);
688 }
689 return false;
690 }
691
692
693
694 /*
695 parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
696 */
697 template <typename T>
698 inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
699 if (numFields == this->bedType) {
700 if (this->bedType >= 8 && _isGff) {
701 bed.chrom = lineVector[0];
702 bed.start = atoi(lineVector[3].c_str());
703 bed.end = atoi(lineVector[4].c_str());
704 bed.name = lineVector[2];
705 bed.score = lineVector[5];
706 bed.strand = lineVector[6].c_str();
707 bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED
708 bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED
709 // handle the optional 9th field.
710 if (this->bedType == 9)
711 bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED
712
713 // handle starts == end (e.g., insertions in reference genome)
714 if (bed.start == bed.end) {
715 bed.end++;
716 bed.zeroLength = true;
717 }
718 // GFF uses 1-based starts, covert to zero-based
719 bed.start--;
720 }
721 else {
722 cerr << "Error: unexpected number of fields at line: " << lineNum <<
723 ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl;
724 exit(1);
725 }
726 if (bed.start > bed.end) {
727 cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
728 exit(1);
729 }
730 else if ( (bed.start < 0) || (bed.end < 0) ) {
731 cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl;
732 exit(1);
733 }
734 else return true;
735 }
736 else if (numFields == 1) {
737 cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
738 exit(1);
739 }
740 else if ((numFields != this->bedType) && (numFields != 0)) {
741 cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl;
742 exit(1);
743 }
744 else if ((numFields < 8) && (numFields != 0)) {
745 cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl;
746 exit(1);
747 }
748 return false;
749 }
750
751
752 public:
753
754 /*
755 reportBedTab
756
757 Writes the _original_ BED entry with a TAB
758 at the end of the line.
759 Works for BED3 - BED6.
760 */
761 template <typename T>
762 inline void reportBedTab(const T &bed) {
763
764 // if it is azeroLength feature, we need to
765 // correct the start and end coords to what they were
766 // in the original file
767 CHRPOS start = bed.start;
768 CHRPOS end = bed.end;
769 if (bed.zeroLength) {
770 if (_isGff == false)
771 start++;
772 end--;
773 }
774
775 // BED
776 if (_isGff == false && _isVcf == false) {
777 if (this->bedType == 3) {
778 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
779 }
780 else if (this->bedType == 4) {
781 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
782 }
783 else if (this->bedType == 5) {
784 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
785 bed.score.c_str());
786 }
787 else if (this->bedType == 6) {
788 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
789 bed.score.c_str(), bed.strand.c_str());
790 }
791 else if (this->bedType > 6) {
792 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
793 bed.score.c_str(), bed.strand.c_str());
794
795 vector<string>::const_iterator othIt = bed.otherFields.begin();
796 vector<string>::const_iterator othEnd = bed.otherFields.end();
797 for ( ; othIt != othEnd; ++othIt) {
798 printf("%s\t", othIt->c_str());
799 }
800 }
801 }
802 // VCF
803 else if (_isGff == false && _isVcf == true) {
804 printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
805
806 vector<string>::const_iterator othIt = bed.otherFields.begin();
807 vector<string>::const_iterator othEnd = bed.otherFields.end();
808 for ( ; othIt != othEnd; ++othIt) {
809 printf("%s\t", othIt->c_str());
810 }
811 }
812 // GFF
813 else if (_isGff == true) {
814 // "GFF-8"
815 if (this->bedType == 8) {
816 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
817 bed.name.c_str(), start+1, end,
818 bed.score.c_str(), bed.strand.c_str(),
819 bed.otherFields[1].c_str());
820 }
821 // "GFF-9"
822 else if (this->bedType == 9) {
823 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
824 bed.name.c_str(), start+1, end,
825 bed.score.c_str(), bed.strand.c_str(),
826 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
827 }
828 }
829 }
830
831
832
833 /*
834 reportBedNewLine
835
836 Writes the _original_ BED entry with a NEWLINE
837 at the end of the line.
838 Works for BED3 - BED6.
839 */
840 template <typename T>
841 inline void reportBedNewLine(const T &bed) {
842
843 // if it is azeroLength feature, we need to
844 // correct the start and end coords to what they were
845 // in the original file
846 CHRPOS start = bed.start;
847 CHRPOS end = bed.end;
848 if (bed.zeroLength) {
849 if (_isGff == false)
850 start++;
851 end--;
852 }
853
854 //BED
855 if (_isGff == false && _isVcf == false) {
856 if (this->bedType == 3) {
857 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
858 }
859 else if (this->bedType == 4) {
860 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
861 }
862 else if (this->bedType == 5) {
863 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
864 bed.score.c_str());
865 }
866 else if (this->bedType == 6) {
867 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
868 bed.score.c_str(), bed.strand.c_str());
869 }
870 else if (this->bedType > 6) {
871 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
872 bed.score.c_str(), bed.strand.c_str());
873
874 vector<string>::const_iterator othIt = bed.otherFields.begin();
875 vector<string>::const_iterator othEnd = bed.otherFields.end();
876 for ( ; othIt != othEnd; ++othIt) {
877 printf("\t%s", othIt->c_str());
878 }
879 printf("\n");
880 }
881 }
882 // VCF
883 else if (_isGff == false && _isVcf == true) {
884 printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
885
886 vector<string>::const_iterator othIt = bed.otherFields.begin();
887 vector<string>::const_iterator othEnd = bed.otherFields.end();
888 for ( ; othIt != othEnd; ++othIt) {
889 printf("%s\t", othIt->c_str());
890 }
891 printf("\n");
892 }
893 // GFF
894 else if (_isGff == true) {
895 // "GFF-8"
896 if (this->bedType == 8) {
897 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
898 bed.name.c_str(), start+1, end,
899 bed.score.c_str(), bed.strand.c_str(),
900 bed.otherFields[1].c_str());
901 }
902 // "GFF-9"
903 else if (this->bedType == 9) {
904 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
905 bed.name.c_str(), start+1, end,
906 bed.score.c_str(), bed.strand.c_str(),
907 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
908 }
909 }
910 }
911
912
913
914 /*
915 reportBedRangeNewLine
916
917 Writes a custom start->end for a BED entry
918 with a NEWLINE at the end of the line.
919
920 Works for BED3 - BED6.
921 */
922 template <typename T>
923 inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
924
925 // if it is azeroLength feature, we need to
926 // correct the start and end coords to what they were
927 // in the original file
928 if (bed.zeroLength) {
929 start = bed.start + 1;
930 end = bed.end - 1;
931 }
932
933 // BED
934 if (_isGff == false && _isVcf == false) {
935 if (this->bedType == 3) {
936 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
937 }
938 else if (this->bedType == 4) {
939 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
940 }
941 else if (this->bedType == 5) {
942 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
943 bed.score.c_str());
944 }
945 else if (this->bedType == 6) {
946 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
947 bed.score.c_str(), bed.strand.c_str());
948 }
949 else if (this->bedType > 6) {
950 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
951 bed.score.c_str(), bed.strand.c_str());
952
953 vector<string>::const_iterator othIt = bed.otherFields.begin();
954 vector<string>::const_iterator othEnd = bed.otherFields.end();
955 for ( ; othIt != othEnd; ++othIt) {
956 printf("%s\t", othIt->c_str());
957 }
958 }
959 }
960 // VCF
961 else if (_isGff == false && _isVcf == true) {
962 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
963
964 vector<string>::const_iterator othIt = bed.otherFields.begin();
965 vector<string>::const_iterator othEnd = bed.otherFields.end();
966 for ( ; othIt != othEnd; ++othIt) {
967 printf("%s\t", othIt->c_str());
968 }
969 }
970 // GFF
971 else if (_isGff == true) {
972 // "GFF-8"
973 if (this->bedType == 8) {
974 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
975 bed.name.c_str(), start+1, end,
976 bed.score.c_str(), bed.strand.c_str(),
977 bed.otherFields[1].c_str());
978 }
979 // "GFF-9"
980 else if (this->bedType == 9) {
981 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
982 bed.name.c_str(), start+1, end,
983 bed.score.c_str(), bed.strand.c_str(),
984 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
985 }
986 }
987 }
988
989
990
991 /*
992 reportBedRangeTab
993
994 Writes a custom start->end for a BED entry
995 with a TAB at the end of the line.
996
997 Works for BED3 - BED6.
998 */
999 template <typename T>
1000 inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) {
1001
1002 // if it is azeroLength feature, we need to
1003 // correct the start and end coords to what they were
1004 // in the original file
1005 if (bed.zeroLength) {
1006 start = bed.start + 1;
1007 end = bed.end - 1;
1008 }
1009
1010 // BED
1011 if (_isGff == false && _isVcf == false) {
1012 if (this->bedType == 3) {
1013 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
1014 }
1015 else if (this->bedType == 4) {
1016 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
1017 }
1018 else if (this->bedType == 5) {
1019 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
1020 bed.score.c_str());
1021 }
1022 else if (this->bedType == 6) {
1023 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
1024 bed.score.c_str(), bed.strand.c_str());
1025 }
1026 else if (this->bedType > 6) {
1027 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
1028 bed.score.c_str(), bed.strand.c_str());
1029
1030 vector<string>::const_iterator othIt = bed.otherFields.begin();
1031 vector<string>::const_iterator othEnd = bed.otherFields.end();
1032 for ( ; othIt != othEnd; ++othIt) {
1033 printf("\t%s", othIt->c_str());
1034 }
1035 printf("\n");
1036 }
1037 }
1038 // VCF
1039 else if (_isGff == false && _isVcf == true) {
1040 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
1041
1042 vector<string>::const_iterator othIt = bed.otherFields.begin();
1043 vector<string>::const_iterator othEnd = bed.otherFields.end();
1044 for ( ; othIt != othEnd; ++othIt) {
1045 printf("%s\t", othIt->c_str());
1046 }
1047 printf("\n");
1048 }
1049 // GFF
1050 else if (_isGff == true) {
1051 // "GFF-9"
1052 if (this->bedType == 8) {
1053 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
1054 bed.name.c_str(), start+1, end,
1055 bed.score.c_str(), bed.strand.c_str(),
1056 bed.otherFields[1].c_str());
1057 }
1058 // "GFF-8"
1059 else if (this->bedType == 9) {
1060 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
1061 bed.name.c_str(), start+1, end,
1062 bed.score.c_str(), bed.strand.c_str(),
1063 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
1064 }
1065 }
1066 }
1067
1068
1069 /*
1070 reportNullBedTab
1071 */
1072 void reportNullBedTab() {
1073
1074 if (_isGff == false && _isVcf == false) {
1075 if (this->bedType == 3) {
1076 printf (".\t-1\t-1\t");
1077 }
1078 else if (this->bedType == 4) {
1079 printf (".\t-1\t-1\t.\t");
1080 }
1081 else if (this->bedType == 5) {
1082 printf (".\t-1\t-1\t.\t-1\t");
1083 }
1084 else if (this->bedType == 6) {
1085 printf (".\t-1\t-1\t.\t-1\t.\t");
1086 }
1087 else if (this->bedType > 6) {
1088 printf (".\t-1\t-1\t.\t-1\t.\t");
1089 for (unsigned int i = 6; i < this->bedType; ++i) {
1090 printf(".\t");
1091 }
1092 }
1093 }
1094 else if (_isGff == true && _isVcf == false) {
1095 if (this->bedType == 8) {
1096 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t");
1097 }
1098 else if (this->bedType == 9) {
1099 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
1100 }
1101 }
1102 }
1103
1104
1105 /*
1106 reportNullBedTab
1107 */
1108 void reportNullBedNewLine() {
1109
1110 if (_isGff == false && _isVcf == false) {
1111 if (this->bedType == 3) {
1112 printf (".\t-1\t-1\n");
1113 }
1114 else if (this->bedType == 4) {
1115 printf (".\t-1\t-1\t.\n");
1116 }
1117 else if (this->bedType == 5) {
1118 printf (".\t-1\t-1\t.\t-1\n");
1119 }
1120 else if (this->bedType == 6) {
1121 printf (".\t-1\t-1\t.\t-1\t.\n");
1122 }
1123 else if (this->bedType > 6) {
1124 printf (".\t-1\t-1\t.\t-1\t.");
1125 for (unsigned int i = 6; i < this->bedType; ++i) {
1126 printf("\t.");
1127 }
1128 printf("\n");
1129 }
1130 }
1131 else if (_isGff == true && _isVcf == false) {
1132 if (this->bedType == 8) {
1133 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n");
1134 }
1135 else if (this->bedType == 9) {
1136 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
1137 }
1138 }
1139 }
1140
1141
1142 };
1143
1144 #endif /* BEDFILE_H */