comparison BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h @ 1:bec36315bd12 default tip

Deleted selected files
author aaronquinlan
date Sat, 19 Nov 2011 14:17:03 -0500
parents dfcd8b6c1bda
children
comparison
equal deleted inserted replaced
0:dfcd8b6c1bda 1:bec36315bd12
1 /*****************************************************************************
2 bedFile.h
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licensed under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #ifndef BEDFILE_H
13 #define BEDFILE_H
14
15 // "local" includes
16 #include "gzstream.h"
17 #include "lineFileUtilities.h"
18 #include "fileType.h"
19
20 // standard includes
21 #include <vector>
22 #include <map>
23 #include <set>
24 #include <string>
25 #include <iostream>
26 #include <fstream>
27 #include <sstream>
28 #include <cstring>
29 #include <algorithm>
30 #include <limits.h>
31 #include <stdint.h>
32 #include <cstdio>
33 //#include <tr1/unordered_map> // Experimental.
34 using namespace std;
35
36
37 //*************************************************
38 // Data type tydedef
39 //*************************************************
40 typedef uint32_t CHRPOS;
41 typedef uint16_t BINLEVEL;
42 typedef uint32_t BIN;
43 typedef uint16_t USHORT;
44 typedef uint32_t UINT;
45
46 //*************************************************
47 // Genome binning constants
48 //*************************************************
49
50 const BIN _numBins = 37450;
51 const BINLEVEL _binLevels = 7;
52
53 // bins range in size from 16kb to 512Mb
54 // Bin 0 spans 512Mbp, # Level 1
55 // Bins 1-8 span 64Mbp, # Level 2
56 // Bins 9-72 span 8Mbp, # Level 3
57 // Bins 73-584 span 1Mbp # Level 4
58 // Bins 585-4680 span 128Kbp # Level 5
59 // Bins 4681-37449 span 16Kbp # Level 6
60 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
61 //const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0};
62
63 const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */
64 const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */
65
66
67 //*************************************************
68 // Common data structures
69 //*************************************************
70
71 struct DEPTH {
72 UINT starts;
73 UINT ends;
74 };
75
76
77 /*
78 Structure for regular BED records
79 */
80 struct BED {
81
82 // Regular BED fields
83 string chrom;
84 CHRPOS start;
85 CHRPOS end;
86 string name;
87 string score;
88 string strand;
89
90 // Add'l fields for BED12 and/or custom BED annotations
91 vector<string> otherFields;
92
93 // experimental fields for the FJOIN approach.
94 bool zeroLength;
95 bool added;
96 bool finished;
97 // list of hits from another file.
98 vector<BED> overlaps;
99
100 public:
101 // constructors
102
103 // Null
104 BED()
105 : chrom(""),
106 start(0),
107 end(0),
108 name(""),
109 score(""),
110 strand(""),
111 otherFields(),
112 zeroLength(false),
113 added(false),
114 finished(false),
115 overlaps()
116 {}
117
118 // BED3
119 BED(string chrom, CHRPOS start, CHRPOS end)
120 : chrom(chrom),
121 start(start),
122 end(end),
123 name(""),
124 score(""),
125 strand(""),
126 otherFields(),
127 zeroLength(false),
128 added(false),
129 finished(false),
130 overlaps()
131 {}
132
133 // BED4
134 BED(string chrom, CHRPOS start, CHRPOS end, string strand)
135 : chrom(chrom),
136 start(start),
137 end(end),
138 name(""),
139 score(""),
140 strand(strand),
141 otherFields(),
142 zeroLength(false),
143 added(false),
144 finished(false),
145 overlaps()
146 {}
147
148 // BED6
149 BED(string chrom, CHRPOS start, CHRPOS end, string name,
150 string score, string strand)
151 : chrom(chrom),
152 start(start),
153 end(end),
154 name(name),
155 score(score),
156 strand(strand),
157 otherFields(),
158 zeroLength(false),
159 added(false),
160 finished(false),
161 overlaps()
162 {}
163
164 // BEDALL
165 BED(string chrom, CHRPOS start, CHRPOS end, string name,
166 string score, string strand, vector<string> otherFields)
167 : chrom(chrom),
168 start(start),
169 end(end),
170 name(name),
171 score(score),
172 strand(strand),
173 otherFields(otherFields),
174 zeroLength(false),
175 added(false),
176 finished(false),
177 overlaps()
178 {}
179
180 int size() {
181 return end-start;
182 }
183
184 }; // BED
185
186
187 /*
188 Structure for each end of a paired BED record
189 mate points to the other end.
190 */
191 struct MATE {
192 BED bed;
193 int lineNum;
194 MATE *mate;
195 };
196
197
198 /*
199 Structure for regular BED COVERAGE records
200 */
201 struct BEDCOV {
202
203 string chrom;
204
205 // Regular BED fields
206 CHRPOS start;
207 CHRPOS end;
208 string name;
209 string score;
210 string strand;
211
212 // Add'l fields for BED12 and/or custom BED annotations
213 vector<string> otherFields;
214
215 // flag a zero-length feature
216 bool zeroLength;
217
218 // Additional fields specific to computing coverage
219 map<unsigned int, DEPTH> depthMap;
220 unsigned int count;
221 CHRPOS minOverlapStart;
222
223
224 public:
225 // constructors
226 // Null
227 BEDCOV()
228 : chrom(""),
229 start(0),
230 end(0),
231 name(""),
232 score(""),
233 strand(""),
234 otherFields(),
235 zeroLength(false),
236 depthMap(),
237 count(0),
238 minOverlapStart(0)
239 {}
240 };
241
242
243 /*
244 Structure for BED COVERAGE records having lists of
245 multiple coverages
246 */
247 struct BEDCOVLIST {
248
249 // Regular BED fields
250 string chrom;
251 CHRPOS start;
252 CHRPOS end;
253 string name;
254 string score;
255 string strand;
256
257 // Add'l fields for BED12 and/or custom BED annotations
258 vector<string> otherFields;
259
260 // flag a zero-length feature
261 bool zeroLength;
262
263 // Additional fields specific to computing coverage
264 vector< map<unsigned int, DEPTH> > depthMapList;
265 vector<unsigned int> counts;
266 vector<CHRPOS> minOverlapStarts;
267
268
269 public:
270 // constructors
271 // Null
272 BEDCOVLIST()
273 : chrom(""),
274 start(0),
275 end(0),
276 name(""),
277 score(""),
278 strand(""),
279 otherFields(),
280 zeroLength(false),
281 depthMapList(),
282 counts(0),
283 minOverlapStarts(0)
284 {}
285 };
286
287
288 // enum to flag the state of a given line in a BED file.
289 enum BedLineStatus
290 {
291 BED_INVALID = -1,
292 BED_HEADER = 0,
293 BED_BLANK = 1,
294 BED_VALID = 2
295 };
296
297 // enum to indicate the type of file we are dealing with
298 enum FileType
299 {
300 BED_FILETYPE,
301 GFF_FILETYPE,
302 VCF_FILETYPE
303 };
304
305 //*************************************************
306 // Data structure typedefs
307 //*************************************************
308 typedef vector<BED> bedVector;
309 typedef vector<BEDCOV> bedCovVector;
310 typedef vector<MATE> mateVector;
311 typedef vector<BEDCOVLIST> bedCovListVector;
312
313 typedef map<BIN, bedVector, std::less<BIN> > binsToBeds;
314 typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs;
315 typedef map<BIN, mateVector, std::less<BIN> > binsToMates;
316 typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists;
317
318 typedef map<string, binsToBeds, std::less<string> > masterBedMap;
319 typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap;
320 typedef map<string, binsToMates, std::less<string> > masterMateMap;
321 typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap;
322 typedef map<string, bedVector, std::less<string> > masterBedMapNoBin;
323
324
325 // EXPERIMENTAL - wait for TR1
326 // typedef vector<BED> bedVector;
327 // typedef vector<BEDCOV> bedCovVector;
328 //
329 // typedef tr1::unordered_map<BIN, bedVector> binsToBeds;
330 // typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs;
331 //
332 // typedef tr1::unordered_map<string, binsToBeds> masterBedMap;
333 // typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap;
334 // typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin;
335
336
337
338 // return the genome "bin" for a feature with this start and end
339 inline
340 BIN getBin(CHRPOS start, CHRPOS end) {
341 --end;
342 start >>= _binFirstShift;
343 end >>= _binFirstShift;
344
345 for (register short i = 0; i < _binLevels; ++i) {
346 if (start == end) return _binOffsetsExtended[i] + start;
347 start >>= _binNextShift;
348 end >>= _binNextShift;
349 }
350 cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl;
351 return 0;
352 }
353
354 /****************************************************
355 // isInteger(s): Tests if string s is a valid integer
356 *****************************************************/
357 inline bool isInteger(const std::string& s) {
358 int len = s.length();
359 for (int i = 0; i < len; i++) {
360 if (!std::isdigit(s[i])) return false;
361 }
362 return true;
363 }
364
365
366 // return the amount of overlap between two features. Negative if none and the the
367 // number of negative bases is the distance between the two.
368 inline
369 int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) {
370 return min(aE, bE) - max(aS, bS);
371 }
372
373 // is A after (to the right of) B?
374 inline
375 bool after(const BED &a, const BED &b) {
376 return (a.start >= b.end);
377 }
378
379
380 // Ancillary functions
381 void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks);
382
383
384 // BED Sorting Methods
385 bool sortByChrom(const BED &a, const BED &b);
386 bool sortByStart(const BED &a, const BED &b);
387 bool sortBySizeAsc(const BED &a, const BED &b);
388 bool sortBySizeDesc(const BED &a, const BED &b);
389 bool sortByScoreAsc(const BED &a, const BED &b);
390 bool sortByScoreDesc(const BED &a, const BED &b);
391 bool byChromThenStart(BED const &a, BED const &b);
392
393
394
395 //************************************************
396 // BedFile Class methods and elements
397 //************************************************
398 class BedFile {
399
400 public:
401
402 // Constructor
403 BedFile(string &);
404
405 // Destructor
406 ~BedFile(void);
407
408 // Open a BED file for reading (creates an istream pointer)
409 void Open(void);
410
411 // Rewind the pointer back to the beginning of the file
412 void Rewind(void);
413
414 // Jump to a specific byte in the file
415 void Seek(unsigned long offset);
416
417 bool Empty();
418
419 // Close an opened BED file.
420 void Close(void);
421
422 // Get the next BED entry in an opened BED file.
423 BedLineStatus GetNextBed (BED &bed, int &lineNum, bool forceSorted = false);
424
425 // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file
426 // NOTE: assumes input file is sorted by chrom then start
427 bool GetNextMergedBed(BED &merged_bed, int &lineNum);
428
429 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs
430 void loadBedFileIntoMap();
431
432 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs
433 void loadBedCovFileIntoMap();
434
435 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs
436 void loadBedCovListFileIntoMap();
437
438 // load a BED file into a map keyed by chrom. value is vector of BEDs
439 void loadBedFileIntoMapNoBin();
440
441 // Given a chrom, start, end and strand for a single feature,
442 // search for all overlapping features in another BED file.
443 // Searches through each relevant genome bin on the same chromosome
444 // as the single feature. Note: Adapted from kent source "binKeeperFind"
445 void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool sameStrand, bool diffStrand);
446
447 // return true if at least one overlap was found. otherwise, return false.
448 bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
449 bool sameStrand, bool diffStrand, float overlapFraction = 0.0);
450
451 // return true if at least one __reciprocal__ overlap was found. otherwise, return false.
452 bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand,
453 bool sameStrand, bool diffStrand, float overlapFraction = 0.0);
454
455 // Given a chrom, start, end and strand for a single feature,
456 // increment a the number of hits for each feature in B file
457 // that the feature overlaps
458 void countHits(const BED &a, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false);
459
460 // same as above, but has special logic that processes a set of
461 // BED "blocks" from a single entry so as to avoid over-counting
462 // each "block" of a single BAM/BED12 as distinct coverage. That is,
463 // if one read has four block, we only want to count the coverage as
464 // coming from one read, not four.
465 void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false);
466
467 // Given a chrom, start, end and strand for a single feature,
468 // increment a the number of hits for each feature in B file
469 // that the feature overlaps
470 void countListHits(const BED &a, int index, bool sameStrand, bool diffStrand);
471
472 // the bedfile with which this instance is associated
473 string bedFile;
474 unsigned int bedType; // 3-6, 12 for BED
475 // 9 for GFF
476 bool isZeroBased;
477
478 // Main data structires used by BEDTools
479 masterBedCovMap bedCovMap;
480 masterBedCovListMap bedCovListMap;
481 masterBedMap bedMap;
482 masterBedMapNoBin bedMapNoBin;
483
484 private:
485
486 // data
487 bool _isGff;
488 bool _isVcf;
489 bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF)
490 FileType _fileType; // what is the file type? (BED? GFF? VCF?)
491 istream *_bedStream;
492 string _bedLine;
493 vector<string> _bedFields;
494 int _merged_start;
495 int _merged_end;
496 string _merged_chrom;
497 int _prev_start;
498 string _prev_chrom;
499
500 void setZeroBased(bool zeroBased);
501 void setGff (bool isGff);
502 void setVcf (bool isVcf);
503 void setFileType (FileType type);
504 void setBedType (int colNums);
505
506 /******************************************************
507 Private definitions to circumvent linker issues with
508 templated member functions.
509 *******************************************************/
510
511 /*
512 parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
513 */
514 template <typename T>
515 inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) {
516
517 //char *p2End, *p3End, *p4End, *p5End;
518 //long l2, l3, l4, l5;
519 unsigned int numFields = lineVector.size();
520
521 // bail out if we have a blank line
522 if (numFields == 0) {
523 return BED_BLANK;
524 }
525
526 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
527
528 if (numFields >= 3) {
529 // line parsing for all lines after the first non-header line
530 if (_typeIsKnown == true) {
531 switch(_fileType) {
532 case BED_FILETYPE:
533 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
534 case VCF_FILETYPE:
535 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
536 case GFF_FILETYPE:
537 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
538 default:
539 printf("ERROR: file type encountered. Exiting\n");
540 exit(1);
541 }
542 }
543 // line parsing for first non-header line: figure out file contents
544 else {
545 // it's BED format if columns 2 and 3 are integers
546 if (isInteger(lineVector[1]) && isInteger(lineVector[2])) {
547 setGff(false);
548 setZeroBased(true);
549 setFileType(BED_FILETYPE);
550 setBedType(numFields); // we now expect numFields columns in each line
551 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
552 }
553 // it's VCF, assuming the second column is numeric and there are at least 8 fields.
554 else if (isInteger(lineVector[1]) && numFields >= 8) {
555 setGff(false);
556 setVcf(true);
557 setZeroBased(false);
558 setFileType(VCF_FILETYPE);
559 setBedType(numFields); // we now expect numFields columns in each line
560 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
561 }
562 // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total.
563 else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) {
564 setGff(true);
565 setZeroBased(false);
566 setFileType(GFF_FILETYPE);
567 setBedType(numFields); // we now expect numFields columns in each line
568 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID;
569 }
570 else {
571 cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " <<
572 "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl;
573 exit(1);
574 }
575 }
576 }
577 else {
578 cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl;
579 exit(1);
580 }
581 }
582 else {
583 lineNum--;
584 return BED_HEADER;
585 }
586 // default
587 return BED_INVALID;
588 }
589
590
591 /*
592 parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
593 */
594 template <typename T>
595 inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
596
597 // process as long as the number of fields in this
598 // line matches what we expect for this file.
599 if (numFields == this->bedType) {
600 bed.chrom = lineVector[0];
601 int i;
602 i = atoi(lineVector[1].c_str());
603 if (i<0) {
604 cerr << "Error: malformed BED entry at line " << lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl;
605 exit(1);
606 }
607 bed.start = (CHRPOS)i;
608 i = atoi(lineVector[2].c_str());
609 if (i<0) {
610 cerr << "Error: malformed BED entry at line " << lineNum << ". End Coordinate detected that is < 0. Exiting." << endl;
611 exit(1);
612 }
613 bed.end = (CHRPOS)i;
614
615 // handle starts == end (e.g., insertions in reference genome)
616 if (bed.start == bed.end) {
617 bed.start--;
618 bed.end++;
619 bed.zeroLength = true;
620 }
621
622 if (this->bedType == 4) {
623 bed.name = lineVector[3];
624 }
625 else if (this->bedType == 5) {
626 bed.name = lineVector[3];
627 bed.score = lineVector[4];
628 }
629 else if (this->bedType == 6) {
630 bed.name = lineVector[3];
631 bed.score = lineVector[4];
632 bed.strand = lineVector[5];
633 }
634 else if (this->bedType > 6) {
635 bed.name = lineVector[3];
636 bed.score = lineVector[4];
637 bed.strand = lineVector[5];
638 for (unsigned int i = 6; i < lineVector.size(); ++i) {
639 bed.otherFields.push_back(lineVector[i]);
640 }
641 }
642 else if (this->bedType != 3) {
643 cerr << "Error: unexpected number of fields at line: " << lineNum
644 << ". Verify that your files are TAB-delimited. Exiting..." << endl;
645 exit(1);
646 }
647
648 // sanity checks.
649 if (bed.start <= bed.end) {
650 return true;
651 }
652 else {
653 cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
654 exit(1);
655 }
656 }
657 else if (numFields == 1) {
658 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
659 exit(1);
660 }
661 else if ((numFields != this->bedType) && (numFields != 0)) {
662 cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl;
663 exit(1);
664 }
665 else if ((numFields < 3) && (numFields != 0)) {
666 cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl;
667 exit(1);
668 }
669 return false;
670 }
671
672
673 /*
674 parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
675 */
676 template <typename T>
677 inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
678 if (numFields == this->bedType) {
679 bed.chrom = lineVector[0];
680 bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based
681 bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele.
682 bed.strand = "+";
683 // construct the name from the ref and alt alleles.
684 // if it's an annotated variant, add the rsId as well.
685 bed.name = lineVector[3] + "/" + lineVector[4];
686 if (lineVector[2] != ".") {
687 bed.name += "_" + lineVector[2];
688 }
689
690 if (this->bedType > 2) {
691 for (unsigned int i = 2; i < numFields; ++i)
692 bed.otherFields.push_back(lineVector[i]);
693 }
694
695 if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) {
696 return true;
697 }
698 else if (bed.start > bed.end) {
699 cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
700 exit(1);
701 }
702 else if ( (bed.start < 0) || (bed.end < 0) ) {
703 cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl;
704 exit(1);
705 }
706 }
707 else if (numFields == 1) {
708 cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
709 exit(1);
710 }
711 else if ((numFields != this->bedType) && (numFields != 0)) {
712 cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl;
713 exit(1);
714 }
715 else if ((numFields < 2) && (numFields != 0)) {
716 cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl;
717 exit(1);
718 }
719 return false;
720 }
721
722
723
724 /*
725 parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.)
726 */
727 template <typename T>
728 inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) {
729 if (numFields == this->bedType) {
730 if (this->bedType >= 8 && _isGff) {
731 bed.chrom = lineVector[0];
732 if (isInteger(lineVector[3]))
733 bed.start = atoi(lineVector[3].c_str());
734 if (isInteger(lineVector[4]))
735 bed.end = atoi(lineVector[4].c_str());
736 bed.name = lineVector[2];
737 bed.score = lineVector[5];
738 bed.strand = lineVector[6].c_str();
739 bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED
740 bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED
741 // handle the optional 9th field.
742 if (this->bedType == 9)
743 bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED
744 bed.start--;
745 }
746 else {
747 cerr << "Error: unexpected number of fields at line: " << lineNum <<
748 ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl;
749 exit(1);
750 }
751 if (bed.start > bed.end) {
752 cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl;
753 exit(1);
754 }
755 if ( (bed.start < 0) || (bed.end < 0) ) {
756 cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl;
757 exit(1);
758 }
759 return true;
760 }
761 else if (numFields == 1) {
762 cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
763 exit(1);
764 }
765 else if ((numFields != this->bedType) && (numFields != 0)) {
766 cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl;
767 exit(1);
768 }
769 else if ((numFields < 8) && (numFields != 0)) {
770 cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl;
771 exit(1);
772 }
773 return false;
774 }
775
776
777 public:
778
779 /*
780 reportBedTab
781
782 Writes the _original_ BED entry with a TAB
783 at the end of the line.
784 Works for BED3 - BED6.
785 */
786 template <typename T>
787 inline void reportBedTab(const T &bed) {
788
789 // if it is azeroLength feature, we need to
790 // correct the start and end coords to what they were
791 // in the original file
792 CHRPOS start = bed.start;
793 CHRPOS end = bed.end;
794 if (bed.zeroLength) {
795 if (_isGff == false)
796 start++;
797 end--;
798 }
799
800 // BED
801 if (_isGff == false && _isVcf == false) {
802 if (this->bedType == 3) {
803 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
804 }
805 else if (this->bedType == 4) {
806 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
807 }
808 else if (this->bedType == 5) {
809 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
810 bed.score.c_str());
811 }
812 else if (this->bedType == 6) {
813 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
814 bed.score.c_str(), bed.strand.c_str());
815 }
816 else if (this->bedType > 6) {
817 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
818 bed.score.c_str(), bed.strand.c_str());
819
820 vector<string>::const_iterator othIt = bed.otherFields.begin();
821 vector<string>::const_iterator othEnd = bed.otherFields.end();
822 for ( ; othIt != othEnd; ++othIt) {
823 printf("%s\t", othIt->c_str());
824 }
825 }
826 }
827 // VCF
828 else if (_isGff == false && _isVcf == true) {
829 printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
830
831 vector<string>::const_iterator othIt = bed.otherFields.begin();
832 vector<string>::const_iterator othEnd = bed.otherFields.end();
833 for ( ; othIt != othEnd; ++othIt) {
834 printf("%s\t", othIt->c_str());
835 }
836 }
837 // GFF
838 else if (_isGff == true) {
839 // "GFF-8"
840 if (this->bedType == 8) {
841 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
842 bed.name.c_str(), start+1, end,
843 bed.score.c_str(), bed.strand.c_str(),
844 bed.otherFields[1].c_str());
845 }
846 // "GFF-9"
847 else if (this->bedType == 9) {
848 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
849 bed.name.c_str(), start+1, end,
850 bed.score.c_str(), bed.strand.c_str(),
851 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
852 }
853 }
854 }
855
856
857
858 /*
859 reportBedNewLine
860
861 Writes the _original_ BED entry with a NEWLINE
862 at the end of the line.
863 Works for BED3 - BED6.
864 */
865 template <typename T>
866 inline void reportBedNewLine(const T &bed) {
867
868 // if it is azeroLength feature, we need to
869 // correct the start and end coords to what they were
870 // in the original file
871 CHRPOS start = bed.start;
872 CHRPOS end = bed.end;
873 if (bed.zeroLength) {
874 if (_isGff == false)
875 start++;
876 end--;
877 }
878
879 //BED
880 if (_isGff == false && _isVcf == false) {
881 if (this->bedType == 3) {
882 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
883 }
884 else if (this->bedType == 4) {
885 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
886 }
887 else if (this->bedType == 5) {
888 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
889 bed.score.c_str());
890 }
891 else if (this->bedType == 6) {
892 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
893 bed.score.c_str(), bed.strand.c_str());
894 }
895 else if (this->bedType > 6) {
896 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
897 bed.score.c_str(), bed.strand.c_str());
898
899 vector<string>::const_iterator othIt = bed.otherFields.begin();
900 vector<string>::const_iterator othEnd = bed.otherFields.end();
901 for ( ; othIt != othEnd; ++othIt) {
902 printf("\t%s", othIt->c_str());
903 }
904 printf("\n");
905 }
906 }
907 // VCF
908 else if (_isGff == false && _isVcf == true) {
909 printf ("%s\t%d\t", bed.chrom.c_str(), start+1);
910
911 vector<string>::const_iterator othIt = bed.otherFields.begin();
912 vector<string>::const_iterator othEnd = bed.otherFields.end();
913 for ( ; othIt != othEnd; ++othIt) {
914 printf("%s\t", othIt->c_str());
915 }
916 printf("\n");
917 }
918 // GFF
919 else if (_isGff == true) {
920 // "GFF-8"
921 if (this->bedType == 8) {
922 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
923 bed.name.c_str(), start+1, end,
924 bed.score.c_str(), bed.strand.c_str(),
925 bed.otherFields[1].c_str());
926 }
927 // "GFF-9"
928 else if (this->bedType == 9) {
929 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
930 bed.name.c_str(), start+1, end,
931 bed.score.c_str(), bed.strand.c_str(),
932 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
933 }
934 }
935 }
936
937
938
939 /*
940 reportBedRangeNewLine
941
942 Writes a custom start->end for a BED entry
943 with a NEWLINE at the end of the line.
944
945 Works for BED3 - BED6.
946 */
947 template <typename T>
948 inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) {
949
950 // if it is azeroLength feature, we need to
951 // correct the start and end coords to what they were
952 // in the original file
953 if (bed.zeroLength) {
954 start = bed.start + 1;
955 end = bed.end - 1;
956 }
957 // BED
958 if (_isGff == false && _isVcf == false) {
959 if (this->bedType == 3) {
960 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end);
961 }
962 else if (this->bedType == 4) {
963 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str());
964 }
965 else if (this->bedType == 5) {
966 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
967 bed.score.c_str());
968 }
969 else if (this->bedType == 6) {
970 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
971 bed.score.c_str(), bed.strand.c_str());
972 }
973 else if (this->bedType > 6) {
974 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(),
975 bed.score.c_str(), bed.strand.c_str());
976
977 vector<string>::const_iterator othIt = bed.otherFields.begin();
978 vector<string>::const_iterator othEnd = bed.otherFields.end();
979 for ( ; othIt != othEnd; ++othIt) {
980 printf("%s\t", othIt->c_str());
981 }
982 }
983 }
984 // VCF
985 else if (_isGff == false && _isVcf == true) {
986 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
987
988 vector<string>::const_iterator othIt = bed.otherFields.begin();
989 vector<string>::const_iterator othEnd = bed.otherFields.end();
990 for ( ; othIt != othEnd; ++othIt) {
991 printf("%s\t", othIt->c_str());
992 }
993 }
994 // GFF
995 else if (_isGff == true) {
996 // "GFF-8"
997 if (this->bedType == 8) {
998 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
999 bed.name.c_str(), start+1, end,
1000 bed.score.c_str(), bed.strand.c_str(),
1001 bed.otherFields[1].c_str());
1002 }
1003 // "GFF-9"
1004 else if (this->bedType == 9) {
1005 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(),
1006 bed.name.c_str(), start+1, end,
1007 bed.score.c_str(), bed.strand.c_str(),
1008 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
1009 }
1010 }
1011 }
1012
1013
1014
1015 /*
1016 reportBedRangeTab
1017
1018 Writes a custom start->end for a BED entry
1019 with a TAB at the end of the line.
1020
1021 Works for BED3 - BED6.
1022 */
1023 template <typename T>
1024 inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) {
1025
1026 // if it is azeroLength feature, we need to
1027 // correct the start and end coords to what they were
1028 // in the original file
1029 if (bed.zeroLength) {
1030 start = bed.start + 1;
1031 end = bed.end - 1;
1032 }
1033 // BED
1034 if (_isGff == false && _isVcf == false) {
1035 if (this->bedType == 3) {
1036 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end);
1037 }
1038 else if (this->bedType == 4) {
1039 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str());
1040 }
1041 else if (this->bedType == 5) {
1042 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
1043 bed.score.c_str());
1044 }
1045 else if (this->bedType == 6) {
1046 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(),
1047 bed.score.c_str(), bed.strand.c_str());
1048 }
1049 else if (this->bedType > 6) {
1050 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(),
1051 bed.score.c_str(), bed.strand.c_str());
1052
1053 vector<string>::const_iterator othIt = bed.otherFields.begin();
1054 vector<string>::const_iterator othEnd = bed.otherFields.end();
1055 for ( ; othIt != othEnd; ++othIt) {
1056 printf("\t%s", othIt->c_str());
1057 }
1058 printf("\n");
1059 }
1060 }
1061 // VCF
1062 else if (_isGff == false && _isVcf == true) {
1063 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1);
1064
1065 vector<string>::const_iterator othIt = bed.otherFields.begin();
1066 vector<string>::const_iterator othEnd = bed.otherFields.end();
1067 for ( ; othIt != othEnd; ++othIt) {
1068 printf("%s\t", othIt->c_str());
1069 }
1070 printf("\n");
1071 }
1072 // GFF
1073 else if (_isGff == true) {
1074 // "GFF-9"
1075 if (this->bedType == 8) {
1076 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
1077 bed.name.c_str(), start+1, end,
1078 bed.score.c_str(), bed.strand.c_str(),
1079 bed.otherFields[1].c_str());
1080 }
1081 // "GFF-8"
1082 else if (this->bedType == 9) {
1083 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(),
1084 bed.name.c_str(), start+1, end,
1085 bed.score.c_str(), bed.strand.c_str(),
1086 bed.otherFields[1].c_str(), bed.otherFields[2].c_str());
1087 }
1088 }
1089 }
1090
1091
1092 /*
1093 reportNullBedTab
1094 */
1095 void reportNullBedTab() {
1096
1097 if (_isGff == false && _isVcf == false) {
1098 if (this->bedType == 3) {
1099 printf (".\t-1\t-1\t");
1100 }
1101 else if (this->bedType == 4) {
1102 printf (".\t-1\t-1\t.\t");
1103 }
1104 else if (this->bedType == 5) {
1105 printf (".\t-1\t-1\t.\t-1\t");
1106 }
1107 else if (this->bedType == 6) {
1108 printf (".\t-1\t-1\t.\t-1\t.\t");
1109 }
1110 else if (this->bedType > 6) {
1111 printf (".\t-1\t-1\t.\t-1\t.\t");
1112 for (unsigned int i = 6; i < this->bedType; ++i) {
1113 printf(".\t");
1114 }
1115 }
1116 }
1117 else if (_isGff == true && _isVcf == false) {
1118 if (this->bedType == 8) {
1119 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t");
1120 }
1121 else if (this->bedType == 9) {
1122 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t");
1123 }
1124 }
1125 }
1126
1127
1128 /*
1129 reportNullBedTab
1130 */
1131 void reportNullBedNewLine() {
1132
1133 if (_isGff == false && _isVcf == false) {
1134 if (this->bedType == 3) {
1135 printf (".\t-1\t-1\n");
1136 }
1137 else if (this->bedType == 4) {
1138 printf (".\t-1\t-1\t.\n");
1139 }
1140 else if (this->bedType == 5) {
1141 printf (".\t-1\t-1\t.\t-1\n");
1142 }
1143 else if (this->bedType == 6) {
1144 printf (".\t-1\t-1\t.\t-1\t.\n");
1145 }
1146 else if (this->bedType > 6) {
1147 printf (".\t-1\t-1\t.\t-1\t.");
1148 for (unsigned int i = 6; i < this->bedType; ++i) {
1149 printf("\t.");
1150 }
1151 printf("\n");
1152 }
1153 }
1154 else if (_isGff == true && _isVcf == false) {
1155 if (this->bedType == 8) {
1156 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n");
1157 }
1158 else if (this->bedType == 9) {
1159 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n");
1160 }
1161 }
1162 }
1163
1164
1165 };
1166
1167 #endif /* BEDFILE_H */