Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 bedFile.h | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licensed under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #ifndef BEDFILE_H | |
13 #define BEDFILE_H | |
14 | |
15 // "local" includes | |
16 #include "gzstream.h" | |
17 #include "lineFileUtilities.h" | |
18 #include "fileType.h" | |
19 | |
20 // standard includes | |
21 #include <vector> | |
22 #include <map> | |
23 #include <set> | |
24 #include <string> | |
25 #include <iostream> | |
26 #include <fstream> | |
27 #include <sstream> | |
28 #include <cstring> | |
29 #include <algorithm> | |
30 #include <limits.h> | |
31 #include <stdint.h> | |
32 #include <cstdio> | |
33 //#include <tr1/unordered_map> // Experimental. | |
34 using namespace std; | |
35 | |
36 | |
37 //************************************************* | |
38 // Data type tydedef | |
39 //************************************************* | |
40 typedef uint32_t CHRPOS; | |
41 typedef uint16_t BINLEVEL; | |
42 typedef uint32_t BIN; | |
43 typedef uint16_t USHORT; | |
44 typedef uint32_t UINT; | |
45 | |
46 //************************************************* | |
47 // Genome binning constants | |
48 //************************************************* | |
49 | |
50 const BIN _numBins = 37450; | |
51 const BINLEVEL _binLevels = 7; | |
52 | |
53 // bins range in size from 16kb to 512Mb | |
54 // Bin 0 spans 512Mbp, # Level 1 | |
55 // Bins 1-8 span 64Mbp, # Level 2 | |
56 // Bins 9-72 span 8Mbp, # Level 3 | |
57 // Bins 73-584 span 1Mbp # Level 4 | |
58 // Bins 585-4680 span 128Kbp # Level 5 | |
59 // Bins 4681-37449 span 16Kbp # Level 6 | |
60 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; | |
61 //const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; | |
62 | |
63 const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ | |
64 const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ | |
65 | |
66 | |
67 //************************************************* | |
68 // Common data structures | |
69 //************************************************* | |
70 | |
71 struct DEPTH { | |
72 UINT starts; | |
73 UINT ends; | |
74 }; | |
75 | |
76 | |
77 /* | |
78 Structure for regular BED records | |
79 */ | |
80 struct BED { | |
81 | |
82 // Regular BED fields | |
83 string chrom; | |
84 CHRPOS start; | |
85 CHRPOS end; | |
86 string name; | |
87 string score; | |
88 string strand; | |
89 | |
90 // Add'l fields for BED12 and/or custom BED annotations | |
91 vector<string> otherFields; | |
92 | |
93 // experimental fields for the FJOIN approach. | |
94 bool zeroLength; | |
95 bool added; | |
96 bool finished; | |
97 // list of hits from another file. | |
98 vector<BED> overlaps; | |
99 | |
100 public: | |
101 // constructors | |
102 | |
103 // Null | |
104 BED() | |
105 : chrom(""), | |
106 start(0), | |
107 end(0), | |
108 name(""), | |
109 score(""), | |
110 strand(""), | |
111 otherFields(), | |
112 zeroLength(false), | |
113 added(false), | |
114 finished(false), | |
115 overlaps() | |
116 {} | |
117 | |
118 // BED3 | |
119 BED(string chrom, CHRPOS start, CHRPOS end) | |
120 : chrom(chrom), | |
121 start(start), | |
122 end(end), | |
123 name(""), | |
124 score(""), | |
125 strand(""), | |
126 otherFields(), | |
127 zeroLength(false), | |
128 added(false), | |
129 finished(false), | |
130 overlaps() | |
131 {} | |
132 | |
133 // BED4 | |
134 BED(string chrom, CHRPOS start, CHRPOS end, string strand) | |
135 : chrom(chrom), | |
136 start(start), | |
137 end(end), | |
138 name(""), | |
139 score(""), | |
140 strand(strand), | |
141 otherFields(), | |
142 zeroLength(false), | |
143 added(false), | |
144 finished(false), | |
145 overlaps() | |
146 {} | |
147 | |
148 // BED6 | |
149 BED(string chrom, CHRPOS start, CHRPOS end, string name, | |
150 string score, string strand) | |
151 : chrom(chrom), | |
152 start(start), | |
153 end(end), | |
154 name(name), | |
155 score(score), | |
156 strand(strand), | |
157 otherFields(), | |
158 zeroLength(false), | |
159 added(false), | |
160 finished(false), | |
161 overlaps() | |
162 {} | |
163 | |
164 // BEDALL | |
165 BED(string chrom, CHRPOS start, CHRPOS end, string name, | |
166 string score, string strand, vector<string> otherFields) | |
167 : chrom(chrom), | |
168 start(start), | |
169 end(end), | |
170 name(name), | |
171 score(score), | |
172 strand(strand), | |
173 otherFields(otherFields), | |
174 zeroLength(false), | |
175 added(false), | |
176 finished(false), | |
177 overlaps() | |
178 {} | |
179 | |
180 int size() { | |
181 return end-start; | |
182 } | |
183 | |
184 }; // BED | |
185 | |
186 | |
187 /* | |
188 Structure for each end of a paired BED record | |
189 mate points to the other end. | |
190 */ | |
191 struct MATE { | |
192 BED bed; | |
193 int lineNum; | |
194 MATE *mate; | |
195 }; | |
196 | |
197 | |
198 /* | |
199 Structure for regular BED COVERAGE records | |
200 */ | |
201 struct BEDCOV { | |
202 | |
203 string chrom; | |
204 | |
205 // Regular BED fields | |
206 CHRPOS start; | |
207 CHRPOS end; | |
208 string name; | |
209 string score; | |
210 string strand; | |
211 | |
212 // Add'l fields for BED12 and/or custom BED annotations | |
213 vector<string> otherFields; | |
214 | |
215 // flag a zero-length feature | |
216 bool zeroLength; | |
217 | |
218 // Additional fields specific to computing coverage | |
219 map<unsigned int, DEPTH> depthMap; | |
220 unsigned int count; | |
221 CHRPOS minOverlapStart; | |
222 | |
223 | |
224 public: | |
225 // constructors | |
226 // Null | |
227 BEDCOV() | |
228 : chrom(""), | |
229 start(0), | |
230 end(0), | |
231 name(""), | |
232 score(""), | |
233 strand(""), | |
234 otherFields(), | |
235 zeroLength(false), | |
236 depthMap(), | |
237 count(0), | |
238 minOverlapStart(0) | |
239 {} | |
240 }; | |
241 | |
242 | |
243 /* | |
244 Structure for BED COVERAGE records having lists of | |
245 multiple coverages | |
246 */ | |
247 struct BEDCOVLIST { | |
248 | |
249 // Regular BED fields | |
250 string chrom; | |
251 CHRPOS start; | |
252 CHRPOS end; | |
253 string name; | |
254 string score; | |
255 string strand; | |
256 | |
257 // Add'l fields for BED12 and/or custom BED annotations | |
258 vector<string> otherFields; | |
259 | |
260 // flag a zero-length feature | |
261 bool zeroLength; | |
262 | |
263 // Additional fields specific to computing coverage | |
264 vector< map<unsigned int, DEPTH> > depthMapList; | |
265 vector<unsigned int> counts; | |
266 vector<CHRPOS> minOverlapStarts; | |
267 | |
268 | |
269 public: | |
270 // constructors | |
271 // Null | |
272 BEDCOVLIST() | |
273 : chrom(""), | |
274 start(0), | |
275 end(0), | |
276 name(""), | |
277 score(""), | |
278 strand(""), | |
279 otherFields(), | |
280 zeroLength(false), | |
281 depthMapList(), | |
282 counts(0), | |
283 minOverlapStarts(0) | |
284 {} | |
285 }; | |
286 | |
287 | |
288 // enum to flag the state of a given line in a BED file. | |
289 enum BedLineStatus | |
290 { | |
291 BED_INVALID = -1, | |
292 BED_HEADER = 0, | |
293 BED_BLANK = 1, | |
294 BED_VALID = 2 | |
295 }; | |
296 | |
297 // enum to indicate the type of file we are dealing with | |
298 enum FileType | |
299 { | |
300 BED_FILETYPE, | |
301 GFF_FILETYPE, | |
302 VCF_FILETYPE | |
303 }; | |
304 | |
305 //************************************************* | |
306 // Data structure typedefs | |
307 //************************************************* | |
308 typedef vector<BED> bedVector; | |
309 typedef vector<BEDCOV> bedCovVector; | |
310 typedef vector<MATE> mateVector; | |
311 typedef vector<BEDCOVLIST> bedCovListVector; | |
312 | |
313 typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; | |
314 typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; | |
315 typedef map<BIN, mateVector, std::less<BIN> > binsToMates; | |
316 typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; | |
317 | |
318 typedef map<string, binsToBeds, std::less<string> > masterBedMap; | |
319 typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; | |
320 typedef map<string, binsToMates, std::less<string> > masterMateMap; | |
321 typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; | |
322 typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; | |
323 | |
324 | |
325 // EXPERIMENTAL - wait for TR1 | |
326 // typedef vector<BED> bedVector; | |
327 // typedef vector<BEDCOV> bedCovVector; | |
328 // | |
329 // typedef tr1::unordered_map<BIN, bedVector> binsToBeds; | |
330 // typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs; | |
331 // | |
332 // typedef tr1::unordered_map<string, binsToBeds> masterBedMap; | |
333 // typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap; | |
334 // typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin; | |
335 | |
336 | |
337 | |
338 // return the genome "bin" for a feature with this start and end | |
339 inline | |
340 BIN getBin(CHRPOS start, CHRPOS end) { | |
341 --end; | |
342 start >>= _binFirstShift; | |
343 end >>= _binFirstShift; | |
344 | |
345 for (register short i = 0; i < _binLevels; ++i) { | |
346 if (start == end) return _binOffsetsExtended[i] + start; | |
347 start >>= _binNextShift; | |
348 end >>= _binNextShift; | |
349 } | |
350 cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; | |
351 return 0; | |
352 } | |
353 | |
354 /**************************************************** | |
355 // isInteger(s): Tests if string s is a valid integer | |
356 *****************************************************/ | |
357 inline bool isInteger(const std::string& s) { | |
358 int len = s.length(); | |
359 for (int i = 0; i < len; i++) { | |
360 if (!std::isdigit(s[i])) return false; | |
361 } | |
362 return true; | |
363 } | |
364 | |
365 | |
366 // return the amount of overlap between two features. Negative if none and the the | |
367 // number of negative bases is the distance between the two. | |
368 inline | |
369 int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { | |
370 return min(aE, bE) - max(aS, bS); | |
371 } | |
372 | |
373 // is A after (to the right of) B? | |
374 inline | |
375 bool after(const BED &a, const BED &b) { | |
376 return (a.start >= b.end); | |
377 } | |
378 | |
379 | |
380 // Ancillary functions | |
381 void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); | |
382 | |
383 | |
384 // BED Sorting Methods | |
385 bool sortByChrom(const BED &a, const BED &b); | |
386 bool sortByStart(const BED &a, const BED &b); | |
387 bool sortBySizeAsc(const BED &a, const BED &b); | |
388 bool sortBySizeDesc(const BED &a, const BED &b); | |
389 bool sortByScoreAsc(const BED &a, const BED &b); | |
390 bool sortByScoreDesc(const BED &a, const BED &b); | |
391 bool byChromThenStart(BED const &a, BED const &b); | |
392 | |
393 | |
394 | |
395 //************************************************ | |
396 // BedFile Class methods and elements | |
397 //************************************************ | |
398 class BedFile { | |
399 | |
400 public: | |
401 | |
402 // Constructor | |
403 BedFile(string &); | |
404 | |
405 // Destructor | |
406 ~BedFile(void); | |
407 | |
408 // Open a BED file for reading (creates an istream pointer) | |
409 void Open(void); | |
410 | |
411 // Rewind the pointer back to the beginning of the file | |
412 void Rewind(void); | |
413 | |
414 // Jump to a specific byte in the file | |
415 void Seek(unsigned long offset); | |
416 | |
417 bool Empty(); | |
418 | |
419 // Close an opened BED file. | |
420 void Close(void); | |
421 | |
422 // Get the next BED entry in an opened BED file. | |
423 BedLineStatus GetNextBed (BED &bed, int &lineNum, bool forceSorted = false); | |
424 | |
425 // Returns the next MERGED (i.e., non-overlapping) interval in an opened BED file | |
426 // NOTE: assumes input file is sorted by chrom then start | |
427 bool GetNextMergedBed(BED &merged_bed, int &lineNum); | |
428 | |
429 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs | |
430 void loadBedFileIntoMap(); | |
431 | |
432 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs | |
433 void loadBedCovFileIntoMap(); | |
434 | |
435 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs | |
436 void loadBedCovListFileIntoMap(); | |
437 | |
438 // load a BED file into a map keyed by chrom. value is vector of BEDs | |
439 void loadBedFileIntoMapNoBin(); | |
440 | |
441 // Given a chrom, start, end and strand for a single feature, | |
442 // search for all overlapping features in another BED file. | |
443 // Searches through each relevant genome bin on the same chromosome | |
444 // as the single feature. Note: Adapted from kent source "binKeeperFind" | |
445 void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool sameStrand, bool diffStrand); | |
446 | |
447 // return true if at least one overlap was found. otherwise, return false. | |
448 bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
449 bool sameStrand, bool diffStrand, float overlapFraction = 0.0); | |
450 | |
451 // return true if at least one __reciprocal__ overlap was found. otherwise, return false. | |
452 bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
453 bool sameStrand, bool diffStrand, float overlapFraction = 0.0); | |
454 | |
455 // Given a chrom, start, end and strand for a single feature, | |
456 // increment a the number of hits for each feature in B file | |
457 // that the feature overlaps | |
458 void countHits(const BED &a, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false); | |
459 | |
460 // same as above, but has special logic that processes a set of | |
461 // BED "blocks" from a single entry so as to avoid over-counting | |
462 // each "block" of a single BAM/BED12 as distinct coverage. That is, | |
463 // if one read has four block, we only want to count the coverage as | |
464 // coming from one read, not four. | |
465 void countSplitHits(const vector<BED> &bedBlock, bool sameStrand = false, bool diffStrand = false, bool countsOnly = false); | |
466 | |
467 // Given a chrom, start, end and strand for a single feature, | |
468 // increment a the number of hits for each feature in B file | |
469 // that the feature overlaps | |
470 void countListHits(const BED &a, int index, bool sameStrand, bool diffStrand); | |
471 | |
472 // the bedfile with which this instance is associated | |
473 string bedFile; | |
474 unsigned int bedType; // 3-6, 12 for BED | |
475 // 9 for GFF | |
476 bool isZeroBased; | |
477 | |
478 // Main data structires used by BEDTools | |
479 masterBedCovMap bedCovMap; | |
480 masterBedCovListMap bedCovListMap; | |
481 masterBedMap bedMap; | |
482 masterBedMapNoBin bedMapNoBin; | |
483 | |
484 private: | |
485 | |
486 // data | |
487 bool _isGff; | |
488 bool _isVcf; | |
489 bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF) | |
490 FileType _fileType; // what is the file type? (BED? GFF? VCF?) | |
491 istream *_bedStream; | |
492 string _bedLine; | |
493 vector<string> _bedFields; | |
494 int _merged_start; | |
495 int _merged_end; | |
496 string _merged_chrom; | |
497 int _prev_start; | |
498 string _prev_chrom; | |
499 | |
500 void setZeroBased(bool zeroBased); | |
501 void setGff (bool isGff); | |
502 void setVcf (bool isVcf); | |
503 void setFileType (FileType type); | |
504 void setBedType (int colNums); | |
505 | |
506 /****************************************************** | |
507 Private definitions to circumvent linker issues with | |
508 templated member functions. | |
509 *******************************************************/ | |
510 | |
511 /* | |
512 parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
513 */ | |
514 template <typename T> | |
515 inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { | |
516 | |
517 //char *p2End, *p3End, *p4End, *p5End; | |
518 //long l2, l3, l4, l5; | |
519 unsigned int numFields = lineVector.size(); | |
520 | |
521 // bail out if we have a blank line | |
522 if (numFields == 0) { | |
523 return BED_BLANK; | |
524 } | |
525 | |
526 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { | |
527 | |
528 if (numFields >= 3) { | |
529 // line parsing for all lines after the first non-header line | |
530 if (_typeIsKnown == true) { | |
531 switch(_fileType) { | |
532 case BED_FILETYPE: | |
533 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
534 case VCF_FILETYPE: | |
535 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
536 case GFF_FILETYPE: | |
537 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
538 default: | |
539 printf("ERROR: file type encountered. Exiting\n"); | |
540 exit(1); | |
541 } | |
542 } | |
543 // line parsing for first non-header line: figure out file contents | |
544 else { | |
545 // it's BED format if columns 2 and 3 are integers | |
546 if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { | |
547 setGff(false); | |
548 setZeroBased(true); | |
549 setFileType(BED_FILETYPE); | |
550 setBedType(numFields); // we now expect numFields columns in each line | |
551 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
552 } | |
553 // it's VCF, assuming the second column is numeric and there are at least 8 fields. | |
554 else if (isInteger(lineVector[1]) && numFields >= 8) { | |
555 setGff(false); | |
556 setVcf(true); | |
557 setZeroBased(false); | |
558 setFileType(VCF_FILETYPE); | |
559 setBedType(numFields); // we now expect numFields columns in each line | |
560 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
561 } | |
562 // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. | |
563 else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { | |
564 setGff(true); | |
565 setZeroBased(false); | |
566 setFileType(GFF_FILETYPE); | |
567 setBedType(numFields); // we now expect numFields columns in each line | |
568 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
569 } | |
570 else { | |
571 cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << | |
572 "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; | |
573 exit(1); | |
574 } | |
575 } | |
576 } | |
577 else { | |
578 cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; | |
579 exit(1); | |
580 } | |
581 } | |
582 else { | |
583 lineNum--; | |
584 return BED_HEADER; | |
585 } | |
586 // default | |
587 return BED_INVALID; | |
588 } | |
589 | |
590 | |
591 /* | |
592 parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
593 */ | |
594 template <typename T> | |
595 inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
596 | |
597 // process as long as the number of fields in this | |
598 // line matches what we expect for this file. | |
599 if (numFields == this->bedType) { | |
600 bed.chrom = lineVector[0]; | |
601 int i; | |
602 i = atoi(lineVector[1].c_str()); | |
603 if (i<0) { | |
604 cerr << "Error: malformed BED entry at line " << lineNum << ". Start Coordinate detected that is < 0. Exiting." << endl; | |
605 exit(1); | |
606 } | |
607 bed.start = (CHRPOS)i; | |
608 i = atoi(lineVector[2].c_str()); | |
609 if (i<0) { | |
610 cerr << "Error: malformed BED entry at line " << lineNum << ". End Coordinate detected that is < 0. Exiting." << endl; | |
611 exit(1); | |
612 } | |
613 bed.end = (CHRPOS)i; | |
614 | |
615 // handle starts == end (e.g., insertions in reference genome) | |
616 if (bed.start == bed.end) { | |
617 bed.start--; | |
618 bed.end++; | |
619 bed.zeroLength = true; | |
620 } | |
621 | |
622 if (this->bedType == 4) { | |
623 bed.name = lineVector[3]; | |
624 } | |
625 else if (this->bedType == 5) { | |
626 bed.name = lineVector[3]; | |
627 bed.score = lineVector[4]; | |
628 } | |
629 else if (this->bedType == 6) { | |
630 bed.name = lineVector[3]; | |
631 bed.score = lineVector[4]; | |
632 bed.strand = lineVector[5]; | |
633 } | |
634 else if (this->bedType > 6) { | |
635 bed.name = lineVector[3]; | |
636 bed.score = lineVector[4]; | |
637 bed.strand = lineVector[5]; | |
638 for (unsigned int i = 6; i < lineVector.size(); ++i) { | |
639 bed.otherFields.push_back(lineVector[i]); | |
640 } | |
641 } | |
642 else if (this->bedType != 3) { | |
643 cerr << "Error: unexpected number of fields at line: " << lineNum | |
644 << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
645 exit(1); | |
646 } | |
647 | |
648 // sanity checks. | |
649 if (bed.start <= bed.end) { | |
650 return true; | |
651 } | |
652 else { | |
653 cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
654 exit(1); | |
655 } | |
656 } | |
657 else if (numFields == 1) { | |
658 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
659 exit(1); | |
660 } | |
661 else if ((numFields != this->bedType) && (numFields != 0)) { | |
662 cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
663 exit(1); | |
664 } | |
665 else if ((numFields < 3) && (numFields != 0)) { | |
666 cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; | |
667 exit(1); | |
668 } | |
669 return false; | |
670 } | |
671 | |
672 | |
673 /* | |
674 parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
675 */ | |
676 template <typename T> | |
677 inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
678 if (numFields == this->bedType) { | |
679 bed.chrom = lineVector[0]; | |
680 bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based | |
681 bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. | |
682 bed.strand = "+"; | |
683 // construct the name from the ref and alt alleles. | |
684 // if it's an annotated variant, add the rsId as well. | |
685 bed.name = lineVector[3] + "/" + lineVector[4]; | |
686 if (lineVector[2] != ".") { | |
687 bed.name += "_" + lineVector[2]; | |
688 } | |
689 | |
690 if (this->bedType > 2) { | |
691 for (unsigned int i = 2; i < numFields; ++i) | |
692 bed.otherFields.push_back(lineVector[i]); | |
693 } | |
694 | |
695 if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { | |
696 return true; | |
697 } | |
698 else if (bed.start > bed.end) { | |
699 cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
700 exit(1); | |
701 } | |
702 else if ( (bed.start < 0) || (bed.end < 0) ) { | |
703 cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; | |
704 exit(1); | |
705 } | |
706 } | |
707 else if (numFields == 1) { | |
708 cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
709 exit(1); | |
710 } | |
711 else if ((numFields != this->bedType) && (numFields != 0)) { | |
712 cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
713 exit(1); | |
714 } | |
715 else if ((numFields < 2) && (numFields != 0)) { | |
716 cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; | |
717 exit(1); | |
718 } | |
719 return false; | |
720 } | |
721 | |
722 | |
723 | |
724 /* | |
725 parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
726 */ | |
727 template <typename T> | |
728 inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
729 if (numFields == this->bedType) { | |
730 if (this->bedType >= 8 && _isGff) { | |
731 bed.chrom = lineVector[0]; | |
732 if (isInteger(lineVector[3])) | |
733 bed.start = atoi(lineVector[3].c_str()); | |
734 if (isInteger(lineVector[4])) | |
735 bed.end = atoi(lineVector[4].c_str()); | |
736 bed.name = lineVector[2]; | |
737 bed.score = lineVector[5]; | |
738 bed.strand = lineVector[6].c_str(); | |
739 bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED | |
740 bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED | |
741 // handle the optional 9th field. | |
742 if (this->bedType == 9) | |
743 bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED | |
744 bed.start--; | |
745 } | |
746 else { | |
747 cerr << "Error: unexpected number of fields at line: " << lineNum << | |
748 ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl; | |
749 exit(1); | |
750 } | |
751 if (bed.start > bed.end) { | |
752 cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
753 exit(1); | |
754 } | |
755 if ( (bed.start < 0) || (bed.end < 0) ) { | |
756 cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; | |
757 exit(1); | |
758 } | |
759 return true; | |
760 } | |
761 else if (numFields == 1) { | |
762 cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
763 exit(1); | |
764 } | |
765 else if ((numFields != this->bedType) && (numFields != 0)) { | |
766 cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
767 exit(1); | |
768 } | |
769 else if ((numFields < 8) && (numFields != 0)) { | |
770 cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; | |
771 exit(1); | |
772 } | |
773 return false; | |
774 } | |
775 | |
776 | |
777 public: | |
778 | |
779 /* | |
780 reportBedTab | |
781 | |
782 Writes the _original_ BED entry with a TAB | |
783 at the end of the line. | |
784 Works for BED3 - BED6. | |
785 */ | |
786 template <typename T> | |
787 inline void reportBedTab(const T &bed) { | |
788 | |
789 // if it is azeroLength feature, we need to | |
790 // correct the start and end coords to what they were | |
791 // in the original file | |
792 CHRPOS start = bed.start; | |
793 CHRPOS end = bed.end; | |
794 if (bed.zeroLength) { | |
795 if (_isGff == false) | |
796 start++; | |
797 end--; | |
798 } | |
799 | |
800 // BED | |
801 if (_isGff == false && _isVcf == false) { | |
802 if (this->bedType == 3) { | |
803 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); | |
804 } | |
805 else if (this->bedType == 4) { | |
806 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
807 } | |
808 else if (this->bedType == 5) { | |
809 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
810 bed.score.c_str()); | |
811 } | |
812 else if (this->bedType == 6) { | |
813 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
814 bed.score.c_str(), bed.strand.c_str()); | |
815 } | |
816 else if (this->bedType > 6) { | |
817 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
818 bed.score.c_str(), bed.strand.c_str()); | |
819 | |
820 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
821 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
822 for ( ; othIt != othEnd; ++othIt) { | |
823 printf("%s\t", othIt->c_str()); | |
824 } | |
825 } | |
826 } | |
827 // VCF | |
828 else if (_isGff == false && _isVcf == true) { | |
829 printf ("%s\t%d\t", bed.chrom.c_str(), start+1); | |
830 | |
831 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
832 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
833 for ( ; othIt != othEnd; ++othIt) { | |
834 printf("%s\t", othIt->c_str()); | |
835 } | |
836 } | |
837 // GFF | |
838 else if (_isGff == true) { | |
839 // "GFF-8" | |
840 if (this->bedType == 8) { | |
841 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
842 bed.name.c_str(), start+1, end, | |
843 bed.score.c_str(), bed.strand.c_str(), | |
844 bed.otherFields[1].c_str()); | |
845 } | |
846 // "GFF-9" | |
847 else if (this->bedType == 9) { | |
848 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
849 bed.name.c_str(), start+1, end, | |
850 bed.score.c_str(), bed.strand.c_str(), | |
851 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
852 } | |
853 } | |
854 } | |
855 | |
856 | |
857 | |
858 /* | |
859 reportBedNewLine | |
860 | |
861 Writes the _original_ BED entry with a NEWLINE | |
862 at the end of the line. | |
863 Works for BED3 - BED6. | |
864 */ | |
865 template <typename T> | |
866 inline void reportBedNewLine(const T &bed) { | |
867 | |
868 // if it is azeroLength feature, we need to | |
869 // correct the start and end coords to what they were | |
870 // in the original file | |
871 CHRPOS start = bed.start; | |
872 CHRPOS end = bed.end; | |
873 if (bed.zeroLength) { | |
874 if (_isGff == false) | |
875 start++; | |
876 end--; | |
877 } | |
878 | |
879 //BED | |
880 if (_isGff == false && _isVcf == false) { | |
881 if (this->bedType == 3) { | |
882 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); | |
883 } | |
884 else if (this->bedType == 4) { | |
885 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
886 } | |
887 else if (this->bedType == 5) { | |
888 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
889 bed.score.c_str()); | |
890 } | |
891 else if (this->bedType == 6) { | |
892 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
893 bed.score.c_str(), bed.strand.c_str()); | |
894 } | |
895 else if (this->bedType > 6) { | |
896 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
897 bed.score.c_str(), bed.strand.c_str()); | |
898 | |
899 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
900 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
901 for ( ; othIt != othEnd; ++othIt) { | |
902 printf("\t%s", othIt->c_str()); | |
903 } | |
904 printf("\n"); | |
905 } | |
906 } | |
907 // VCF | |
908 else if (_isGff == false && _isVcf == true) { | |
909 printf ("%s\t%d\t", bed.chrom.c_str(), start+1); | |
910 | |
911 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
912 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
913 for ( ; othIt != othEnd; ++othIt) { | |
914 printf("%s\t", othIt->c_str()); | |
915 } | |
916 printf("\n"); | |
917 } | |
918 // GFF | |
919 else if (_isGff == true) { | |
920 // "GFF-8" | |
921 if (this->bedType == 8) { | |
922 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
923 bed.name.c_str(), start+1, end, | |
924 bed.score.c_str(), bed.strand.c_str(), | |
925 bed.otherFields[1].c_str()); | |
926 } | |
927 // "GFF-9" | |
928 else if (this->bedType == 9) { | |
929 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
930 bed.name.c_str(), start+1, end, | |
931 bed.score.c_str(), bed.strand.c_str(), | |
932 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
933 } | |
934 } | |
935 } | |
936 | |
937 | |
938 | |
939 /* | |
940 reportBedRangeNewLine | |
941 | |
942 Writes a custom start->end for a BED entry | |
943 with a NEWLINE at the end of the line. | |
944 | |
945 Works for BED3 - BED6. | |
946 */ | |
947 template <typename T> | |
948 inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { | |
949 | |
950 // if it is azeroLength feature, we need to | |
951 // correct the start and end coords to what they were | |
952 // in the original file | |
953 if (bed.zeroLength) { | |
954 start = bed.start + 1; | |
955 end = bed.end - 1; | |
956 } | |
957 // BED | |
958 if (_isGff == false && _isVcf == false) { | |
959 if (this->bedType == 3) { | |
960 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); | |
961 } | |
962 else if (this->bedType == 4) { | |
963 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
964 } | |
965 else if (this->bedType == 5) { | |
966 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
967 bed.score.c_str()); | |
968 } | |
969 else if (this->bedType == 6) { | |
970 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
971 bed.score.c_str(), bed.strand.c_str()); | |
972 } | |
973 else if (this->bedType > 6) { | |
974 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
975 bed.score.c_str(), bed.strand.c_str()); | |
976 | |
977 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
978 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
979 for ( ; othIt != othEnd; ++othIt) { | |
980 printf("%s\t", othIt->c_str()); | |
981 } | |
982 } | |
983 } | |
984 // VCF | |
985 else if (_isGff == false && _isVcf == true) { | |
986 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); | |
987 | |
988 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
989 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
990 for ( ; othIt != othEnd; ++othIt) { | |
991 printf("%s\t", othIt->c_str()); | |
992 } | |
993 } | |
994 // GFF | |
995 else if (_isGff == true) { | |
996 // "GFF-8" | |
997 if (this->bedType == 8) { | |
998 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
999 bed.name.c_str(), start+1, end, | |
1000 bed.score.c_str(), bed.strand.c_str(), | |
1001 bed.otherFields[1].c_str()); | |
1002 } | |
1003 // "GFF-9" | |
1004 else if (this->bedType == 9) { | |
1005 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
1006 bed.name.c_str(), start+1, end, | |
1007 bed.score.c_str(), bed.strand.c_str(), | |
1008 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
1009 } | |
1010 } | |
1011 } | |
1012 | |
1013 | |
1014 | |
1015 /* | |
1016 reportBedRangeTab | |
1017 | |
1018 Writes a custom start->end for a BED entry | |
1019 with a TAB at the end of the line. | |
1020 | |
1021 Works for BED3 - BED6. | |
1022 */ | |
1023 template <typename T> | |
1024 inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { | |
1025 | |
1026 // if it is azeroLength feature, we need to | |
1027 // correct the start and end coords to what they were | |
1028 // in the original file | |
1029 if (bed.zeroLength) { | |
1030 start = bed.start + 1; | |
1031 end = bed.end - 1; | |
1032 } | |
1033 // BED | |
1034 if (_isGff == false && _isVcf == false) { | |
1035 if (this->bedType == 3) { | |
1036 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); | |
1037 } | |
1038 else if (this->bedType == 4) { | |
1039 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
1040 } | |
1041 else if (this->bedType == 5) { | |
1042 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1043 bed.score.c_str()); | |
1044 } | |
1045 else if (this->bedType == 6) { | |
1046 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1047 bed.score.c_str(), bed.strand.c_str()); | |
1048 } | |
1049 else if (this->bedType > 6) { | |
1050 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1051 bed.score.c_str(), bed.strand.c_str()); | |
1052 | |
1053 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
1054 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
1055 for ( ; othIt != othEnd; ++othIt) { | |
1056 printf("\t%s", othIt->c_str()); | |
1057 } | |
1058 printf("\n"); | |
1059 } | |
1060 } | |
1061 // VCF | |
1062 else if (_isGff == false && _isVcf == true) { | |
1063 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); | |
1064 | |
1065 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
1066 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
1067 for ( ; othIt != othEnd; ++othIt) { | |
1068 printf("%s\t", othIt->c_str()); | |
1069 } | |
1070 printf("\n"); | |
1071 } | |
1072 // GFF | |
1073 else if (_isGff == true) { | |
1074 // "GFF-9" | |
1075 if (this->bedType == 8) { | |
1076 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
1077 bed.name.c_str(), start+1, end, | |
1078 bed.score.c_str(), bed.strand.c_str(), | |
1079 bed.otherFields[1].c_str()); | |
1080 } | |
1081 // "GFF-8" | |
1082 else if (this->bedType == 9) { | |
1083 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
1084 bed.name.c_str(), start+1, end, | |
1085 bed.score.c_str(), bed.strand.c_str(), | |
1086 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
1087 } | |
1088 } | |
1089 } | |
1090 | |
1091 | |
1092 /* | |
1093 reportNullBedTab | |
1094 */ | |
1095 void reportNullBedTab() { | |
1096 | |
1097 if (_isGff == false && _isVcf == false) { | |
1098 if (this->bedType == 3) { | |
1099 printf (".\t-1\t-1\t"); | |
1100 } | |
1101 else if (this->bedType == 4) { | |
1102 printf (".\t-1\t-1\t.\t"); | |
1103 } | |
1104 else if (this->bedType == 5) { | |
1105 printf (".\t-1\t-1\t.\t-1\t"); | |
1106 } | |
1107 else if (this->bedType == 6) { | |
1108 printf (".\t-1\t-1\t.\t-1\t.\t"); | |
1109 } | |
1110 else if (this->bedType > 6) { | |
1111 printf (".\t-1\t-1\t.\t-1\t.\t"); | |
1112 for (unsigned int i = 6; i < this->bedType; ++i) { | |
1113 printf(".\t"); | |
1114 } | |
1115 } | |
1116 } | |
1117 else if (_isGff == true && _isVcf == false) { | |
1118 if (this->bedType == 8) { | |
1119 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t"); | |
1120 } | |
1121 else if (this->bedType == 9) { | |
1122 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); | |
1123 } | |
1124 } | |
1125 } | |
1126 | |
1127 | |
1128 /* | |
1129 reportNullBedTab | |
1130 */ | |
1131 void reportNullBedNewLine() { | |
1132 | |
1133 if (_isGff == false && _isVcf == false) { | |
1134 if (this->bedType == 3) { | |
1135 printf (".\t-1\t-1\n"); | |
1136 } | |
1137 else if (this->bedType == 4) { | |
1138 printf (".\t-1\t-1\t.\n"); | |
1139 } | |
1140 else if (this->bedType == 5) { | |
1141 printf (".\t-1\t-1\t.\t-1\n"); | |
1142 } | |
1143 else if (this->bedType == 6) { | |
1144 printf (".\t-1\t-1\t.\t-1\t.\n"); | |
1145 } | |
1146 else if (this->bedType > 6) { | |
1147 printf (".\t-1\t-1\t.\t-1\t."); | |
1148 for (unsigned int i = 6; i < this->bedType; ++i) { | |
1149 printf("\t."); | |
1150 } | |
1151 printf("\n"); | |
1152 } | |
1153 } | |
1154 else if (_isGff == true && _isVcf == false) { | |
1155 if (this->bedType == 8) { | |
1156 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n"); | |
1157 } | |
1158 else if (this->bedType == 9) { | |
1159 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); | |
1160 } | |
1161 } | |
1162 } | |
1163 | |
1164 | |
1165 }; | |
1166 | |
1167 #endif /* BEDFILE_H */ |