Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/utils/bedFile/bedFile.h.orig @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 bedFile.h | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licensed under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #ifndef BEDFILE_H | |
13 #define BEDFILE_H | |
14 | |
15 // "local" includes | |
16 #include "gzstream.h" | |
17 #include "lineFileUtilities.h" | |
18 #include "fileType.h" | |
19 | |
20 // standard includes | |
21 #include <vector> | |
22 #include <map> | |
23 #include <set> | |
24 #include <string> | |
25 #include <iostream> | |
26 #include <fstream> | |
27 #include <sstream> | |
28 #include <cstring> | |
29 #include <algorithm> | |
30 #include <limits.h> | |
31 #include <stdint.h> | |
32 #include <cstdio> | |
33 //#include <tr1/unordered_map> // Experimental. | |
34 using namespace std; | |
35 | |
36 | |
37 //************************************************* | |
38 // Data type tydedef | |
39 //************************************************* | |
40 typedef uint32_t CHRPOS; | |
41 typedef uint16_t BINLEVEL; | |
42 typedef uint32_t BIN; | |
43 typedef uint16_t USHORT; | |
44 typedef uint32_t UINT; | |
45 | |
46 //************************************************* | |
47 // Genome binning constants | |
48 //************************************************* | |
49 | |
50 const BIN _numBins = 37450; | |
51 const BINLEVEL _binLevels = 7; | |
52 | |
53 // bins range in size from 16kb to 512Mb | |
54 // Bin 0 spans 512Mbp, # Level 1 | |
55 // Bins 1-8 span 64Mbp, # Level 2 | |
56 // Bins 9-72 span 8Mbp, # Level 3 | |
57 // Bins 73-584 span 1Mbp # Level 4 | |
58 // Bins 585-4680 span 128Kbp # Level 5 | |
59 // Bins 4681-37449 span 16Kbp # Level 6 | |
60 const BIN _binOffsetsExtended[] = {32678+4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; | |
61 //const BIN _binOffsetsExtended[] = {4096+512+64+8+1, 4096+512+64+8+1, 512+64+8+1, 64+8+1, 8+1, 1, 0}; | |
62 | |
63 const USHORT _binFirstShift = 14; /* How much to shift to get to finest bin. */ | |
64 const USHORT _binNextShift = 3; /* How much to shift to get to next larger bin. */ | |
65 | |
66 | |
67 //************************************************* | |
68 // Common data structures | |
69 //************************************************* | |
70 | |
71 struct DEPTH { | |
72 UINT starts; | |
73 UINT ends; | |
74 }; | |
75 | |
76 | |
77 /* | |
78 Structure for regular BED records | |
79 */ | |
80 struct BED { | |
81 | |
82 // Regular BED fields | |
83 string chrom; | |
84 CHRPOS start; | |
85 CHRPOS end; | |
86 string name; | |
87 string score; | |
88 string strand; | |
89 | |
90 // Add'l fields for BED12 and/or custom BED annotations | |
91 vector<string> otherFields; | |
92 | |
93 // experimental fields for the FJOIN approach. | |
94 bool zeroLength; | |
95 bool added; | |
96 bool finished; | |
97 // list of hits from another file. | |
98 vector<BED> overlaps; | |
99 | |
100 public: | |
101 // constructors | |
102 | |
103 // Null | |
104 BED() | |
105 : chrom(""), | |
106 start(0), | |
107 end(0), | |
108 name(""), | |
109 score(""), | |
110 strand(""), | |
111 otherFields(), | |
112 zeroLength(false), | |
113 added(false), | |
114 finished(false), | |
115 overlaps() | |
116 {} | |
117 | |
118 // BED3 | |
119 BED(string chrom, CHRPOS start, CHRPOS end) | |
120 : chrom(chrom), | |
121 start(start), | |
122 end(end), | |
123 name(""), | |
124 score(""), | |
125 strand(""), | |
126 otherFields(), | |
127 zeroLength(false), | |
128 added(false), | |
129 finished(false), | |
130 overlaps() | |
131 {} | |
132 | |
133 // BED4 | |
134 BED(string chrom, CHRPOS start, CHRPOS end, string strand) | |
135 : chrom(chrom), | |
136 start(start), | |
137 end(end), | |
138 name(""), | |
139 score(""), | |
140 strand(strand), | |
141 otherFields(), | |
142 zeroLength(false), | |
143 added(false), | |
144 finished(false), | |
145 overlaps() | |
146 {} | |
147 | |
148 // BED6 | |
149 BED(string chrom, CHRPOS start, CHRPOS end, string name, | |
150 string score, string strand) | |
151 : chrom(chrom), | |
152 start(start), | |
153 end(end), | |
154 name(name), | |
155 score(score), | |
156 strand(strand), | |
157 otherFields(), | |
158 zeroLength(false), | |
159 added(false), | |
160 finished(false), | |
161 overlaps() | |
162 {} | |
163 | |
164 // BEDALL | |
165 BED(string chrom, CHRPOS start, CHRPOS end, string name, | |
166 string score, string strand, vector<string> otherFields) | |
167 : chrom(chrom), | |
168 start(start), | |
169 end(end), | |
170 name(name), | |
171 score(score), | |
172 strand(strand), | |
173 otherFields(otherFields), | |
174 zeroLength(false), | |
175 added(false), | |
176 finished(false), | |
177 overlaps() | |
178 {} | |
179 | |
180 int size() { | |
181 return end-start; | |
182 } | |
183 | |
184 }; // BED | |
185 | |
186 | |
187 /* | |
188 Structure for each end of a paired BED record | |
189 mate points to the other end. | |
190 */ | |
191 struct MATE { | |
192 BED bed; | |
193 int lineNum; | |
194 MATE *mate; | |
195 }; | |
196 | |
197 | |
198 /* | |
199 Structure for regular BED COVERAGE records | |
200 */ | |
201 struct BEDCOV { | |
202 | |
203 string chrom; | |
204 | |
205 // Regular BED fields | |
206 CHRPOS start; | |
207 CHRPOS end; | |
208 string name; | |
209 string score; | |
210 string strand; | |
211 | |
212 // Add'l fields for BED12 and/or custom BED annotations | |
213 vector<string> otherFields; | |
214 | |
215 // flag a zero-length feature | |
216 bool zeroLength; | |
217 | |
218 // Additional fields specific to computing coverage | |
219 map<unsigned int, DEPTH> depthMap; | |
220 unsigned int count; | |
221 CHRPOS minOverlapStart; | |
222 | |
223 | |
224 public: | |
225 // constructors | |
226 // Null | |
227 BEDCOV() | |
228 : chrom(""), | |
229 start(0), | |
230 end(0), | |
231 name(""), | |
232 score(""), | |
233 strand(""), | |
234 otherFields(), | |
235 zeroLength(false), | |
236 depthMap(), | |
237 count(0), | |
238 minOverlapStart(0) | |
239 {} | |
240 }; | |
241 | |
242 | |
243 /* | |
244 Structure for BED COVERAGE records having lists of | |
245 multiple coverages | |
246 */ | |
247 struct BEDCOVLIST { | |
248 | |
249 // Regular BED fields | |
250 string chrom; | |
251 CHRPOS start; | |
252 CHRPOS end; | |
253 string name; | |
254 string score; | |
255 string strand; | |
256 | |
257 // Add'l fields for BED12 and/or custom BED annotations | |
258 vector<string> otherFields; | |
259 | |
260 // flag a zero-length feature | |
261 bool zeroLength; | |
262 | |
263 // Additional fields specific to computing coverage | |
264 vector< map<unsigned int, DEPTH> > depthMapList; | |
265 vector<unsigned int> counts; | |
266 vector<CHRPOS> minOverlapStarts; | |
267 | |
268 | |
269 public: | |
270 // constructors | |
271 // Null | |
272 BEDCOVLIST() | |
273 : chrom(""), | |
274 start(0), | |
275 end(0), | |
276 name(""), | |
277 score(""), | |
278 strand(""), | |
279 otherFields(), | |
280 zeroLength(false), | |
281 depthMapList(), | |
282 counts(0), | |
283 minOverlapStarts(0) | |
284 {} | |
285 }; | |
286 | |
287 | |
288 // enum to flag the state of a given line in a BED file. | |
289 enum BedLineStatus | |
290 { | |
291 BED_INVALID = -1, | |
292 BED_HEADER = 0, | |
293 BED_BLANK = 1, | |
294 BED_VALID = 2 | |
295 }; | |
296 | |
297 // enum to indicate the type of file we are dealing with | |
298 enum FileType | |
299 { | |
300 BED_FILETYPE, | |
301 GFF_FILETYPE, | |
302 VCF_FILETYPE | |
303 }; | |
304 | |
305 //************************************************* | |
306 // Data structure typedefs | |
307 //************************************************* | |
308 typedef vector<BED> bedVector; | |
309 typedef vector<BEDCOV> bedCovVector; | |
310 typedef vector<MATE> mateVector; | |
311 typedef vector<BEDCOVLIST> bedCovListVector; | |
312 | |
313 typedef map<BIN, bedVector, std::less<BIN> > binsToBeds; | |
314 typedef map<BIN, bedCovVector, std::less<BIN> > binsToBedCovs; | |
315 typedef map<BIN, mateVector, std::less<BIN> > binsToMates; | |
316 typedef map<BIN, bedCovListVector, std::less<BIN> > binsToBedCovLists; | |
317 | |
318 typedef map<string, binsToBeds, std::less<string> > masterBedMap; | |
319 typedef map<string, binsToBedCovs, std::less<string> > masterBedCovMap; | |
320 typedef map<string, binsToMates, std::less<string> > masterMateMap; | |
321 typedef map<string, binsToBedCovLists, std::less<string> > masterBedCovListMap; | |
322 typedef map<string, bedVector, std::less<string> > masterBedMapNoBin; | |
323 | |
324 | |
325 // EXPERIMENTAL - wait for TR1 | |
326 // typedef vector<BED> bedVector; | |
327 // typedef vector<BEDCOV> bedCovVector; | |
328 // | |
329 // typedef tr1::unordered_map<BIN, bedVector> binsToBeds; | |
330 // typedef tr1::unordered_map<BIN, bedCovVector> binsToBedCovs; | |
331 // | |
332 // typedef tr1::unordered_map<string, binsToBeds> masterBedMap; | |
333 // typedef tr1::unordered_map<string, binsToBedCovs> masterBedCovMap; | |
334 // typedef tr1::unordered_map<string, bedVector> masterBedMapNoBin; | |
335 | |
336 | |
337 | |
338 // return the genome "bin" for a feature with this start and end | |
339 inline | |
340 BIN getBin(CHRPOS start, CHRPOS end) { | |
341 --end; | |
342 start >>= _binFirstShift; | |
343 end >>= _binFirstShift; | |
344 | |
345 for (register short i = 0; i < _binLevels; ++i) { | |
346 if (start == end) return _binOffsetsExtended[i] + start; | |
347 start >>= _binNextShift; | |
348 end >>= _binNextShift; | |
349 } | |
350 cerr << "start " << start << ", end " << end << " out of range in findBin (max is 512M)" << endl; | |
351 return 0; | |
352 } | |
353 | |
354 /**************************************************** | |
355 // isInteger(s): Tests if string s is a valid integer | |
356 *****************************************************/ | |
357 inline bool isInteger(const std::string& s) { | |
358 int len = s.length(); | |
359 for (int i = 0; i < len; i++) { | |
360 if (!std::isdigit(s[i])) return false; | |
361 } | |
362 return true; | |
363 } | |
364 | |
365 | |
366 // return the amount of overlap between two features. Negative if none and the the | |
367 // number of negative bases is the distance between the two. | |
368 inline | |
369 int overlaps(CHRPOS aS, CHRPOS aE, CHRPOS bS, CHRPOS bE) { | |
370 return min(aE, bE) - max(aS, bS); | |
371 } | |
372 | |
373 | |
374 // Ancillary functions | |
375 void splitBedIntoBlocks(const BED &bed, int lineNum, bedVector &bedBlocks); | |
376 | |
377 | |
378 // BED Sorting Methods | |
379 bool sortByChrom(const BED &a, const BED &b); | |
380 bool sortByStart(const BED &a, const BED &b); | |
381 bool sortBySizeAsc(const BED &a, const BED &b); | |
382 bool sortBySizeDesc(const BED &a, const BED &b); | |
383 bool sortByScoreAsc(const BED &a, const BED &b); | |
384 bool sortByScoreDesc(const BED &a, const BED &b); | |
385 bool byChromThenStart(BED const &a, BED const &b); | |
386 | |
387 | |
388 | |
389 //************************************************ | |
390 // BedFile Class methods and elements | |
391 //************************************************ | |
392 class BedFile { | |
393 | |
394 public: | |
395 | |
396 // Constructor | |
397 BedFile(string &); | |
398 | |
399 // Destructor | |
400 ~BedFile(void); | |
401 | |
402 // Open a BED file for reading (creates an istream pointer) | |
403 void Open(void); | |
404 | |
405 // Close an opened BED file. | |
406 void Close(void); | |
407 | |
408 // Get the next BED entry in an opened BED file. | |
409 BedLineStatus GetNextBed (BED &bed, int &lineNum); | |
410 | |
411 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDs | |
412 void loadBedFileIntoMap(); | |
413 | |
414 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVs | |
415 void loadBedCovFileIntoMap(); | |
416 | |
417 // load a BED file into a map keyed by chrom, then bin. value is vector of BEDCOVLISTs | |
418 void loadBedCovListFileIntoMap(); | |
419 | |
420 // load a BED file into a map keyed by chrom. value is vector of BEDs | |
421 void loadBedFileIntoMapNoBin(); | |
422 | |
423 // Given a chrom, start, end and strand for a single feature, | |
424 // search for all overlapping features in another BED file. | |
425 // Searches through each relevant genome bin on the same chromosome | |
426 // as the single feature. Note: Adapted from kent source "binKeeperFind" | |
427 void FindOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, vector<BED> &hits, bool forceStrand); | |
428 | |
429 // return true if at least one overlap was found. otherwise, return false. | |
430 bool FindOneOrMoreOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
431 bool forceStrand, float overlapFraction = 0.0); | |
432 | |
433 // return true if at least one __reciprocal__ overlap was found. otherwise, return false. | |
434 bool FindOneOrMoreReciprocalOverlapsPerBin(string chrom, CHRPOS start, CHRPOS end, string strand, | |
435 bool forceStrand, float overlapFraction = 0.0); | |
436 | |
437 // Given a chrom, start, end and strand for a single feature, | |
438 // increment a the number of hits for each feature in B file | |
439 // that the feature overlaps | |
440 void countHits(const BED &a, bool forceStrand = false, bool countsOnly = false); | |
441 | |
442 // same as above, but has special logic that processes a set of | |
443 // BED "blocks" from a single entry so as to avoid over-counting | |
444 // each "block" of a single BAM/BED12 as distinct coverage. That is, | |
445 // if one read has four block, we only want to count the coverage as | |
446 // coming from one read, not four. | |
447 void countSplitHits(const vector<BED> &bedBlock, bool forceStrand = false, bool countsOnly = false); | |
448 | |
449 // Given a chrom, start, end and strand for a single feature, | |
450 // increment a the number of hits for each feature in B file | |
451 // that the feature overlaps | |
452 void countListHits(const BED &a, int index, bool forceStrand); | |
453 | |
454 // the bedfile with which this instance is associated | |
455 string bedFile; | |
456 unsigned int bedType; // 3-6, 12 for BED | |
457 // 9 for GFF | |
458 bool isZeroBased; | |
459 | |
460 // Main data structires used by BEDTools | |
461 masterBedCovMap bedCovMap; | |
462 masterBedCovListMap bedCovListMap; | |
463 masterBedMap bedMap; | |
464 masterBedMapNoBin bedMapNoBin; | |
465 | |
466 private: | |
467 | |
468 // data | |
469 bool _isGff; | |
470 bool _isVcf; | |
471 bool _typeIsKnown; // do we know the type? (i.e., BED, GFF, VCF) | |
472 FileType _fileType; // what is the file type? (BED? GFF? VCF?) | |
473 istream *_bedStream; | |
474 string _bedLine; | |
475 vector<string> _bedFields; | |
476 | |
477 void setZeroBased(bool zeroBased); | |
478 void setGff (bool isGff); | |
479 void setVcf (bool isVcf); | |
480 void setFileType (FileType type); | |
481 void setBedType (int colNums); | |
482 | |
483 /****************************************************** | |
484 Private definitions to circumvent linker issues with | |
485 templated member functions. | |
486 *******************************************************/ | |
487 | |
488 /* | |
489 parseLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
490 */ | |
491 template <typename T> | |
492 inline BedLineStatus parseLine (T &bed, const vector<string> &lineVector, int &lineNum) { | |
493 | |
494 //char *p2End, *p3End, *p4End, *p5End; | |
495 //long l2, l3, l4, l5; | |
496 unsigned int numFields = lineVector.size(); | |
497 | |
498 // bail out if we have a blank line | |
499 if (numFields == 0) { | |
500 return BED_BLANK; | |
501 } | |
502 | |
503 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { | |
504 | |
505 if (numFields >= 3) { | |
506 // line parsing for all lines after the first non-header line | |
507 if (_typeIsKnown == true) { | |
508 switch(_fileType) { | |
509 case BED_FILETYPE: | |
510 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
511 case VCF_FILETYPE: | |
512 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
513 case GFF_FILETYPE: | |
514 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
515 default: | |
516 printf("ERROR: file type encountered. Exiting\n"); | |
517 exit(1); | |
518 } | |
519 } | |
520 // line parsing for first non-header line: figure out file contents | |
521 else { | |
522 // it's BED format if columns 2 and 3 are integers | |
523 if (isInteger(lineVector[1]) && isInteger(lineVector[2])) { | |
524 setGff(false); | |
525 setZeroBased(true); | |
526 setFileType(BED_FILETYPE); | |
527 setBedType(numFields); // we now expect numFields columns in each line | |
528 if (parseBedLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
529 } | |
530 // it's VCF, assuming the second column is numeric and there are at least 8 fields. | |
531 else if (isInteger(lineVector[1]) && numFields >= 8) { | |
532 setGff(false); | |
533 setVcf(true); | |
534 setZeroBased(false); | |
535 setFileType(VCF_FILETYPE); | |
536 setBedType(numFields); // we now expect numFields columns in each line | |
537 if (parseVcfLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
538 } | |
539 // it's GFF, assuming columns columns 4 and 5 are numeric and we have 9 fields total. | |
540 else if ((numFields >= 8) && isInteger(lineVector[3]) && isInteger(lineVector[4])) { | |
541 setGff(true); | |
542 setZeroBased(false); | |
543 setFileType(GFF_FILETYPE); | |
544 setBedType(numFields); // we now expect numFields columns in each line | |
545 if (parseGffLine(bed, lineVector, lineNum, numFields) == true) return BED_VALID; | |
546 } | |
547 else { | |
548 cerr << "Unexpected file format. Please use tab-delimited BED, GFF, or VCF. " << | |
549 "Perhaps you have non-integer starts or ends at line " << lineNum << "?" << endl; | |
550 exit(1); | |
551 } | |
552 } | |
553 } | |
554 else { | |
555 cerr << "It looks as though you have less than 3 columns at line: " << lineNum << ". Are you sure your files are tab-delimited?" << endl; | |
556 exit(1); | |
557 } | |
558 } | |
559 else { | |
560 lineNum--; | |
561 return BED_HEADER; | |
562 } | |
563 // default | |
564 return BED_INVALID; | |
565 } | |
566 | |
567 | |
568 /* | |
569 parseBedLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
570 */ | |
571 template <typename T> | |
572 inline bool parseBedLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
573 | |
574 // process as long as the number of fields in this | |
575 // line matches what we expect for this file. | |
576 if (numFields == this->bedType) { | |
577 bed.chrom = lineVector[0]; | |
578 bed.start = atoi(lineVector[1].c_str()); | |
579 bed.end = atoi(lineVector[2].c_str()); | |
580 | |
581 // handle starts == end (e.g., insertions in reference genome) | |
582 if (bed.start == bed.end) { | |
583 bed.start--; | |
584 bed.end++; | |
585 bed.zeroLength = true; | |
586 } | |
587 | |
588 if (this->bedType == 4) { | |
589 bed.name = lineVector[3]; | |
590 } | |
591 else if (this->bedType == 5) { | |
592 bed.name = lineVector[3]; | |
593 bed.score = lineVector[4]; | |
594 } | |
595 else if (this->bedType == 6) { | |
596 bed.name = lineVector[3]; | |
597 bed.score = lineVector[4]; | |
598 bed.strand = lineVector[5]; | |
599 } | |
600 else if (this->bedType > 6) { | |
601 bed.name = lineVector[3]; | |
602 bed.score = lineVector[4]; | |
603 bed.strand = lineVector[5]; | |
604 for (unsigned int i = 6; i < lineVector.size(); ++i) { | |
605 bed.otherFields.push_back(lineVector[i]); | |
606 } | |
607 } | |
608 else if (this->bedType != 3) { | |
609 cerr << "Error: unexpected number of fields at line: " << lineNum | |
610 << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
611 exit(1); | |
612 } | |
613 | |
614 // sanity checks. | |
615 if ((bed.start <= bed.end) && (bed.start >= 0) && (bed.end >= 0)) { | |
616 return true; | |
617 } | |
618 else if (bed.start > bed.end) { | |
619 cerr << "Error: malformed BED entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
620 exit(1); | |
621 } | |
622 else if ( (bed.start < 0) || (bed.end < 0) ) { | |
623 cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; | |
624 exit(1); | |
625 } | |
626 } | |
627 else if (numFields == 1) { | |
628 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
629 exit(1); | |
630 } | |
631 else if ((numFields != this->bedType) && (numFields != 0)) { | |
632 cerr << "Differing number of BED fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
633 exit(1); | |
634 } | |
635 else if ((numFields < 3) && (numFields != 0)) { | |
636 cerr << "TAB delimited BED file with at least 3 fields (chrom, start, end) is required at line: "<< lineNum << ". Exiting..." << endl; | |
637 exit(1); | |
638 } | |
639 return false; | |
640 } | |
641 | |
642 | |
643 /* | |
644 parseVcfLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
645 */ | |
646 template <typename T> | |
647 inline bool parseVcfLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
648 if (numFields == this->bedType) { | |
649 bed.chrom = lineVector[0]; | |
650 bed.start = atoi(lineVector[1].c_str()) - 1; // VCF is one-based | |
651 bed.end = bed.start + lineVector[3].size(); // VCF 4.0 stores the size of the affected REF allele. | |
652 bed.strand = "+"; | |
653 // construct the name from the ref and alt alleles. | |
654 // if it's an annotated variant, add the rsId as well. | |
655 bed.name = lineVector[3] + "/" + lineVector[4]; | |
656 if (lineVector[2] != ".") { | |
657 bed.name += "_" + lineVector[2]; | |
658 } | |
659 | |
660 if (this->bedType > 2) { | |
661 for (unsigned int i = 2; i < numFields; ++i) | |
662 bed.otherFields.push_back(lineVector[i]); | |
663 } | |
664 | |
665 if ((bed.start <= bed.end) && (bed.start > 0) && (bed.end > 0)) { | |
666 return true; | |
667 } | |
668 else if (bed.start > bed.end) { | |
669 cerr << "Error: malformed VCF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
670 exit(1); | |
671 } | |
672 else if ( (bed.start < 0) || (bed.end < 0) ) { | |
673 cerr << "Error: malformed VCF entry at line " << lineNum << ". Coordinate detected that is < 0. Exiting." << endl; | |
674 exit(1); | |
675 } | |
676 } | |
677 else if (numFields == 1) { | |
678 cerr << "Only one VCF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
679 exit(1); | |
680 } | |
681 else if ((numFields != this->bedType) && (numFields != 0)) { | |
682 cerr << "Differing number of VCF fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
683 exit(1); | |
684 } | |
685 else if ((numFields < 2) && (numFields != 0)) { | |
686 cerr << "TAB delimited VCF file with at least 2 fields (chrom, pos) is required at line: "<< lineNum << ". Exiting..." << endl; | |
687 exit(1); | |
688 } | |
689 return false; | |
690 } | |
691 | |
692 | |
693 | |
694 /* | |
695 parseGffLine: converts a lineVector into either BED or BEDCOV (templated, hence in header to avoid linker issues.) | |
696 */ | |
697 template <typename T> | |
698 inline bool parseGffLine (T &bed, const vector<string> &lineVector, int lineNum, unsigned int numFields) { | |
699 if (numFields == this->bedType) { | |
700 if (this->bedType >= 8 && _isGff) { | |
701 bed.chrom = lineVector[0]; | |
702 bed.start = atoi(lineVector[3].c_str()); | |
703 bed.end = atoi(lineVector[4].c_str()); | |
704 bed.name = lineVector[2]; | |
705 bed.score = lineVector[5]; | |
706 bed.strand = lineVector[6].c_str(); | |
707 bed.otherFields.push_back(lineVector[1]); // add GFF "source". unused in BED | |
708 bed.otherFields.push_back(lineVector[7]); // add GFF "fname". unused in BED | |
709 // handle the optional 9th field. | |
710 if (this->bedType == 9) | |
711 bed.otherFields.push_back(lineVector[8]); // add GFF "group". unused in BED | |
712 | |
713 // handle starts == end (e.g., insertions in reference genome) | |
714 if (bed.start == bed.end) { | |
715 bed.end++; | |
716 bed.zeroLength = true; | |
717 } | |
718 // GFF uses 1-based starts, covert to zero-based | |
719 bed.start--; | |
720 } | |
721 else { | |
722 cerr << "Error: unexpected number of fields at line: " << lineNum << | |
723 ". Verify that your files are TAB-delimited and that your GFF file has 8 or 9 fields. Exiting..." << endl; | |
724 exit(1); | |
725 } | |
726 if (bed.start > bed.end) { | |
727 cerr << "Error: malformed GFF entry at line " << lineNum << ". Start was greater than end. Exiting." << endl; | |
728 exit(1); | |
729 } | |
730 else if ( (bed.start < 0) || (bed.end < 0) ) { | |
731 cerr << "Error: malformed GFF entry at line " << lineNum << ". Coordinate detected that is < 1. Exiting." << endl; | |
732 exit(1); | |
733 } | |
734 else return true; | |
735 } | |
736 else if (numFields == 1) { | |
737 cerr << "Only one GFF field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
738 exit(1); | |
739 } | |
740 else if ((numFields != this->bedType) && (numFields != 0)) { | |
741 cerr << "Differing number of GFF fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
742 exit(1); | |
743 } | |
744 else if ((numFields < 8) && (numFields != 0)) { | |
745 cerr << "TAB delimited GFF file with 8 or 9 fields is required at line: "<< lineNum << ". Exiting..." << endl; | |
746 exit(1); | |
747 } | |
748 return false; | |
749 } | |
750 | |
751 | |
752 public: | |
753 | |
754 /* | |
755 reportBedTab | |
756 | |
757 Writes the _original_ BED entry with a TAB | |
758 at the end of the line. | |
759 Works for BED3 - BED6. | |
760 */ | |
761 template <typename T> | |
762 inline void reportBedTab(const T &bed) { | |
763 | |
764 // if it is azeroLength feature, we need to | |
765 // correct the start and end coords to what they were | |
766 // in the original file | |
767 CHRPOS start = bed.start; | |
768 CHRPOS end = bed.end; | |
769 if (bed.zeroLength) { | |
770 if (_isGff == false) | |
771 start++; | |
772 end--; | |
773 } | |
774 | |
775 // BED | |
776 if (_isGff == false && _isVcf == false) { | |
777 if (this->bedType == 3) { | |
778 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); | |
779 } | |
780 else if (this->bedType == 4) { | |
781 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
782 } | |
783 else if (this->bedType == 5) { | |
784 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
785 bed.score.c_str()); | |
786 } | |
787 else if (this->bedType == 6) { | |
788 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
789 bed.score.c_str(), bed.strand.c_str()); | |
790 } | |
791 else if (this->bedType > 6) { | |
792 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
793 bed.score.c_str(), bed.strand.c_str()); | |
794 | |
795 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
796 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
797 for ( ; othIt != othEnd; ++othIt) { | |
798 printf("%s\t", othIt->c_str()); | |
799 } | |
800 } | |
801 } | |
802 // VCF | |
803 else if (_isGff == false && _isVcf == true) { | |
804 printf ("%s\t%d\t", bed.chrom.c_str(), start+1); | |
805 | |
806 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
807 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
808 for ( ; othIt != othEnd; ++othIt) { | |
809 printf("%s\t", othIt->c_str()); | |
810 } | |
811 } | |
812 // GFF | |
813 else if (_isGff == true) { | |
814 // "GFF-8" | |
815 if (this->bedType == 8) { | |
816 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
817 bed.name.c_str(), start+1, end, | |
818 bed.score.c_str(), bed.strand.c_str(), | |
819 bed.otherFields[1].c_str()); | |
820 } | |
821 // "GFF-9" | |
822 else if (this->bedType == 9) { | |
823 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
824 bed.name.c_str(), start+1, end, | |
825 bed.score.c_str(), bed.strand.c_str(), | |
826 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
827 } | |
828 } | |
829 } | |
830 | |
831 | |
832 | |
833 /* | |
834 reportBedNewLine | |
835 | |
836 Writes the _original_ BED entry with a NEWLINE | |
837 at the end of the line. | |
838 Works for BED3 - BED6. | |
839 */ | |
840 template <typename T> | |
841 inline void reportBedNewLine(const T &bed) { | |
842 | |
843 // if it is azeroLength feature, we need to | |
844 // correct the start and end coords to what they were | |
845 // in the original file | |
846 CHRPOS start = bed.start; | |
847 CHRPOS end = bed.end; | |
848 if (bed.zeroLength) { | |
849 if (_isGff == false) | |
850 start++; | |
851 end--; | |
852 } | |
853 | |
854 //BED | |
855 if (_isGff == false && _isVcf == false) { | |
856 if (this->bedType == 3) { | |
857 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); | |
858 } | |
859 else if (this->bedType == 4) { | |
860 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
861 } | |
862 else if (this->bedType == 5) { | |
863 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
864 bed.score.c_str()); | |
865 } | |
866 else if (this->bedType == 6) { | |
867 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
868 bed.score.c_str(), bed.strand.c_str()); | |
869 } | |
870 else if (this->bedType > 6) { | |
871 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
872 bed.score.c_str(), bed.strand.c_str()); | |
873 | |
874 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
875 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
876 for ( ; othIt != othEnd; ++othIt) { | |
877 printf("\t%s", othIt->c_str()); | |
878 } | |
879 printf("\n"); | |
880 } | |
881 } | |
882 // VCF | |
883 else if (_isGff == false && _isVcf == true) { | |
884 printf ("%s\t%d\t", bed.chrom.c_str(), start+1); | |
885 | |
886 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
887 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
888 for ( ; othIt != othEnd; ++othIt) { | |
889 printf("%s\t", othIt->c_str()); | |
890 } | |
891 printf("\n"); | |
892 } | |
893 // GFF | |
894 else if (_isGff == true) { | |
895 // "GFF-8" | |
896 if (this->bedType == 8) { | |
897 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
898 bed.name.c_str(), start+1, end, | |
899 bed.score.c_str(), bed.strand.c_str(), | |
900 bed.otherFields[1].c_str()); | |
901 } | |
902 // "GFF-9" | |
903 else if (this->bedType == 9) { | |
904 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
905 bed.name.c_str(), start+1, end, | |
906 bed.score.c_str(), bed.strand.c_str(), | |
907 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
908 } | |
909 } | |
910 } | |
911 | |
912 | |
913 | |
914 /* | |
915 reportBedRangeNewLine | |
916 | |
917 Writes a custom start->end for a BED entry | |
918 with a NEWLINE at the end of the line. | |
919 | |
920 Works for BED3 - BED6. | |
921 */ | |
922 template <typename T> | |
923 inline void reportBedRangeTab(const T &bed, CHRPOS start, CHRPOS end) { | |
924 | |
925 // if it is azeroLength feature, we need to | |
926 // correct the start and end coords to what they were | |
927 // in the original file | |
928 if (bed.zeroLength) { | |
929 start = bed.start + 1; | |
930 end = bed.end - 1; | |
931 } | |
932 | |
933 // BED | |
934 if (_isGff == false && _isVcf == false) { | |
935 if (this->bedType == 3) { | |
936 printf ("%s\t%d\t%d\t", bed.chrom.c_str(), start, end); | |
937 } | |
938 else if (this->bedType == 4) { | |
939 printf ("%s\t%d\t%d\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
940 } | |
941 else if (this->bedType == 5) { | |
942 printf ("%s\t%d\t%d\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
943 bed.score.c_str()); | |
944 } | |
945 else if (this->bedType == 6) { | |
946 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
947 bed.score.c_str(), bed.strand.c_str()); | |
948 } | |
949 else if (this->bedType > 6) { | |
950 printf ("%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
951 bed.score.c_str(), bed.strand.c_str()); | |
952 | |
953 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
954 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
955 for ( ; othIt != othEnd; ++othIt) { | |
956 printf("%s\t", othIt->c_str()); | |
957 } | |
958 } | |
959 } | |
960 // VCF | |
961 else if (_isGff == false && _isVcf == true) { | |
962 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); | |
963 | |
964 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
965 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
966 for ( ; othIt != othEnd; ++othIt) { | |
967 printf("%s\t", othIt->c_str()); | |
968 } | |
969 } | |
970 // GFF | |
971 else if (_isGff == true) { | |
972 // "GFF-8" | |
973 if (this->bedType == 8) { | |
974 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
975 bed.name.c_str(), start+1, end, | |
976 bed.score.c_str(), bed.strand.c_str(), | |
977 bed.otherFields[1].c_str()); | |
978 } | |
979 // "GFF-9" | |
980 else if (this->bedType == 9) { | |
981 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
982 bed.name.c_str(), start+1, end, | |
983 bed.score.c_str(), bed.strand.c_str(), | |
984 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
985 } | |
986 } | |
987 } | |
988 | |
989 | |
990 | |
991 /* | |
992 reportBedRangeTab | |
993 | |
994 Writes a custom start->end for a BED entry | |
995 with a TAB at the end of the line. | |
996 | |
997 Works for BED3 - BED6. | |
998 */ | |
999 template <typename T> | |
1000 inline void reportBedRangeNewLine(const T &bed, CHRPOS start, CHRPOS end) { | |
1001 | |
1002 // if it is azeroLength feature, we need to | |
1003 // correct the start and end coords to what they were | |
1004 // in the original file | |
1005 if (bed.zeroLength) { | |
1006 start = bed.start + 1; | |
1007 end = bed.end - 1; | |
1008 } | |
1009 | |
1010 // BED | |
1011 if (_isGff == false && _isVcf == false) { | |
1012 if (this->bedType == 3) { | |
1013 printf ("%s\t%d\t%d\n", bed.chrom.c_str(), start, end); | |
1014 } | |
1015 else if (this->bedType == 4) { | |
1016 printf ("%s\t%d\t%d\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str()); | |
1017 } | |
1018 else if (this->bedType == 5) { | |
1019 printf ("%s\t%d\t%d\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1020 bed.score.c_str()); | |
1021 } | |
1022 else if (this->bedType == 6) { | |
1023 printf ("%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1024 bed.score.c_str(), bed.strand.c_str()); | |
1025 } | |
1026 else if (this->bedType > 6) { | |
1027 printf ("%s\t%d\t%d\t%s\t%s\t%s", bed.chrom.c_str(), start, end, bed.name.c_str(), | |
1028 bed.score.c_str(), bed.strand.c_str()); | |
1029 | |
1030 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
1031 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
1032 for ( ; othIt != othEnd; ++othIt) { | |
1033 printf("\t%s", othIt->c_str()); | |
1034 } | |
1035 printf("\n"); | |
1036 } | |
1037 } | |
1038 // VCF | |
1039 else if (_isGff == false && _isVcf == true) { | |
1040 printf ("%s\t%d\t", bed.chrom.c_str(), bed.start+1); | |
1041 | |
1042 vector<string>::const_iterator othIt = bed.otherFields.begin(); | |
1043 vector<string>::const_iterator othEnd = bed.otherFields.end(); | |
1044 for ( ; othIt != othEnd; ++othIt) { | |
1045 printf("%s\t", othIt->c_str()); | |
1046 } | |
1047 printf("\n"); | |
1048 } | |
1049 // GFF | |
1050 else if (_isGff == true) { | |
1051 // "GFF-9" | |
1052 if (this->bedType == 8) { | |
1053 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
1054 bed.name.c_str(), start+1, end, | |
1055 bed.score.c_str(), bed.strand.c_str(), | |
1056 bed.otherFields[1].c_str()); | |
1057 } | |
1058 // "GFF-8" | |
1059 else if (this->bedType == 9) { | |
1060 printf ("%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", bed.chrom.c_str(), bed.otherFields[0].c_str(), | |
1061 bed.name.c_str(), start+1, end, | |
1062 bed.score.c_str(), bed.strand.c_str(), | |
1063 bed.otherFields[1].c_str(), bed.otherFields[2].c_str()); | |
1064 } | |
1065 } | |
1066 } | |
1067 | |
1068 | |
1069 /* | |
1070 reportNullBedTab | |
1071 */ | |
1072 void reportNullBedTab() { | |
1073 | |
1074 if (_isGff == false && _isVcf == false) { | |
1075 if (this->bedType == 3) { | |
1076 printf (".\t-1\t-1\t"); | |
1077 } | |
1078 else if (this->bedType == 4) { | |
1079 printf (".\t-1\t-1\t.\t"); | |
1080 } | |
1081 else if (this->bedType == 5) { | |
1082 printf (".\t-1\t-1\t.\t-1\t"); | |
1083 } | |
1084 else if (this->bedType == 6) { | |
1085 printf (".\t-1\t-1\t.\t-1\t.\t"); | |
1086 } | |
1087 else if (this->bedType > 6) { | |
1088 printf (".\t-1\t-1\t.\t-1\t.\t"); | |
1089 for (unsigned int i = 6; i < this->bedType; ++i) { | |
1090 printf(".\t"); | |
1091 } | |
1092 } | |
1093 } | |
1094 else if (_isGff == true && _isVcf == false) { | |
1095 if (this->bedType == 8) { | |
1096 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t"); | |
1097 } | |
1098 else if (this->bedType == 9) { | |
1099 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\t"); | |
1100 } | |
1101 } | |
1102 } | |
1103 | |
1104 | |
1105 /* | |
1106 reportNullBedTab | |
1107 */ | |
1108 void reportNullBedNewLine() { | |
1109 | |
1110 if (_isGff == false && _isVcf == false) { | |
1111 if (this->bedType == 3) { | |
1112 printf (".\t-1\t-1\n"); | |
1113 } | |
1114 else if (this->bedType == 4) { | |
1115 printf (".\t-1\t-1\t.\n"); | |
1116 } | |
1117 else if (this->bedType == 5) { | |
1118 printf (".\t-1\t-1\t.\t-1\n"); | |
1119 } | |
1120 else if (this->bedType == 6) { | |
1121 printf (".\t-1\t-1\t.\t-1\t.\n"); | |
1122 } | |
1123 else if (this->bedType > 6) { | |
1124 printf (".\t-1\t-1\t.\t-1\t."); | |
1125 for (unsigned int i = 6; i < this->bedType; ++i) { | |
1126 printf("\t."); | |
1127 } | |
1128 printf("\n"); | |
1129 } | |
1130 } | |
1131 else if (_isGff == true && _isVcf == false) { | |
1132 if (this->bedType == 8) { | |
1133 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\n"); | |
1134 } | |
1135 else if (this->bedType == 9) { | |
1136 printf (".\t.\t.\t-1\t-1\t-1\t.\t.\t.\n"); | |
1137 } | |
1138 } | |
1139 } | |
1140 | |
1141 | |
1142 }; | |
1143 | |
1144 #endif /* BEDFILE_H */ |