comparison BEDTools-Version-2.14.3/src/bamToBed/bamToBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 bamToBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "version.h"
13 #include "api/BamReader.h"
14 #include "api/BamAux.h"
15 #include "BamAncillary.h"
16 #include "bedFile.h"
17 using namespace BamTools;
18
19 #include <vector>
20 #include <algorithm> // for swap()
21 #include <iostream>
22 #include <fstream>
23 #include <stdlib.h>
24
25 using namespace std;
26
27
28 // define our program name
29 #define PROGRAM_NAME "bamToBed"
30
31 // define our parameter checking macro
32 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
33
34
35 // function declarations
36 void ShowHelp(void);
37
38 void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag,
39 const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar);
40 void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance);
41
42 void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar);
43 void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color = "255,0,0");
44 void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2,
45 const RefVector &refs, bool useEditDistance);
46
47 void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts,
48 vector<int> &blockEnds, int &alignmentEnd);
49 string BuildCigarString(const vector<CigarOp> &cigar);
50
51 bool IsCorrectMappingForBEDPE (const BamAlignment &bam);
52
53
54 int main(int argc, char* argv[]) {
55
56 // our configuration variables
57 bool showHelp = false;
58
59 // input files
60 string bamFile = "stdin";
61 string color = "255,0,0";
62 string tag = "";
63
64 bool haveBam = true;
65 bool haveColor = false;
66 bool haveOtherTag = false;
67 bool writeBedPE = false;
68 bool writeBed12 = false;
69 bool useEditDistance = false;
70 bool useAlignmentScore = false;
71 bool useCigar = false;
72 bool obeySplits = false;
73
74 // check to see if we should print out some help
75
76 for(int i = 1; i < argc; i++) {
77 int parameterLength = (int)strlen(argv[i]);
78
79 if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
80 (PARAMETER_CHECK("--help", 5, parameterLength))) {
81 showHelp = true;
82 }
83 }
84
85 if(showHelp) ShowHelp();
86
87 // do some parsing (all of these parameters require 2 strings)
88 for(int i = 1; i < argc; i++) {
89
90 int parameterLength = (int)strlen(argv[i]);
91
92 if(PARAMETER_CHECK("-i", 2, parameterLength)) {
93 if ((i+1) < argc) {
94 bamFile = argv[i + 1];
95 i++;
96 }
97 }
98 else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) {
99 writeBedPE = true;
100 }
101 else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) {
102 writeBed12 = true;
103 }
104 else if(PARAMETER_CHECK("-split", 6, parameterLength)) {
105 obeySplits = true;
106 }
107 else if(PARAMETER_CHECK("-ed", 3, parameterLength)) {
108 useEditDistance = true;
109 }
110 else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) {
111 useCigar = true;
112 }
113 else if(PARAMETER_CHECK("-as", 3, parameterLength)) {
114 useAlignmentScore = true;
115 }
116 else if(PARAMETER_CHECK("-color", 6, parameterLength)) {
117 if ((i+1) < argc) {
118 haveColor = true;
119 color = argv[i + 1];
120 i++;
121 }
122 }
123 else if(PARAMETER_CHECK("-tag", 4, parameterLength)) {
124 if ((i+1) < argc) {
125 haveOtherTag = true;
126 tag = argv[i + 1];
127 i++;
128 }
129 }
130 else {
131 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
132 showHelp = true;
133 }
134 }
135
136 // make sure we have an input files
137 if (haveBam == false) {
138 cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl;
139 showHelp = true;
140 }
141 if (haveColor == true && writeBed12 == false) {
142 cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl;
143 showHelp = true;
144 }
145 if (useEditDistance == true && obeySplits == true) {
146 cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl;
147 showHelp = true;
148 }
149 if (useEditDistance == true && useCigar == true) {
150 cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl;
151 showHelp = true;
152 }
153 if (useEditDistance == true && haveOtherTag == true) {
154 cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl;
155 showHelp = true;
156 }
157 if (writeBedPE == true && haveOtherTag == true) {
158 cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl;
159 showHelp = true;
160 }
161 // if there are no problems, let's convert BAM to BED or BEDPE
162 if (!showHelp) {
163 if (writeBedPE == false)
164 ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar); // BED or "blocked BED"
165 else
166 ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE
167 }
168 else {
169 ShowHelp();
170 }
171 }
172
173
174 void ShowHelp(void) {
175
176 cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
177
178 cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
179
180 cerr << "Summary: Converts BAM alignments to BED6 or BEDPE format." << endl << endl;
181
182 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl;
183
184 cerr << "Options: " << endl;
185
186 cerr << "\t-bedpe\t" << "Write BEDPE format." << endl;
187 cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl;
188
189 cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl;
190 cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl;
191
192 cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl;
193
194 cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl;
195 cerr << "\t\t- Default for BED is to use mapping quality." << endl;
196 cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl;
197 cerr << "\t\t the two mapping qualities for the pair." << endl;
198 cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl;
199 cerr << "\t\t distance from the two mates is reported." << endl << endl;
200
201 cerr << "\t-tag\t" << "Use other NUMERIC BAM alignment tag for BED score." << endl;
202 cerr << "\t\t- Default for BED is to use mapping quality." << endl;
203 cerr << "\t\t Disallowed with BEDPE output." << endl << endl;
204
205 cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl;
206 cerr << "\t\tDefault is (255,0,0)." << endl << endl;
207
208 cerr << "\t-cigar\t" << "Add the CIGAR string to the BED entry as a 7th column." << endl << endl;
209
210
211 // end the program here
212 exit(1);
213 }
214
215
216 void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag,
217 const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar) {
218 // open the BAM file
219 BamReader reader;
220 reader.Open(bamFile);
221
222 // get header & reference information
223 string header = reader.GetHeaderText();
224 RefVector refs = reader.GetReferenceData();
225
226 // rip through the BAM file and convert each mapped entry to BED
227 BamAlignment bam;
228 while (reader.GetNextAlignment(bam)) {
229 if (bam.IsMapped() == true) {
230 if (writeBed12 == false) // BED
231 PrintBed(bam, refs, useEditDistance, bamTag, obeySplits, useCigar);
232 else //"blocked" BED
233 PrintBed12(bam, refs, useEditDistance, bamTag, color);
234 }
235 }
236 reader.Close();
237 }
238
239
240 /*
241 Assumptions:
242 1. The BAM file is grouped/sorted by query name,
243 not alignment position
244 */
245 void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) {
246 // open the BAM file
247 BamReader reader;
248 reader.Open(bamFile);
249
250 // get header & reference information
251 string header = reader.GetHeaderText();
252 RefVector refs = reader.GetReferenceData();
253
254 // rip through the BAM file and convert each mapped entry to BEDPE
255 BamAlignment bam1, bam2;
256 while (reader.GetNextAlignment(bam1)) {
257 // the alignment must be paired
258 if (bam1.IsPaired() == true) {
259 // grab the second alignment for the pair.
260 reader.GetNextAlignment(bam2);
261
262 // require that the alignments are from the same query
263 if (bam1.Name == bam2.Name) {
264 PrintBedPE(bam1, bam2, refs, useEditDistance);
265 }
266 else {
267 cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl;
268 exit(1);
269 }
270 }
271 }
272 reader.Close();
273 }
274
275
276 void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) {
277
278 int currPosition = 0;
279 int blockLength = 0;
280
281 // Rip through the CIGAR ops and figure out if there is more
282 // than one block for this alignment
283 vector<CigarOp>::const_iterator cigItr = cigar.begin();
284 vector<CigarOp>::const_iterator cigEnd = cigar.end();
285 for (; cigItr != cigEnd; ++cigItr) {
286 switch (cigItr->Type) {
287 case ('M') :
288 blockLength += cigItr->Length;
289 currPosition += cigItr->Length;
290 case ('I') : break;
291 case ('S') : break;
292 case ('D') : break;
293 blockLength += cigItr->Length;
294 currPosition += cigItr->Length;
295 case ('P') : break;
296 case ('N') :
297 blockStarts.push_back(currPosition + cigItr->Length);
298 blockLengths.push_back(blockLength);
299 currPosition += cigItr->Length;
300 blockLength = 0;
301 case ('H') : break; // for 'H' - do nothing, move to next op
302 default :
303 printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here
304 exit(1);
305 }
306 }
307 // add the kast block and set the
308 // alignment end (i.e., relative to the start)
309 blockLengths.push_back(blockLength);
310 alignmentEnd = currPosition;
311 }
312
313
314 string BuildCigarString(const vector<CigarOp> &cigar) {
315
316 stringstream cigarString;
317
318 for (size_t i = 0; i < cigar.size(); ++i) {
319 //cerr << cigar[i].Type << " " << cigar[i].Length << endl;
320 switch (cigar[i].Type) {
321 case ('M') :
322 case ('I') :
323 case ('D') :
324 case ('N') :
325 case ('S') :
326 case ('H') :
327 case ('P') :
328 cigarString << cigar[i].Length << cigar[i].Type;
329 }
330 }
331 return cigarString.str();
332 }
333
334
335 void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar) {
336
337 // set the strand
338 string strand = "+";
339 if (bam.IsReverseStrand() == true) strand = "-";
340
341 // set the name of the feature based on the sequence
342 string name = bam.Name;
343 if (bam.IsFirstMate() == true) name += "/1";
344 if (bam.IsSecondMate() == true) name += "/2";
345
346 // get the unpadded (parm = false) end position based on the CIGAR
347 unsigned int alignmentEnd = bam.GetEndPosition(false, false);
348
349 // report the entire BAM footprint as a single BED entry
350 if (obeySplits == false) {
351 // report the alignment in BED6 format.
352 if (useEditDistance == false && bamTag == "") {
353 printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position,
354 alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str());
355 }
356 else if (useEditDistance == true && bamTag == "") {
357 uint32_t editDistance;
358 if (bam.GetTag("NM", editDistance)) {
359 printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position,
360 alignmentEnd, name.c_str(), editDistance, strand.c_str());
361 }
362 else {
363 cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n";
364 exit(1);
365 }
366 }
367 else if (useEditDistance == false && bamTag != "") {
368 int32_t tagValue;
369 if (bam.GetTag(bamTag, tagValue)) {
370 printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position,
371 alignmentEnd, name.c_str(), tagValue, strand.c_str());
372 }
373 else {
374 cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n";
375 exit(1);
376 }
377 }
378
379 // does the user want CIGAR as well?
380 if (useCigar == false) {
381 printf("\n");
382 }
383 else {
384 string cigar = BuildCigarString(bam.CigarData);
385 printf("\t%s\n", cigar.c_str());
386 }
387 }
388 // Report each chunk of the BAM alignment as a discrete BED entry
389 // For example 10M100N10M would be reported as two seprate BED entries of length 10
390 else {
391 vector<BED> bedBlocks;
392 // Load the alignment blocks in bam into the bedBlocks vector.
393 // Don't trigger a new block when a "D" (deletion) CIGAR op is found.
394 getBamBlocks(bam, refs, bedBlocks, false);
395
396 vector<BED>::const_iterator bedItr = bedBlocks.begin();
397 vector<BED>::const_iterator bedEnd = bedBlocks.end();
398 for (; bedItr != bedEnd; ++bedItr) {
399 printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start,
400 bedItr->end, name.c_str(), bam.MapQuality, strand.c_str());
401 }
402 }
403 }
404
405
406 void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color) {
407
408 // set the strand
409 string strand = "+";
410 if (bam.IsReverseStrand()) strand = "-";
411
412 // set the name of the feature based on the sequence
413 string name = bam.Name;
414 if (bam.IsFirstMate()) name += "/1";
415 if (bam.IsSecondMate()) name += "/2";
416
417 // parse the CIGAR string and figure out the alignment blocks
418 unsigned int alignmentEnd;
419 vector<int> blockLengths;
420 vector<int> blockStarts;
421 blockStarts.push_back(0);
422
423 // extract the block starts and lengths from the CIGAR string
424 ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd);
425 alignmentEnd += bam.Position;
426
427 // write BED6 portion
428 if (useEditDistance == false && bamTag == "") {
429 printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position,
430 alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str());
431 }
432 else if (useEditDistance == true && bamTag != "") {
433 uint32_t editDistance;
434 if (bam.GetTag("NM", editDistance)) {
435 printf("%s\t%d\t%d\t\%s\t%u\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position,
436 alignmentEnd, name.c_str(), editDistance, strand.c_str());
437 }
438 else {
439 cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n";
440 exit(1);
441 }
442 }
443 else if (useEditDistance == false && bamTag != "") {
444 int32_t tagValue;
445 if (bam.GetTag(bamTag, tagValue)) {
446 printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position,
447 alignmentEnd, name.c_str(), tagValue, strand.c_str());
448 }
449 else {
450 cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n";
451 exit(1);
452 }
453 }
454
455 // write the colors, etc.
456 printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size());
457
458 // now write the lengths portion
459 unsigned int b;
460 for (b = 0; b < blockLengths.size() - 1; ++b) {
461 printf("%d,", blockLengths[b]);
462 }
463 printf("%d\t", blockLengths[b]);
464
465 // now write the starts portion
466 for (b = 0; b < blockStarts.size() - 1; ++b) {
467 printf("%d,", blockStarts[b]);
468 }
469 printf("%d\n", blockStarts[b]);
470 }
471
472
473 void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance) {
474
475 // initialize BEDPE variables
476 string chrom1, chrom2, strand1, strand2;
477 int start1, start2, end1, end2;
478 uint32_t editDistance1, editDistance2;
479 start1 = start2 = end1 = end2 = -1;
480 chrom1 = chrom2 = strand1 = strand2 = ".";
481 editDistance1 = editDistance2 = 0;
482 uint16_t minMapQuality = 0;
483
484 // extract relevant info for end 1
485 if (bam1.IsMapped()) {
486 chrom1 = refs.at(bam1.RefID).RefName;
487 start1 = bam1.Position;
488 end1 = bam1.GetEndPosition(false);
489 strand1 = "+";
490 if (bam1.IsReverseStrand()) strand1 = "-";
491
492 // extract the edit distance from the NM tag
493 // if possible. otherwise, complain.
494 if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) {
495 cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n";
496 exit(1);
497 }
498 }
499
500 // extract relevant info for end 2
501 if (bam2.IsMapped()) {
502 chrom2 = refs.at(bam2.RefID).RefName;
503 start2 = bam2.Position;
504 end2 = bam2.GetEndPosition(false);
505 strand2 = "+";
506 if (bam2.IsReverseStrand()) strand2 = "-";
507
508 // extract the edit distance from the NM tag
509 // if possible. otherwise, complain.
510 if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) {
511 cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n";
512 exit(1);
513 }
514 }
515
516 // swap the ends if necessary
517 if ( chrom1 > chrom2 || ((chrom1 == chrom2) && (start1 > start2)) ) {
518 swap(chrom1, chrom2);
519 swap(start1, start2);
520 swap(end1, end2);
521 swap(strand1, strand2);
522 }
523
524 // report BEDPE using min mapQuality
525 if (useEditDistance == false) {
526 // compute the minimum mapping quality b/w the two ends of the pair.
527 if (bam1.IsMapped() == true && bam2.IsMapped() == true)
528 minMapQuality = min(bam1.MapQuality, bam2.MapQuality);
529
530 printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n",
531 chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2,
532 bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str());
533 }
534 // report BEDPE using total edit distance
535 else {
536 uint16_t totalEditDistance = 0;
537 if (bam1.IsMapped() == true && bam2.IsMapped() == true)
538 totalEditDistance = editDistance1 + editDistance2;
539 else if (bam1.IsMapped() == true)
540 totalEditDistance = editDistance1;
541 else if (bam2.IsMapped() == true)
542 totalEditDistance = editDistance2;
543
544 printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n",
545 chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2,
546 bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str());
547 }
548 }
549
550
551 // deprecated.
552 bool IsCorrectMappingForBEDPE (const BamAlignment &bam) {
553
554 if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) {
555 return true;
556 }
557 else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) {
558 return true;
559 }
560 else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) {
561 return true;
562 }
563 else return false;
564 }