comparison BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 bedToBam.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "bedFile.h"
14 #include "genomeFile.h"
15 #include "version.h"
16
17
18 #include "api/BamReader.h"
19 #include "api/BamAux.h"
20 #include "api/BamWriter.h"
21 using namespace BamTools;
22
23 #include <vector>
24 #include <iostream>
25 #include <fstream>
26 #include <stdlib.h>
27
28 using namespace std;
29
30
31 // define our program name
32 #define PROGRAM_NAME "bedToBam"
33
34 // define our parameter checking macro
35 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
36
37
38 // function declarations
39 void ShowHelp(void);
40 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam);
41 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int> &chromToId, bool isBED12, int mapQual, int lineNum);
42 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt);
43 int reg2bin(int beg, int end);
44
45
46
47 int main(int argc, char* argv[]) {
48
49 // our configuration variables
50 bool showHelp = false;
51
52 // input files
53 string bedFile = "stdin";
54 string genomeFile;
55
56 unsigned int mapQual = 255;
57
58 bool haveBed = true;
59 bool haveGenome = false;
60 bool haveMapQual = false;
61 bool isBED12 = false;
62 bool uncompressedBam = false;
63
64 for(int i = 1; i < argc; i++) {
65 int parameterLength = (int)strlen(argv[i]);
66
67 if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
68 (PARAMETER_CHECK("--help", 5, parameterLength))) {
69 showHelp = true;
70 }
71 }
72
73 if(showHelp) ShowHelp();
74
75 // do some parsing (all of these parameters require 2 strings)
76 for(int i = 1; i < argc; i++) {
77
78 int parameterLength = (int)strlen(argv[i]);
79
80 if(PARAMETER_CHECK("-i", 2, parameterLength)) {
81 if ((i+1) < argc) {
82 bedFile = argv[i + 1];
83 i++;
84 }
85 }
86 else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
87 if ((i+1) < argc) {
88 haveGenome = true;
89 genomeFile = argv[i + 1];
90 i++;
91 }
92 }
93 else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) {
94 haveMapQual = true;
95 if ((i+1) < argc) {
96 mapQual = atoi(argv[i + 1]);
97 i++;
98 }
99 }
100 else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) {
101 isBED12 = true;
102 }
103 else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) {
104 uncompressedBam = true;
105 }
106 else {
107 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
108 showHelp = true;
109 }
110 }
111
112 // make sure we have an input files
113 if (!haveBed ) {
114 cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl;
115 showHelp = true;
116 }
117 if (!haveGenome ) {
118 cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl;
119 showHelp = true;
120 }
121 if (mapQual < 0 || mapQual > 255) {
122 cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl;
123 showHelp = true;
124 }
125
126
127 if (!showHelp) {
128 BedFile *bed = new BedFile(bedFile);
129 GenomeFile *genome = new GenomeFile(genomeFile);
130
131 ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam);
132 }
133 else {
134 ShowHelp();
135 }
136 }
137
138
139 void ShowHelp(void) {
140
141 cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
142
143 cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
144
145 cerr << "Summary: Converts feature records to BAM format." << endl << endl;
146
147 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
148
149 cerr << "Options: " << endl;
150
151 cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl;
152 cerr << "\t\t(INT) Default: 255" << endl << endl;
153
154 cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl;
155 cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl;
156
157 cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl;
158
159 cerr << "Notes: " << endl;
160 cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl;
161
162
163 // end the program here
164 exit(1);
165 }
166
167
168 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) {
169
170 BamWriter *writer = new BamWriter();
171
172 // build a BAM header from the genomeFile
173 RefVector refs;
174 string bamHeader;
175 map<string, int, std::less<string> > chromToId;
176 MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId);
177
178 // set compression mode
179 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
180 if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed;
181 writer->SetCompressionMode(compressionMode);
182 // open a BAM and add the reference headers to the BAM file
183 writer->Open("stdout", bamHeader, refs);
184
185
186 // process each BED entry and convert to BAM
187 BED bedEntry, nullBed;
188 int lineNum = 0;
189 BedLineStatus bedStatus;
190 // open the BED file for reading.
191 bed->Open();
192 while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
193 if (bedStatus == BED_VALID) {
194 BamAlignment bamEntry;
195 if (bed->bedType >= 4) {
196 ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum);
197 writer->SaveAlignment(bamEntry);
198 }
199 else {
200 cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl;
201 exit (1);
202 }
203 bedEntry = nullBed;
204 }
205 }
206 //close up
207 bed->Close();
208 writer->Close();
209 }
210
211
212 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId,
213 bool isBED12, int mapQual, int lineNum) {
214
215 bam.Name = bed.name;
216 bam.Position = bed.start;
217 bam.Bin = reg2bin(bed.start, bed.end);
218
219 // hard-code the sequence and qualities.
220 int bedLength = bed.end - bed.start;
221
222 // set dummy seq and qual strings. the input is BED,
223 // so the sequence is inherently the same as it's
224 // reference genome.
225 // Thanks to James M. Ward for pointing this out.
226 bam.QueryBases = "";
227 bam.Qualities = "";
228
229 // chrom and map quality
230 bam.RefID = chromToId[bed.chrom];
231 bam.MapQuality = mapQual;
232
233 // set the BAM FLAG
234 bam.AlignmentFlag = 0;
235 if (bed.strand == "-")
236 bam.SetIsReverseStrand(true);
237
238 bam.MatePosition = -1;
239 bam.InsertSize = 0;
240 bam.MateRefID = -1;
241
242 bam.CigarData.clear();
243
244 if (isBED12 == false) {
245 CigarOp cOp;
246 cOp.Type = 'M';
247 cOp.Length = bedLength;
248 bam.CigarData.push_back(cOp);
249 }
250 // we're being told that the input is BED12.
251 else{
252
253 // does it smell like BED12? if so, process it.
254 if (bed.otherFields.size() == 6) {
255
256 // extract the relevant BED fields to convert BED12 to BAM
257 // namely: blockCount, blockStarts, blockEnds
258 unsigned int blockCount = atoi(bed.otherFields[3].c_str());
259
260 vector<int> blockSizes, blockStarts;
261 Tokenize(bed.otherFields[4], blockSizes, ",");
262 Tokenize(bed.otherFields[5], blockStarts, ",");
263
264 // make sure this is a well-formed BED12 entry.
265 if (blockSizes.size() != blockCount) {
266 cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl;
267 exit (1);
268 }
269 else {
270 // does the first block start after the bed.start?
271 // if so, we need to do some "splicing"
272 if (blockStarts[0] > 0) {
273 CigarOp cOp;
274 cOp.Length = blockStarts[0];
275 cOp.Type = 'N';
276 bam.CigarData.push_back(cOp);
277 }
278 // handle the "middle" blocks
279 for (unsigned int i = 0; i < blockCount - 1; ++i) {
280 CigarOp cOp;
281 cOp.Length = blockSizes[i];
282 cOp.Type = 'M';
283 bam.CigarData.push_back(cOp);
284
285 if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) {
286 CigarOp cOp;
287 cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i]));
288 cOp.Type = 'N';
289 bam.CigarData.push_back(cOp);
290 }
291 }
292 // handle the last block.
293 CigarOp cOp;
294 cOp.Length = blockSizes[blockCount - 1];
295 cOp.Type = 'M';
296 bam.CigarData.push_back(cOp);
297 }
298 }
299 // it doesn't smell like BED12. complain.
300 else {
301 cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl;
302 exit(1);
303 }
304 }
305 }
306
307
308 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header,
309 map<string, int, std::less<string> > &chromToId) {
310
311 // make a genome map of the genome file.
312 GenomeFile genome(genomeFile);
313
314 header += "@HD\tVN:1.0\tSO:unsorted\n";
315 header += "@PG\tID:BEDTools_bedToBam\tVN:V";
316 header += VERSION;
317 header += "\n";
318
319 int chromId = 0;
320 vector<string> chromList = genome.getChromList();
321 sort(chromList.begin(), chromList.end());
322
323 // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file.
324 vector<string>::const_iterator genomeItr = chromList.begin();
325 vector<string>::const_iterator genomeEnd = chromList.end();
326 for (; genomeItr != genomeEnd; ++genomeItr) {
327 chromToId[*genomeItr] = chromId;
328 chromId++;
329
330 // add to the header text
331 int size = genome.getChromSize(*genomeItr);
332 string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n";
333 header += chromLine;
334
335 // create a chrom entry and add it to
336 // the RefVector
337 RefData chrom;
338 chrom.RefName = *genomeItr;
339 chrom.RefLength = size;
340 refs.push_back(chrom);
341 }
342 }
343
344
345 /* Taken directly from the SAMTools spec
346 calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */
347 int reg2bin(int beg, int end) {
348 --end;
349 if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
350 if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
351 if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
352 if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
353 if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
354 return 0;
355 }
356
357