Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 bedToBam.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "bedFile.h" | |
14 #include "genomeFile.h" | |
15 #include "version.h" | |
16 | |
17 | |
18 #include "api/BamReader.h" | |
19 #include "api/BamAux.h" | |
20 #include "api/BamWriter.h" | |
21 using namespace BamTools; | |
22 | |
23 #include <vector> | |
24 #include <iostream> | |
25 #include <fstream> | |
26 #include <stdlib.h> | |
27 | |
28 using namespace std; | |
29 | |
30 | |
31 // define our program name | |
32 #define PROGRAM_NAME "bedToBam" | |
33 | |
34 // define our parameter checking macro | |
35 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) | |
36 | |
37 | |
38 // function declarations | |
39 void ShowHelp(void); | |
40 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam); | |
41 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int> &chromToId, bool isBED12, int mapQual, int lineNum); | |
42 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt); | |
43 int reg2bin(int beg, int end); | |
44 | |
45 | |
46 | |
47 int main(int argc, char* argv[]) { | |
48 | |
49 // our configuration variables | |
50 bool showHelp = false; | |
51 | |
52 // input files | |
53 string bedFile = "stdin"; | |
54 string genomeFile; | |
55 | |
56 unsigned int mapQual = 255; | |
57 | |
58 bool haveBed = true; | |
59 bool haveGenome = false; | |
60 bool haveMapQual = false; | |
61 bool isBED12 = false; | |
62 bool uncompressedBam = false; | |
63 | |
64 for(int i = 1; i < argc; i++) { | |
65 int parameterLength = (int)strlen(argv[i]); | |
66 | |
67 if((PARAMETER_CHECK("-h", 2, parameterLength)) || | |
68 (PARAMETER_CHECK("--help", 5, parameterLength))) { | |
69 showHelp = true; | |
70 } | |
71 } | |
72 | |
73 if(showHelp) ShowHelp(); | |
74 | |
75 // do some parsing (all of these parameters require 2 strings) | |
76 for(int i = 1; i < argc; i++) { | |
77 | |
78 int parameterLength = (int)strlen(argv[i]); | |
79 | |
80 if(PARAMETER_CHECK("-i", 2, parameterLength)) { | |
81 if ((i+1) < argc) { | |
82 bedFile = argv[i + 1]; | |
83 i++; | |
84 } | |
85 } | |
86 else if(PARAMETER_CHECK("-g", 2, parameterLength)) { | |
87 if ((i+1) < argc) { | |
88 haveGenome = true; | |
89 genomeFile = argv[i + 1]; | |
90 i++; | |
91 } | |
92 } | |
93 else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) { | |
94 haveMapQual = true; | |
95 if ((i+1) < argc) { | |
96 mapQual = atoi(argv[i + 1]); | |
97 i++; | |
98 } | |
99 } | |
100 else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { | |
101 isBED12 = true; | |
102 } | |
103 else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { | |
104 uncompressedBam = true; | |
105 } | |
106 else { | |
107 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; | |
108 showHelp = true; | |
109 } | |
110 } | |
111 | |
112 // make sure we have an input files | |
113 if (!haveBed ) { | |
114 cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; | |
115 showHelp = true; | |
116 } | |
117 if (!haveGenome ) { | |
118 cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl; | |
119 showHelp = true; | |
120 } | |
121 if (mapQual < 0 || mapQual > 255) { | |
122 cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl; | |
123 showHelp = true; | |
124 } | |
125 | |
126 | |
127 if (!showHelp) { | |
128 BedFile *bed = new BedFile(bedFile); | |
129 GenomeFile *genome = new GenomeFile(genomeFile); | |
130 | |
131 ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam); | |
132 } | |
133 else { | |
134 ShowHelp(); | |
135 } | |
136 } | |
137 | |
138 | |
139 void ShowHelp(void) { | |
140 | |
141 cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; | |
142 | |
143 cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; | |
144 | |
145 cerr << "Summary: Converts feature records to BAM format." << endl << endl; | |
146 | |
147 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; | |
148 | |
149 cerr << "Options: " << endl; | |
150 | |
151 cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl; | |
152 cerr << "\t\t(INT) Default: 255" << endl << endl; | |
153 | |
154 cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl; | |
155 cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl; | |
156 | |
157 cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; | |
158 | |
159 cerr << "Notes: " << endl; | |
160 cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; | |
161 | |
162 | |
163 // end the program here | |
164 exit(1); | |
165 } | |
166 | |
167 | |
168 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) { | |
169 | |
170 BamWriter *writer = new BamWriter(); | |
171 | |
172 // build a BAM header from the genomeFile | |
173 RefVector refs; | |
174 string bamHeader; | |
175 map<string, int, std::less<string> > chromToId; | |
176 MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId); | |
177 | |
178 // set compression mode | |
179 BamWriter::CompressionMode compressionMode = BamWriter::Compressed; | |
180 if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed; | |
181 writer->SetCompressionMode(compressionMode); | |
182 // open a BAM and add the reference headers to the BAM file | |
183 writer->Open("stdout", bamHeader, refs); | |
184 | |
185 | |
186 // process each BED entry and convert to BAM | |
187 BED bedEntry, nullBed; | |
188 int lineNum = 0; | |
189 BedLineStatus bedStatus; | |
190 // open the BED file for reading. | |
191 bed->Open(); | |
192 while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
193 if (bedStatus == BED_VALID) { | |
194 BamAlignment bamEntry; | |
195 if (bed->bedType >= 4) { | |
196 ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum); | |
197 writer->SaveAlignment(bamEntry); | |
198 } | |
199 else { | |
200 cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl; | |
201 exit (1); | |
202 } | |
203 bedEntry = nullBed; | |
204 } | |
205 } | |
206 //close up | |
207 bed->Close(); | |
208 writer->Close(); | |
209 } | |
210 | |
211 | |
212 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId, | |
213 bool isBED12, int mapQual, int lineNum) { | |
214 | |
215 bam.Name = bed.name; | |
216 bam.Position = bed.start; | |
217 bam.Bin = reg2bin(bed.start, bed.end); | |
218 | |
219 // hard-code the sequence and qualities. | |
220 int bedLength = bed.end - bed.start; | |
221 | |
222 // set dummy seq and qual strings. the input is BED, | |
223 // so the sequence is inherently the same as it's | |
224 // reference genome. | |
225 // Thanks to James M. Ward for pointing this out. | |
226 bam.QueryBases = ""; | |
227 bam.Qualities = ""; | |
228 | |
229 // chrom and map quality | |
230 bam.RefID = chromToId[bed.chrom]; | |
231 bam.MapQuality = mapQual; | |
232 | |
233 // set the BAM FLAG | |
234 bam.AlignmentFlag = 0; | |
235 if (bed.strand == "-") | |
236 bam.SetIsReverseStrand(true); | |
237 | |
238 bam.MatePosition = -1; | |
239 bam.InsertSize = 0; | |
240 bam.MateRefID = -1; | |
241 | |
242 bam.CigarData.clear(); | |
243 | |
244 if (isBED12 == false) { | |
245 CigarOp cOp; | |
246 cOp.Type = 'M'; | |
247 cOp.Length = bedLength; | |
248 bam.CigarData.push_back(cOp); | |
249 } | |
250 // we're being told that the input is BED12. | |
251 else{ | |
252 | |
253 // does it smell like BED12? if so, process it. | |
254 if (bed.otherFields.size() == 6) { | |
255 | |
256 // extract the relevant BED fields to convert BED12 to BAM | |
257 // namely: blockCount, blockStarts, blockEnds | |
258 unsigned int blockCount = atoi(bed.otherFields[3].c_str()); | |
259 | |
260 vector<int> blockSizes, blockStarts; | |
261 Tokenize(bed.otherFields[4], blockSizes, ","); | |
262 Tokenize(bed.otherFields[5], blockStarts, ","); | |
263 | |
264 // make sure this is a well-formed BED12 entry. | |
265 if (blockSizes.size() != blockCount) { | |
266 cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl; | |
267 exit (1); | |
268 } | |
269 else { | |
270 // does the first block start after the bed.start? | |
271 // if so, we need to do some "splicing" | |
272 if (blockStarts[0] > 0) { | |
273 CigarOp cOp; | |
274 cOp.Length = blockStarts[0]; | |
275 cOp.Type = 'N'; | |
276 bam.CigarData.push_back(cOp); | |
277 } | |
278 // handle the "middle" blocks | |
279 for (unsigned int i = 0; i < blockCount - 1; ++i) { | |
280 CigarOp cOp; | |
281 cOp.Length = blockSizes[i]; | |
282 cOp.Type = 'M'; | |
283 bam.CigarData.push_back(cOp); | |
284 | |
285 if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) { | |
286 CigarOp cOp; | |
287 cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i])); | |
288 cOp.Type = 'N'; | |
289 bam.CigarData.push_back(cOp); | |
290 } | |
291 } | |
292 // handle the last block. | |
293 CigarOp cOp; | |
294 cOp.Length = blockSizes[blockCount - 1]; | |
295 cOp.Type = 'M'; | |
296 bam.CigarData.push_back(cOp); | |
297 } | |
298 } | |
299 // it doesn't smell like BED12. complain. | |
300 else { | |
301 cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl; | |
302 exit(1); | |
303 } | |
304 } | |
305 } | |
306 | |
307 | |
308 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, | |
309 map<string, int, std::less<string> > &chromToId) { | |
310 | |
311 // make a genome map of the genome file. | |
312 GenomeFile genome(genomeFile); | |
313 | |
314 header += "@HD\tVN:1.0\tSO:unsorted\n"; | |
315 header += "@PG\tID:BEDTools_bedToBam\tVN:V"; | |
316 header += VERSION; | |
317 header += "\n"; | |
318 | |
319 int chromId = 0; | |
320 vector<string> chromList = genome.getChromList(); | |
321 sort(chromList.begin(), chromList.end()); | |
322 | |
323 // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file. | |
324 vector<string>::const_iterator genomeItr = chromList.begin(); | |
325 vector<string>::const_iterator genomeEnd = chromList.end(); | |
326 for (; genomeItr != genomeEnd; ++genomeItr) { | |
327 chromToId[*genomeItr] = chromId; | |
328 chromId++; | |
329 | |
330 // add to the header text | |
331 int size = genome.getChromSize(*genomeItr); | |
332 string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n"; | |
333 header += chromLine; | |
334 | |
335 // create a chrom entry and add it to | |
336 // the RefVector | |
337 RefData chrom; | |
338 chrom.RefName = *genomeItr; | |
339 chrom.RefLength = size; | |
340 refs.push_back(chrom); | |
341 } | |
342 } | |
343 | |
344 | |
345 /* Taken directly from the SAMTools spec | |
346 calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */ | |
347 int reg2bin(int beg, int end) { | |
348 --end; | |
349 if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); | |
350 if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); | |
351 if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); | |
352 if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); | |
353 if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); | |
354 return 0; | |
355 } | |
356 | |
357 |