Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 bedToBam.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "bedFile.h" | |
| 14 #include "genomeFile.h" | |
| 15 #include "version.h" | |
| 16 | |
| 17 | |
| 18 #include "api/BamReader.h" | |
| 19 #include "api/BamAux.h" | |
| 20 #include "api/BamWriter.h" | |
| 21 using namespace BamTools; | |
| 22 | |
| 23 #include <vector> | |
| 24 #include <iostream> | |
| 25 #include <fstream> | |
| 26 #include <stdlib.h> | |
| 27 | |
| 28 using namespace std; | |
| 29 | |
| 30 | |
| 31 // define our program name | |
| 32 #define PROGRAM_NAME "bedToBam" | |
| 33 | |
| 34 // define our parameter checking macro | |
| 35 #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) | |
| 36 | |
| 37 | |
| 38 // function declarations | |
| 39 void ShowHelp(void); | |
| 40 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam); | |
| 41 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int> &chromToId, bool isBED12, int mapQual, int lineNum); | |
| 42 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt); | |
| 43 int reg2bin(int beg, int end); | |
| 44 | |
| 45 | |
| 46 | |
| 47 int main(int argc, char* argv[]) { | |
| 48 | |
| 49 // our configuration variables | |
| 50 bool showHelp = false; | |
| 51 | |
| 52 // input files | |
| 53 string bedFile = "stdin"; | |
| 54 string genomeFile; | |
| 55 | |
| 56 unsigned int mapQual = 255; | |
| 57 | |
| 58 bool haveBed = true; | |
| 59 bool haveGenome = false; | |
| 60 bool haveMapQual = false; | |
| 61 bool isBED12 = false; | |
| 62 bool uncompressedBam = false; | |
| 63 | |
| 64 for(int i = 1; i < argc; i++) { | |
| 65 int parameterLength = (int)strlen(argv[i]); | |
| 66 | |
| 67 if((PARAMETER_CHECK("-h", 2, parameterLength)) || | |
| 68 (PARAMETER_CHECK("--help", 5, parameterLength))) { | |
| 69 showHelp = true; | |
| 70 } | |
| 71 } | |
| 72 | |
| 73 if(showHelp) ShowHelp(); | |
| 74 | |
| 75 // do some parsing (all of these parameters require 2 strings) | |
| 76 for(int i = 1; i < argc; i++) { | |
| 77 | |
| 78 int parameterLength = (int)strlen(argv[i]); | |
| 79 | |
| 80 if(PARAMETER_CHECK("-i", 2, parameterLength)) { | |
| 81 if ((i+1) < argc) { | |
| 82 bedFile = argv[i + 1]; | |
| 83 i++; | |
| 84 } | |
| 85 } | |
| 86 else if(PARAMETER_CHECK("-g", 2, parameterLength)) { | |
| 87 if ((i+1) < argc) { | |
| 88 haveGenome = true; | |
| 89 genomeFile = argv[i + 1]; | |
| 90 i++; | |
| 91 } | |
| 92 } | |
| 93 else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) { | |
| 94 haveMapQual = true; | |
| 95 if ((i+1) < argc) { | |
| 96 mapQual = atoi(argv[i + 1]); | |
| 97 i++; | |
| 98 } | |
| 99 } | |
| 100 else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { | |
| 101 isBED12 = true; | |
| 102 } | |
| 103 else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { | |
| 104 uncompressedBam = true; | |
| 105 } | |
| 106 else { | |
| 107 cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; | |
| 108 showHelp = true; | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 // make sure we have an input files | |
| 113 if (!haveBed ) { | |
| 114 cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; | |
| 115 showHelp = true; | |
| 116 } | |
| 117 if (!haveGenome ) { | |
| 118 cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl; | |
| 119 showHelp = true; | |
| 120 } | |
| 121 if (mapQual < 0 || mapQual > 255) { | |
| 122 cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl; | |
| 123 showHelp = true; | |
| 124 } | |
| 125 | |
| 126 | |
| 127 if (!showHelp) { | |
| 128 BedFile *bed = new BedFile(bedFile); | |
| 129 GenomeFile *genome = new GenomeFile(genomeFile); | |
| 130 | |
| 131 ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam); | |
| 132 } | |
| 133 else { | |
| 134 ShowHelp(); | |
| 135 } | |
| 136 } | |
| 137 | |
| 138 | |
| 139 void ShowHelp(void) { | |
| 140 | |
| 141 cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; | |
| 142 | |
| 143 cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; | |
| 144 | |
| 145 cerr << "Summary: Converts feature records to BAM format." << endl << endl; | |
| 146 | |
| 147 cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; | |
| 148 | |
| 149 cerr << "Options: " << endl; | |
| 150 | |
| 151 cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl; | |
| 152 cerr << "\t\t(INT) Default: 255" << endl << endl; | |
| 153 | |
| 154 cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl; | |
| 155 cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl; | |
| 156 | |
| 157 cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; | |
| 158 | |
| 159 cerr << "Notes: " << endl; | |
| 160 cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; | |
| 161 | |
| 162 | |
| 163 // end the program here | |
| 164 exit(1); | |
| 165 } | |
| 166 | |
| 167 | |
| 168 void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) { | |
| 169 | |
| 170 BamWriter *writer = new BamWriter(); | |
| 171 | |
| 172 // build a BAM header from the genomeFile | |
| 173 RefVector refs; | |
| 174 string bamHeader; | |
| 175 map<string, int, std::less<string> > chromToId; | |
| 176 MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId); | |
| 177 | |
| 178 // set compression mode | |
| 179 BamWriter::CompressionMode compressionMode = BamWriter::Compressed; | |
| 180 if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed; | |
| 181 writer->SetCompressionMode(compressionMode); | |
| 182 // open a BAM and add the reference headers to the BAM file | |
| 183 writer->Open("stdout", bamHeader, refs); | |
| 184 | |
| 185 | |
| 186 // process each BED entry and convert to BAM | |
| 187 BED bedEntry, nullBed; | |
| 188 int lineNum = 0; | |
| 189 BedLineStatus bedStatus; | |
| 190 // open the BED file for reading. | |
| 191 bed->Open(); | |
| 192 while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { | |
| 193 if (bedStatus == BED_VALID) { | |
| 194 BamAlignment bamEntry; | |
| 195 if (bed->bedType >= 4) { | |
| 196 ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum); | |
| 197 writer->SaveAlignment(bamEntry); | |
| 198 } | |
| 199 else { | |
| 200 cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl; | |
| 201 exit (1); | |
| 202 } | |
| 203 bedEntry = nullBed; | |
| 204 } | |
| 205 } | |
| 206 //close up | |
| 207 bed->Close(); | |
| 208 writer->Close(); | |
| 209 } | |
| 210 | |
| 211 | |
| 212 void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId, | |
| 213 bool isBED12, int mapQual, int lineNum) { | |
| 214 | |
| 215 bam.Name = bed.name; | |
| 216 bam.Position = bed.start; | |
| 217 bam.Bin = reg2bin(bed.start, bed.end); | |
| 218 | |
| 219 // hard-code the sequence and qualities. | |
| 220 int bedLength = bed.end - bed.start; | |
| 221 | |
| 222 // set dummy seq and qual strings. the input is BED, | |
| 223 // so the sequence is inherently the same as it's | |
| 224 // reference genome. | |
| 225 // Thanks to James M. Ward for pointing this out. | |
| 226 bam.QueryBases = ""; | |
| 227 bam.Qualities = ""; | |
| 228 | |
| 229 // chrom and map quality | |
| 230 bam.RefID = chromToId[bed.chrom]; | |
| 231 bam.MapQuality = mapQual; | |
| 232 | |
| 233 // set the BAM FLAG | |
| 234 bam.AlignmentFlag = 0; | |
| 235 if (bed.strand == "-") | |
| 236 bam.SetIsReverseStrand(true); | |
| 237 | |
| 238 bam.MatePosition = -1; | |
| 239 bam.InsertSize = 0; | |
| 240 bam.MateRefID = -1; | |
| 241 | |
| 242 bam.CigarData.clear(); | |
| 243 | |
| 244 if (isBED12 == false) { | |
| 245 CigarOp cOp; | |
| 246 cOp.Type = 'M'; | |
| 247 cOp.Length = bedLength; | |
| 248 bam.CigarData.push_back(cOp); | |
| 249 } | |
| 250 // we're being told that the input is BED12. | |
| 251 else{ | |
| 252 | |
| 253 // does it smell like BED12? if so, process it. | |
| 254 if (bed.otherFields.size() == 6) { | |
| 255 | |
| 256 // extract the relevant BED fields to convert BED12 to BAM | |
| 257 // namely: blockCount, blockStarts, blockEnds | |
| 258 unsigned int blockCount = atoi(bed.otherFields[3].c_str()); | |
| 259 | |
| 260 vector<int> blockSizes, blockStarts; | |
| 261 Tokenize(bed.otherFields[4], blockSizes, ","); | |
| 262 Tokenize(bed.otherFields[5], blockStarts, ","); | |
| 263 | |
| 264 // make sure this is a well-formed BED12 entry. | |
| 265 if (blockSizes.size() != blockCount) { | |
| 266 cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl; | |
| 267 exit (1); | |
| 268 } | |
| 269 else { | |
| 270 // does the first block start after the bed.start? | |
| 271 // if so, we need to do some "splicing" | |
| 272 if (blockStarts[0] > 0) { | |
| 273 CigarOp cOp; | |
| 274 cOp.Length = blockStarts[0]; | |
| 275 cOp.Type = 'N'; | |
| 276 bam.CigarData.push_back(cOp); | |
| 277 } | |
| 278 // handle the "middle" blocks | |
| 279 for (unsigned int i = 0; i < blockCount - 1; ++i) { | |
| 280 CigarOp cOp; | |
| 281 cOp.Length = blockSizes[i]; | |
| 282 cOp.Type = 'M'; | |
| 283 bam.CigarData.push_back(cOp); | |
| 284 | |
| 285 if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) { | |
| 286 CigarOp cOp; | |
| 287 cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i])); | |
| 288 cOp.Type = 'N'; | |
| 289 bam.CigarData.push_back(cOp); | |
| 290 } | |
| 291 } | |
| 292 // handle the last block. | |
| 293 CigarOp cOp; | |
| 294 cOp.Length = blockSizes[blockCount - 1]; | |
| 295 cOp.Type = 'M'; | |
| 296 bam.CigarData.push_back(cOp); | |
| 297 } | |
| 298 } | |
| 299 // it doesn't smell like BED12. complain. | |
| 300 else { | |
| 301 cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl; | |
| 302 exit(1); | |
| 303 } | |
| 304 } | |
| 305 } | |
| 306 | |
| 307 | |
| 308 void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, | |
| 309 map<string, int, std::less<string> > &chromToId) { | |
| 310 | |
| 311 // make a genome map of the genome file. | |
| 312 GenomeFile genome(genomeFile); | |
| 313 | |
| 314 header += "@HD\tVN:1.0\tSO:unsorted\n"; | |
| 315 header += "@PG\tID:BEDTools_bedToBam\tVN:V"; | |
| 316 header += VERSION; | |
| 317 header += "\n"; | |
| 318 | |
| 319 int chromId = 0; | |
| 320 vector<string> chromList = genome.getChromList(); | |
| 321 sort(chromList.begin(), chromList.end()); | |
| 322 | |
| 323 // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file. | |
| 324 vector<string>::const_iterator genomeItr = chromList.begin(); | |
| 325 vector<string>::const_iterator genomeEnd = chromList.end(); | |
| 326 for (; genomeItr != genomeEnd; ++genomeItr) { | |
| 327 chromToId[*genomeItr] = chromId; | |
| 328 chromId++; | |
| 329 | |
| 330 // add to the header text | |
| 331 int size = genome.getChromSize(*genomeItr); | |
| 332 string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n"; | |
| 333 header += chromLine; | |
| 334 | |
| 335 // create a chrom entry and add it to | |
| 336 // the RefVector | |
| 337 RefData chrom; | |
| 338 chrom.RefName = *genomeItr; | |
| 339 chrom.RefLength = size; | |
| 340 refs.push_back(chrom); | |
| 341 } | |
| 342 } | |
| 343 | |
| 344 | |
| 345 /* Taken directly from the SAMTools spec | |
| 346 calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */ | |
| 347 int reg2bin(int beg, int end) { | |
| 348 --end; | |
| 349 if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); | |
| 350 if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); | |
| 351 if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); | |
| 352 if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); | |
| 353 if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); | |
| 354 return 0; | |
| 355 } | |
| 356 | |
| 357 |
