Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,357 @@ +/***************************************************************************** + bedToBam.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "bedFile.h" +#include "genomeFile.h" +#include "version.h" + + +#include "api/BamReader.h" +#include "api/BamAux.h" +#include "api/BamWriter.h" +using namespace BamTools; + +#include <vector> +#include <iostream> +#include <fstream> +#include <stdlib.h> + +using namespace std; + + +// define our program name +#define PROGRAM_NAME "bedToBam" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + + +// function declarations +void ShowHelp(void); +void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam); +void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int> &chromToId, bool isBED12, int mapQual, int lineNum); +void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt); +int reg2bin(int beg, int end); + + + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + unsigned int mapQual = 255; + + bool haveBed = true; + bool haveGenome = false; + bool haveMapQual = false; + bool isBED12 = false; + bool uncompressedBam = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) { + haveMapQual = true; + if ((i+1) < argc) { + mapQual = atoi(argv[i + 1]); + i++; + } + } + else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { + isBED12 = true; + } + else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) { + uncompressedBam = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (!haveBed ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl; + showHelp = true; + } + if (!haveGenome ) { + cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl; + showHelp = true; + } + if (mapQual < 0 || mapQual > 255) { + cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl; + showHelp = true; + } + + + if (!showHelp) { + BedFile *bed = new BedFile(bedFile); + GenomeFile *genome = new GenomeFile(genomeFile); + + ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam); + } + else { + ShowHelp(); + } +} + + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Converts feature records to BAM format." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl; + cerr << "\t\t(INT) Default: 255" << endl << endl; + + cerr << "\t-bed12\t" << "The BED file is in BED12 format. The BAM CIGAR" << endl; + cerr << "\t\tstring will reflect BED \"blocks\"." << endl << endl; + + cerr << "\t-ubam\t" << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl; + + + // end the program here + exit(1); +} + + +void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) { + + BamWriter *writer = new BamWriter(); + + // build a BAM header from the genomeFile + RefVector refs; + string bamHeader; + map<string, int, std::less<string> > chromToId; + MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId); + + // set compression mode + BamWriter::CompressionMode compressionMode = BamWriter::Compressed; + if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed; + writer->SetCompressionMode(compressionMode); + // open a BAM and add the reference headers to the BAM file + writer->Open("stdout", bamHeader, refs); + + + // process each BED entry and convert to BAM + BED bedEntry, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + // open the BED file for reading. + bed->Open(); + while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + BamAlignment bamEntry; + if (bed->bedType >= 4) { + ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum); + writer->SaveAlignment(bamEntry); + } + else { + cerr << "Error: BED entry without name found at line: " << lineNum << ". Exiting!" << endl; + exit (1); + } + bedEntry = nullBed; + } + } + //close up + bed->Close(); + writer->Close(); +} + + +void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId, + bool isBED12, int mapQual, int lineNum) { + + bam.Name = bed.name; + bam.Position = bed.start; + bam.Bin = reg2bin(bed.start, bed.end); + + // hard-code the sequence and qualities. + int bedLength = bed.end - bed.start; + + // set dummy seq and qual strings. the input is BED, + // so the sequence is inherently the same as it's + // reference genome. + // Thanks to James M. Ward for pointing this out. + bam.QueryBases = ""; + bam.Qualities = ""; + + // chrom and map quality + bam.RefID = chromToId[bed.chrom]; + bam.MapQuality = mapQual; + + // set the BAM FLAG + bam.AlignmentFlag = 0; + if (bed.strand == "-") + bam.SetIsReverseStrand(true); + + bam.MatePosition = -1; + bam.InsertSize = 0; + bam.MateRefID = -1; + + bam.CigarData.clear(); + + if (isBED12 == false) { + CigarOp cOp; + cOp.Type = 'M'; + cOp.Length = bedLength; + bam.CigarData.push_back(cOp); + } + // we're being told that the input is BED12. + else{ + + // does it smell like BED12? if so, process it. + if (bed.otherFields.size() == 6) { + + // extract the relevant BED fields to convert BED12 to BAM + // namely: blockCount, blockStarts, blockEnds + unsigned int blockCount = atoi(bed.otherFields[3].c_str()); + + vector<int> blockSizes, blockStarts; + Tokenize(bed.otherFields[4], blockSizes, ","); + Tokenize(bed.otherFields[5], blockStarts, ","); + + // make sure this is a well-formed BED12 entry. + if (blockSizes.size() != blockCount) { + cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ". Exiting!" << endl; + exit (1); + } + else { + // does the first block start after the bed.start? + // if so, we need to do some "splicing" + if (blockStarts[0] > 0) { + CigarOp cOp; + cOp.Length = blockStarts[0]; + cOp.Type = 'N'; + bam.CigarData.push_back(cOp); + } + // handle the "middle" blocks + for (unsigned int i = 0; i < blockCount - 1; ++i) { + CigarOp cOp; + cOp.Length = blockSizes[i]; + cOp.Type = 'M'; + bam.CigarData.push_back(cOp); + + if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) { + CigarOp cOp; + cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i])); + cOp.Type = 'N'; + bam.CigarData.push_back(cOp); + } + } + // handle the last block. + CigarOp cOp; + cOp.Length = blockSizes[blockCount - 1]; + cOp.Type = 'M'; + bam.CigarData.push_back(cOp); + } + } + // it doesn't smell like BED12. complain. + else { + cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found. Exiting." << endl << endl; + exit(1); + } + } +} + + +void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, + map<string, int, std::less<string> > &chromToId) { + + // make a genome map of the genome file. + GenomeFile genome(genomeFile); + + header += "@HD\tVN:1.0\tSO:unsorted\n"; + header += "@PG\tID:BEDTools_bedToBam\tVN:V"; + header += VERSION; + header += "\n"; + + int chromId = 0; + vector<string> chromList = genome.getChromList(); + sort(chromList.begin(), chromList.end()); + + // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file. + vector<string>::const_iterator genomeItr = chromList.begin(); + vector<string>::const_iterator genomeEnd = chromList.end(); + for (; genomeItr != genomeEnd; ++genomeItr) { + chromToId[*genomeItr] = chromId; + chromId++; + + // add to the header text + int size = genome.getChromSize(*genomeItr); + string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n"; + header += chromLine; + + // create a chrom entry and add it to + // the RefVector + RefData chrom; + chrom.RefName = *genomeItr; + chrom.RefLength = size; + refs.push_back(chrom); + } +} + + +/* Taken directly from the SAMTools spec +calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */ +int reg2bin(int beg, int end) { + --end; + if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14); + if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17); + if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20); + if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23); + if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26); + return 0; +} + +