Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/bamToBed/bamToBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/bamToBed/bamToBed.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,564 @@ +/***************************************************************************** + bamToBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "version.h" +#include "api/BamReader.h" +#include "api/BamAux.h" +#include "BamAncillary.h" +#include "bedFile.h" +using namespace BamTools; + +#include <vector> +#include <algorithm> // for swap() +#include <iostream> +#include <fstream> +#include <stdlib.h> + +using namespace std; + + +// define our program name +#define PROGRAM_NAME "bamToBed" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + + +// function declarations +void ShowHelp(void); + +void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, + const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar); +void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); + +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar); +void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color = "255,0,0"); +void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, + const RefVector &refs, bool useEditDistance); + +void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, + vector<int> &blockEnds, int &alignmentEnd); +string BuildCigarString(const vector<CigarOp> &cigar); + +bool IsCorrectMappingForBEDPE (const BamAlignment &bam); + + +int main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bamFile = "stdin"; + string color = "255,0,0"; + string tag = ""; + + bool haveBam = true; + bool haveColor = false; + bool haveOtherTag = false; + bool writeBedPE = false; + bool writeBed12 = false; + bool useEditDistance = false; + bool useAlignmentScore = false; + bool useCigar = false; + bool obeySplits = false; + + // check to see if we should print out some help + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) ShowHelp(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bamFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { + writeBedPE = true; + } + else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { + writeBed12 = true; + } + else if(PARAMETER_CHECK("-split", 6, parameterLength)) { + obeySplits = true; + } + else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { + useEditDistance = true; + } + else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) { + useCigar = true; + } + else if(PARAMETER_CHECK("-as", 3, parameterLength)) { + useAlignmentScore = true; + } + else if(PARAMETER_CHECK("-color", 6, parameterLength)) { + if ((i+1) < argc) { + haveColor = true; + color = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-tag", 4, parameterLength)) { + if ((i+1) < argc) { + haveOtherTag = true; + tag = argv[i + 1]; + i++; + } + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have an input files + if (haveBam == false) { + cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; + showHelp = true; + } + if (haveColor == true && writeBed12 == false) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && obeySplits == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && useCigar == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl; + showHelp = true; + } + if (useEditDistance == true && haveOtherTag == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl; + showHelp = true; + } + if (writeBedPE == true && haveOtherTag == true) { + cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl; + showHelp = true; + } + // if there are no problems, let's convert BAM to BED or BEDPE + if (!showHelp) { + if (writeBedPE == false) + ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar); // BED or "blocked BED" + else + ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE + } + else { + ShowHelp(); + } +} + + +void ShowHelp(void) { + + cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; + + cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; + + cerr << "Summary: Converts BAM alignments to BED6 or BEDPE format." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; + + cerr << "Options: " << endl; + + cerr << "\t-bedpe\t" << "Write BEDPE format." << endl; + cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; + + cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; + cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; + + cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; + + cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; + cerr << "\t\t- Default for BED is to use mapping quality." << endl; + cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; + cerr << "\t\t the two mapping qualities for the pair." << endl; + cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl; + cerr << "\t\t distance from the two mates is reported." << endl << endl; + + cerr << "\t-tag\t" << "Use other NUMERIC BAM alignment tag for BED score." << endl; + cerr << "\t\t- Default for BED is to use mapping quality." << endl; + cerr << "\t\t Disallowed with BEDPE output." << endl << endl; + + cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl; + cerr << "\t\tDefault is (255,0,0)." << endl << endl; + + cerr << "\t-cigar\t" << "Add the CIGAR string to the BED entry as a 7th column." << endl << endl; + + + // end the program here + exit(1); +} + + +void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, + const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar) { + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BED + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + if (bam.IsMapped() == true) { + if (writeBed12 == false) // BED + PrintBed(bam, refs, useEditDistance, bamTag, obeySplits, useCigar); + else //"blocked" BED + PrintBed12(bam, refs, useEditDistance, bamTag, color); + } + } + reader.Close(); +} + + +/* + Assumptions: + 1. The BAM file is grouped/sorted by query name, + not alignment position +*/ +void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) { + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // rip through the BAM file and convert each mapped entry to BEDPE + BamAlignment bam1, bam2; + while (reader.GetNextAlignment(bam1)) { + // the alignment must be paired + if (bam1.IsPaired() == true) { + // grab the second alignment for the pair. + reader.GetNextAlignment(bam2); + + // require that the alignments are from the same query + if (bam1.Name == bam2.Name) { + PrintBedPE(bam1, bam2, refs, useEditDistance); + } + else { + cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl; + exit(1); + } + } + } + reader.Close(); +} + + +void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { + + int currPosition = 0; + int blockLength = 0; + + // Rip through the CIGAR ops and figure out if there is more + // than one block for this alignment + vector<CigarOp>::const_iterator cigItr = cigar.begin(); + vector<CigarOp>::const_iterator cigEnd = cigar.end(); + for (; cigItr != cigEnd; ++cigItr) { + switch (cigItr->Type) { + case ('M') : + blockLength += cigItr->Length; + currPosition += cigItr->Length; + case ('I') : break; + case ('S') : break; + case ('D') : break; + blockLength += cigItr->Length; + currPosition += cigItr->Length; + case ('P') : break; + case ('N') : + blockStarts.push_back(currPosition + cigItr->Length); + blockLengths.push_back(blockLength); + currPosition += cigItr->Length; + blockLength = 0; + case ('H') : break; // for 'H' - do nothing, move to next op + default : + printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here + exit(1); + } + } + // add the kast block and set the + // alignment end (i.e., relative to the start) + blockLengths.push_back(blockLength); + alignmentEnd = currPosition; +} + + +string BuildCigarString(const vector<CigarOp> &cigar) { + + stringstream cigarString; + + for (size_t i = 0; i < cigar.size(); ++i) { + //cerr << cigar[i].Type << " " << cigar[i].Length << endl; + switch (cigar[i].Type) { + case ('M') : + case ('I') : + case ('D') : + case ('N') : + case ('S') : + case ('H') : + case ('P') : + cigarString << cigar[i].Length << cigar[i].Type; + } + } + return cigarString.str(); +} + + +void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar) { + + // set the strand + string strand = "+"; + if (bam.IsReverseStrand() == true) strand = "-"; + + // set the name of the feature based on the sequence + string name = bam.Name; + if (bam.IsFirstMate() == true) name += "/1"; + if (bam.IsSecondMate() == true) name += "/2"; + + // get the unpadded (parm = false) end position based on the CIGAR + unsigned int alignmentEnd = bam.GetEndPosition(false, false); + + // report the entire BAM footprint as a single BED entry + if (obeySplits == false) { + // report the alignment in BED6 format. + if (useEditDistance == false && bamTag == "") { + printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + } + else if (useEditDistance == true && bamTag == "") { + uint32_t editDistance; + if (bam.GetTag("NM", editDistance)) { + printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), editDistance, strand.c_str()); + } + else { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + else if (useEditDistance == false && bamTag != "") { + int32_t tagValue; + if (bam.GetTag(bamTag, tagValue)) { + printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), tagValue, strand.c_str()); + } + else { + cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; + exit(1); + } + } + + // does the user want CIGAR as well? + if (useCigar == false) { + printf("\n"); + } + else { + string cigar = BuildCigarString(bam.CigarData); + printf("\t%s\n", cigar.c_str()); + } + } + // Report each chunk of the BAM alignment as a discrete BED entry + // For example 10M100N10M would be reported as two seprate BED entries of length 10 + else { + vector<BED> bedBlocks; + // Load the alignment blocks in bam into the bedBlocks vector. + // Don't trigger a new block when a "D" (deletion) CIGAR op is found. + getBamBlocks(bam, refs, bedBlocks, false); + + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, + bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); + } + } +} + + +void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color) { + + // set the strand + string strand = "+"; + if (bam.IsReverseStrand()) strand = "-"; + + // set the name of the feature based on the sequence + string name = bam.Name; + if (bam.IsFirstMate()) name += "/1"; + if (bam.IsSecondMate()) name += "/2"; + + // parse the CIGAR string and figure out the alignment blocks + unsigned int alignmentEnd; + vector<int> blockLengths; + vector<int> blockStarts; + blockStarts.push_back(0); + + // extract the block starts and lengths from the CIGAR string + ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); + alignmentEnd += bam.Position; + + // write BED6 portion + if (useEditDistance == false && bamTag == "") { + printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); + } + else if (useEditDistance == true && bamTag != "") { + uint32_t editDistance; + if (bam.GetTag("NM", editDistance)) { + printf("%s\t%d\t%d\t\%s\t%u\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), editDistance, strand.c_str()); + } + else { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + else if (useEditDistance == false && bamTag != "") { + int32_t tagValue; + if (bam.GetTag(bamTag, tagValue)) { + printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, + alignmentEnd, name.c_str(), tagValue, strand.c_str()); + } + else { + cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; + exit(1); + } + } + + // write the colors, etc. + printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size()); + + // now write the lengths portion + unsigned int b; + for (b = 0; b < blockLengths.size() - 1; ++b) { + printf("%d,", blockLengths[b]); + } + printf("%d\t", blockLengths[b]); + + // now write the starts portion + for (b = 0; b < blockStarts.size() - 1; ++b) { + printf("%d,", blockStarts[b]); + } + printf("%d\n", blockStarts[b]); +} + + +void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance) { + + // initialize BEDPE variables + string chrom1, chrom2, strand1, strand2; + int start1, start2, end1, end2; + uint32_t editDistance1, editDistance2; + start1 = start2 = end1 = end2 = -1; + chrom1 = chrom2 = strand1 = strand2 = "."; + editDistance1 = editDistance2 = 0; + uint16_t minMapQuality = 0; + + // extract relevant info for end 1 + if (bam1.IsMapped()) { + chrom1 = refs.at(bam1.RefID).RefName; + start1 = bam1.Position; + end1 = bam1.GetEndPosition(false); + strand1 = "+"; + if (bam1.IsReverseStrand()) strand1 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + + // extract relevant info for end 2 + if (bam2.IsMapped()) { + chrom2 = refs.at(bam2.RefID).RefName; + start2 = bam2.Position; + end2 = bam2.GetEndPosition(false); + strand2 = "+"; + if (bam2.IsReverseStrand()) strand2 = "-"; + + // extract the edit distance from the NM tag + // if possible. otherwise, complain. + if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { + cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; + exit(1); + } + } + + // swap the ends if necessary + if ( chrom1 > chrom2 || ((chrom1 == chrom2) && (start1 > start2)) ) { + swap(chrom1, chrom2); + swap(start1, start2); + swap(end1, end2); + swap(strand1, strand2); + } + + // report BEDPE using min mapQuality + if (useEditDistance == false) { + // compute the minimum mapping quality b/w the two ends of the pair. + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + minMapQuality = min(bam1.MapQuality, bam2.MapQuality); + + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); + } + // report BEDPE using total edit distance + else { + uint16_t totalEditDistance = 0; + if (bam1.IsMapped() == true && bam2.IsMapped() == true) + totalEditDistance = editDistance1 + editDistance2; + else if (bam1.IsMapped() == true) + totalEditDistance = editDistance1; + else if (bam2.IsMapped() == true) + totalEditDistance = editDistance2; + + printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", + chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, + bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); + } +} + + +// deprecated. +bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { + + if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { + return true; + } + else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { + return true; + } + else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { + return true; + } + else return false; +}