Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/bamToBed/bamToBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
line wrap: on
line diff
--- a/BEDTools-Version-2.14.3/src/bamToBed/bamToBed.cpp Thu Nov 03 10:25:04 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,564 +0,0 @@ -/***************************************************************************** - bamToBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "version.h" -#include "api/BamReader.h" -#include "api/BamAux.h" -#include "BamAncillary.h" -#include "bedFile.h" -using namespace BamTools; - -#include <vector> -#include <algorithm> // for swap() -#include <iostream> -#include <fstream> -#include <stdlib.h> - -using namespace std; - - -// define our program name -#define PROGRAM_NAME "bamToBed" - -// define our parameter checking macro -#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) - - -// function declarations -void ShowHelp(void); - -void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, - const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar); -void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance); - -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar); -void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color = "255,0,0"); -void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, - const RefVector &refs, bool useEditDistance); - -void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, - vector<int> &blockEnds, int &alignmentEnd); -string BuildCigarString(const vector<CigarOp> &cigar); - -bool IsCorrectMappingForBEDPE (const BamAlignment &bam); - - -int main(int argc, char* argv[]) { - - // our configuration variables - bool showHelp = false; - - // input files - string bamFile = "stdin"; - string color = "255,0,0"; - string tag = ""; - - bool haveBam = true; - bool haveColor = false; - bool haveOtherTag = false; - bool writeBedPE = false; - bool writeBed12 = false; - bool useEditDistance = false; - bool useAlignmentScore = false; - bool useCigar = false; - bool obeySplits = false; - - // check to see if we should print out some help - - for(int i = 1; i < argc; i++) { - int parameterLength = (int)strlen(argv[i]); - - if((PARAMETER_CHECK("-h", 2, parameterLength)) || - (PARAMETER_CHECK("--help", 5, parameterLength))) { - showHelp = true; - } - } - - if(showHelp) ShowHelp(); - - // do some parsing (all of these parameters require 2 strings) - for(int i = 1; i < argc; i++) { - - int parameterLength = (int)strlen(argv[i]); - - if(PARAMETER_CHECK("-i", 2, parameterLength)) { - if ((i+1) < argc) { - bamFile = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-bedpe", 6, parameterLength)) { - writeBedPE = true; - } - else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) { - writeBed12 = true; - } - else if(PARAMETER_CHECK("-split", 6, parameterLength)) { - obeySplits = true; - } - else if(PARAMETER_CHECK("-ed", 3, parameterLength)) { - useEditDistance = true; - } - else if(PARAMETER_CHECK("-cigar", 6, parameterLength)) { - useCigar = true; - } - else if(PARAMETER_CHECK("-as", 3, parameterLength)) { - useAlignmentScore = true; - } - else if(PARAMETER_CHECK("-color", 6, parameterLength)) { - if ((i+1) < argc) { - haveColor = true; - color = argv[i + 1]; - i++; - } - } - else if(PARAMETER_CHECK("-tag", 4, parameterLength)) { - if ((i+1) < argc) { - haveOtherTag = true; - tag = argv[i + 1]; - i++; - } - } - else { - cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; - showHelp = true; - } - } - - // make sure we have an input files - if (haveBam == false) { - cerr << endl << "*****" << endl << "*****ERROR: Need -i (BAM) file. " << endl << "*****" << endl; - showHelp = true; - } - if (haveColor == true && writeBed12 == false) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use color without BED12. " << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && obeySplits == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -splits. Edit distances cannot be computed for each \'chunk\'." << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && useCigar == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -cigar with -splits. Not yet supported." << endl << "*****" << endl; - showHelp = true; - } - if (useEditDistance == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -ed with -tag. Choose one or the other." << endl << "*****" << endl; - showHelp = true; - } - if (writeBedPE == true && haveOtherTag == true) { - cerr << endl << "*****" << endl << "*****ERROR: Cannot use -tag with -bedpe." << endl << "*****" << endl; - showHelp = true; - } - // if there are no problems, let's convert BAM to BED or BEDPE - if (!showHelp) { - if (writeBedPE == false) - ConvertBamToBed(bamFile, useEditDistance, tag, writeBed12, obeySplits, color, useCigar); // BED or "blocked BED" - else - ConvertBamToBedpe(bamFile, useEditDistance); // BEDPE - } - else { - ShowHelp(); - } -} - - -void ShowHelp(void) { - - cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl; - - cerr << "Author: Aaron Quinlan (aaronquinlan@gmail.com)" << endl; - - cerr << "Summary: Converts BAM alignments to BED6 or BEDPE format." << endl << endl; - - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bam> " << endl << endl; - - cerr << "Options: " << endl; - - cerr << "\t-bedpe\t" << "Write BEDPE format." << endl; - cerr << "\t\t- Requires BAM to be grouped or sorted by query." << endl << endl; - - cerr << "\t-bed12\t" << "Write \"blocked\" BED format (aka \"BED12\")." << endl << endl; - cerr << "\t\thttp://genome-test.cse.ucsc.edu/FAQ/FAQformat#format1" << endl << endl; - - cerr << "\t-split\t" << "Report \"split\" BAM alignments as separate BED entries." << endl << endl; - - cerr << "\t-ed\t" << "Use BAM edit distance (NM tag) for BED score." << endl; - cerr << "\t\t- Default for BED is to use mapping quality." << endl; - cerr << "\t\t- Default for BEDPE is to use the minimum of" << endl; - cerr << "\t\t the two mapping qualities for the pair." << endl; - cerr << "\t\t- When -ed is used with -bedpe, the total edit" << endl; - cerr << "\t\t distance from the two mates is reported." << endl << endl; - - cerr << "\t-tag\t" << "Use other NUMERIC BAM alignment tag for BED score." << endl; - cerr << "\t\t- Default for BED is to use mapping quality." << endl; - cerr << "\t\t Disallowed with BEDPE output." << endl << endl; - - cerr << "\t-color\t" << "An R,G,B string for the color used with BED12 format." << endl; - cerr << "\t\tDefault is (255,0,0)." << endl << endl; - - cerr << "\t-cigar\t" << "Add the CIGAR string to the BED entry as a 7th column." << endl << endl; - - - // end the program here - exit(1); -} - - -void ConvertBamToBed(const string &bamFile, const bool &useEditDistance, const string &bamTag, - const bool &writeBed12, const bool &obeySplits, const string &color, const bool &useCigar) { - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // rip through the BAM file and convert each mapped entry to BED - BamAlignment bam; - while (reader.GetNextAlignment(bam)) { - if (bam.IsMapped() == true) { - if (writeBed12 == false) // BED - PrintBed(bam, refs, useEditDistance, bamTag, obeySplits, useCigar); - else //"blocked" BED - PrintBed12(bam, refs, useEditDistance, bamTag, color); - } - } - reader.Close(); -} - - -/* - Assumptions: - 1. The BAM file is grouped/sorted by query name, - not alignment position -*/ -void ConvertBamToBedpe(const string &bamFile, const bool &useEditDistance) { - // open the BAM file - BamReader reader; - reader.Open(bamFile); - - // get header & reference information - string header = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // rip through the BAM file and convert each mapped entry to BEDPE - BamAlignment bam1, bam2; - while (reader.GetNextAlignment(bam1)) { - // the alignment must be paired - if (bam1.IsPaired() == true) { - // grab the second alignment for the pair. - reader.GetNextAlignment(bam2); - - // require that the alignments are from the same query - if (bam1.Name == bam2.Name) { - PrintBedPE(bam1, bam2, refs, useEditDistance); - } - else { - cerr << "*****ERROR: -bedpe requires BAM to be sorted/grouped by query name. " << endl; - exit(1); - } - } - } - reader.Close(); -} - - -void ParseCigarBed12(const vector<CigarOp> &cigar, vector<int> &blockStarts, vector<int> &blockLengths, unsigned int &alignmentEnd) { - - int currPosition = 0; - int blockLength = 0; - - // Rip through the CIGAR ops and figure out if there is more - // than one block for this alignment - vector<CigarOp>::const_iterator cigItr = cigar.begin(); - vector<CigarOp>::const_iterator cigEnd = cigar.end(); - for (; cigItr != cigEnd; ++cigItr) { - switch (cigItr->Type) { - case ('M') : - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('I') : break; - case ('S') : break; - case ('D') : break; - blockLength += cigItr->Length; - currPosition += cigItr->Length; - case ('P') : break; - case ('N') : - blockStarts.push_back(currPosition + cigItr->Length); - blockLengths.push_back(blockLength); - currPosition += cigItr->Length; - blockLength = 0; - case ('H') : break; // for 'H' - do nothing, move to next op - default : - printf("ERROR: Invalid Cigar op type\n"); // shouldn't get here - exit(1); - } - } - // add the kast block and set the - // alignment end (i.e., relative to the start) - blockLengths.push_back(blockLength); - alignmentEnd = currPosition; -} - - -string BuildCigarString(const vector<CigarOp> &cigar) { - - stringstream cigarString; - - for (size_t i = 0; i < cigar.size(); ++i) { - //cerr << cigar[i].Type << " " << cigar[i].Length << endl; - switch (cigar[i].Type) { - case ('M') : - case ('I') : - case ('D') : - case ('N') : - case ('S') : - case ('H') : - case ('P') : - cigarString << cigar[i].Length << cigar[i].Type; - } - } - return cigarString.str(); -} - - -void PrintBed(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, bool obeySplits, bool useCigar) { - - // set the strand - string strand = "+"; - if (bam.IsReverseStrand() == true) strand = "-"; - - // set the name of the feature based on the sequence - string name = bam.Name; - if (bam.IsFirstMate() == true) name += "/1"; - if (bam.IsSecondMate() == true) name += "/2"; - - // get the unpadded (parm = false) end position based on the CIGAR - unsigned int alignmentEnd = bam.GetEndPosition(false, false); - - // report the entire BAM footprint as a single BED entry - if (obeySplits == false) { - // report the alignment in BED6 format. - if (useEditDistance == false && bamTag == "") { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); - } - else if (useEditDistance == true && bamTag == "") { - uint32_t editDistance; - if (bam.GetTag("NM", editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - else if (useEditDistance == false && bamTag != "") { - int32_t tagValue; - if (bam.GetTag(bamTag, tagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), tagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } - } - - // does the user want CIGAR as well? - if (useCigar == false) { - printf("\n"); - } - else { - string cigar = BuildCigarString(bam.CigarData); - printf("\t%s\n", cigar.c_str()); - } - } - // Report each chunk of the BAM alignment as a discrete BED entry - // For example 10M100N10M would be reported as two seprate BED entries of length 10 - else { - vector<BED> bedBlocks; - // Load the alignment blocks in bam into the bedBlocks vector. - // Don't trigger a new block when a "D" (deletion) CIGAR op is found. - getBamBlocks(bam, refs, bedBlocks, false); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\n", refs.at(bam.RefID).RefName.c_str(), bedItr->start, - bedItr->end, name.c_str(), bam.MapQuality, strand.c_str()); - } - } -} - - -void PrintBed12(const BamAlignment &bam, const RefVector &refs, bool useEditDistance, const string &bamTag, string color) { - - // set the strand - string strand = "+"; - if (bam.IsReverseStrand()) strand = "-"; - - // set the name of the feature based on the sequence - string name = bam.Name; - if (bam.IsFirstMate()) name += "/1"; - if (bam.IsSecondMate()) name += "/2"; - - // parse the CIGAR string and figure out the alignment blocks - unsigned int alignmentEnd; - vector<int> blockLengths; - vector<int> blockStarts; - blockStarts.push_back(0); - - // extract the block starts and lengths from the CIGAR string - ParseCigarBed12(bam.CigarData, blockStarts, blockLengths, alignmentEnd); - alignmentEnd += bam.Position; - - // write BED6 portion - if (useEditDistance == false && bamTag == "") { - printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), bam.MapQuality, strand.c_str()); - } - else if (useEditDistance == true && bamTag != "") { - uint32_t editDistance; - if (bam.GetTag("NM", editDistance)) { - printf("%s\t%d\t%d\t\%s\t%u\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), editDistance, strand.c_str()); - } - else { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - else if (useEditDistance == false && bamTag != "") { - int32_t tagValue; - if (bam.GetTag(bamTag, tagValue)) { - printf("%s\t%d\t%d\t\%s\t%d\t%s\t", refs.at(bam.RefID).RefName.c_str(), bam.Position, - alignmentEnd, name.c_str(), tagValue, strand.c_str()); - } - else { - cerr << "The requested tag (" << bamTag << ") was not found in the BAM file. Exiting\n"; - exit(1); - } - } - - // write the colors, etc. - printf("%d\t%d\t%s\t%d\t", bam.Position, alignmentEnd, color.c_str(), (int) blockStarts.size()); - - // now write the lengths portion - unsigned int b; - for (b = 0; b < blockLengths.size() - 1; ++b) { - printf("%d,", blockLengths[b]); - } - printf("%d\t", blockLengths[b]); - - // now write the starts portion - for (b = 0; b < blockStarts.size() - 1; ++b) { - printf("%d,", blockStarts[b]); - } - printf("%d\n", blockStarts[b]); -} - - -void PrintBedPE(const BamAlignment &bam1, const BamAlignment &bam2, const RefVector &refs, bool useEditDistance) { - - // initialize BEDPE variables - string chrom1, chrom2, strand1, strand2; - int start1, start2, end1, end2; - uint32_t editDistance1, editDistance2; - start1 = start2 = end1 = end2 = -1; - chrom1 = chrom2 = strand1 = strand2 = "."; - editDistance1 = editDistance2 = 0; - uint16_t minMapQuality = 0; - - // extract relevant info for end 1 - if (bam1.IsMapped()) { - chrom1 = refs.at(bam1.RefID).RefName; - start1 = bam1.Position; - end1 = bam1.GetEndPosition(false); - strand1 = "+"; - if (bam1.IsReverseStrand()) strand1 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (useEditDistance == true && bam1.GetTag("NM", editDistance1) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - - // extract relevant info for end 2 - if (bam2.IsMapped()) { - chrom2 = refs.at(bam2.RefID).RefName; - start2 = bam2.Position; - end2 = bam2.GetEndPosition(false); - strand2 = "+"; - if (bam2.IsReverseStrand()) strand2 = "-"; - - // extract the edit distance from the NM tag - // if possible. otherwise, complain. - if (useEditDistance == true && bam2.GetTag("NM", editDistance2) == false) { - cerr << "The edit distance tag (NM) was not found in the BAM file. Please disable -ed. Exiting\n"; - exit(1); - } - } - - // swap the ends if necessary - if ( chrom1 > chrom2 || ((chrom1 == chrom2) && (start1 > start2)) ) { - swap(chrom1, chrom2); - swap(start1, start2); - swap(end1, end2); - swap(strand1, strand2); - } - - // report BEDPE using min mapQuality - if (useEditDistance == false) { - // compute the minimum mapping quality b/w the two ends of the pair. - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - minMapQuality = min(bam1.MapQuality, bam2.MapQuality); - - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), minMapQuality, strand1.c_str(), strand2.c_str()); - } - // report BEDPE using total edit distance - else { - uint16_t totalEditDistance = 0; - if (bam1.IsMapped() == true && bam2.IsMapped() == true) - totalEditDistance = editDistance1 + editDistance2; - else if (bam1.IsMapped() == true) - totalEditDistance = editDistance1; - else if (bam2.IsMapped() == true) - totalEditDistance = editDistance2; - - printf("%s\t%d\t%d\t\%s\t%d\t%d\t%s\t%d\t%s\t%s\n", - chrom1.c_str(), start1, end1, chrom2.c_str(), start2, end2, - bam1.Name.c_str(), totalEditDistance, strand1.c_str(), strand2.c_str()); - } -} - - -// deprecated. -bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { - - if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { - return true; - } - else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { - return true; - } - else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { - return true; - } - else return false; -}