diff BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/bedToBam/bedToBam.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,357 @@
+/*****************************************************************************
+  bedToBam.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "bedFile.h"
+#include "genomeFile.h"
+#include "version.h"
+
+
+#include "api/BamReader.h"
+#include "api/BamAux.h"
+#include "api/BamWriter.h"
+using namespace BamTools;
+
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <stdlib.h>
+
+using namespace std;
+
+
+// define our program name
+#define PROGRAM_NAME "bedToBam"
+
+// define our parameter checking macro
+#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)
+
+
+// function declarations
+void ShowHelp(void);
+void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam);
+void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int> &chromToId, bool isBED12, int mapQual, int lineNum);
+void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header, map<string, int> &chromToInt);
+int  reg2bin(int beg, int end);
+
+
+
+int main(int argc, char* argv[]) {
+
+    // our configuration variables
+    bool showHelp = false;
+
+    // input files
+    string bedFile = "stdin";
+    string genomeFile;
+
+    unsigned int mapQual = 255;
+
+    bool haveBed         = true;
+    bool haveGenome      = false;
+    bool haveMapQual     = false;
+    bool isBED12         = false;
+    bool uncompressedBam = false;
+
+    for(int i = 1; i < argc; i++) {
+        int parameterLength = (int)strlen(argv[i]);
+
+        if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
+        (PARAMETER_CHECK("--help", 5, parameterLength))) {
+            showHelp = true;
+        }
+    }
+
+    if(showHelp) ShowHelp();
+
+    // do some parsing (all of these parameters require 2 strings)
+    for(int i = 1; i < argc; i++) {
+
+        int parameterLength = (int)strlen(argv[i]);
+
+        if(PARAMETER_CHECK("-i", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                bedFile = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
+            if ((i+1) < argc) {
+                haveGenome = true;
+                genomeFile = argv[i + 1];
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-mapq", 5, parameterLength)) {
+            haveMapQual = true;
+            if ((i+1) < argc) {
+                mapQual = atoi(argv[i + 1]);
+                i++;
+            }
+        }
+        else if(PARAMETER_CHECK("-bed12", 6, parameterLength)) {
+            isBED12 = true;
+        }
+        else if(PARAMETER_CHECK("-ubam", 5, parameterLength)) {
+            uncompressedBam = true;
+        }
+        else {
+            cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
+            showHelp = true;
+        }
+    }
+
+    // make sure we have an input files
+    if (!haveBed ) {
+        cerr << endl << "*****" << endl << "*****ERROR: Need -i (BED) file. " << endl << "*****" << endl;
+        showHelp = true;
+    }
+    if (!haveGenome ) {
+        cerr << endl << "*****" << endl << "*****ERROR: Need -g (genome) file. " << endl << "*****" << endl;
+        showHelp = true;
+    }
+    if (mapQual < 0 || mapQual > 255) {
+        cerr << endl << "*****" << endl << "*****ERROR: MAPQ must be in range [0,255]. " << endl << "*****" << endl;
+        showHelp = true;
+    }
+
+
+    if (!showHelp) {
+        BedFile *bed       = new BedFile(bedFile);
+        GenomeFile *genome = new GenomeFile(genomeFile);
+
+        ProcessBed(cin, bed, genome, isBED12, mapQual, uncompressedBam);
+    }
+    else {
+        ShowHelp();
+    }
+}
+
+
+void ShowHelp(void) {
+
+    cerr << endl << "Program: " << PROGRAM_NAME << " (v" << VERSION << ")" << endl;
+
+    cerr << "Author:  Aaron Quinlan (aaronquinlan@gmail.com)" << endl;
+
+    cerr << "Summary: Converts feature records to BAM format." << endl << endl;
+
+    cerr << "Usage:   " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;
+
+    cerr << "Options: " << endl;
+
+    cerr << "\t-mapq\t" << "Set the mappinq quality for the BAM records." << endl;
+    cerr                    << "\t\t(INT) Default: 255" << endl << endl;
+
+    cerr << "\t-bed12\t"    << "The BED file is in BED12 format.  The BAM CIGAR" << endl;
+    cerr                    << "\t\tstring will reflect BED \"blocks\"." << endl << endl;
+
+    cerr << "\t-ubam\t"     << "Write uncompressed BAM output. Default is to write compressed BAM." << endl << endl;
+
+    cerr << "Notes: " << endl;
+    cerr << "\t(1)  BED files must be at least BED4 to be amenable to BAM (needs name field)." << endl << endl;
+
+
+    // end the program here
+    exit(1);
+}
+
+
+void ProcessBed(istream &bedInput, BedFile *bed, GenomeFile *genome, bool isBED12, int mapQual, bool uncompressedBam) {
+
+    BamWriter *writer = new BamWriter();
+
+    // build a BAM header from the genomeFile
+    RefVector refs;
+    string    bamHeader;
+    map<string, int, std::less<string> > chromToId;
+    MakeBamHeader(genome->getGenomeFileName(), refs, bamHeader, chromToId);
+
+    // set compression mode
+    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+    if ( uncompressedBam ) compressionMode = BamWriter::Uncompressed;
+    writer->SetCompressionMode(compressionMode);
+    // open a BAM and add the reference headers to the BAM file
+    writer->Open("stdout", bamHeader, refs);
+
+
+    // process each BED entry and convert to BAM
+    BED bedEntry, nullBed;
+    int lineNum = 0;
+    BedLineStatus bedStatus;
+    // open the BED file for reading.
+    bed->Open();
+    while ((bedStatus = bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            BamAlignment bamEntry;
+            if (bed->bedType >= 4) {
+                ConvertBedToBam(bedEntry, bamEntry, chromToId, isBED12, mapQual, lineNum);
+                writer->SaveAlignment(bamEntry);
+            }
+            else {
+                cerr << "Error: BED entry without name found at line: " << lineNum << ".  Exiting!" << endl;
+                exit (1);
+            }
+            bedEntry = nullBed;
+        }
+    }
+    //close up
+    bed->Close();
+    writer->Close();
+}
+
+
+void ConvertBedToBam(const BED &bed, BamAlignment &bam, map<string, int, std::less<string> > &chromToId,
+                     bool isBED12, int mapQual, int lineNum) {
+
+    bam.Name       = bed.name;
+    bam.Position   = bed.start;
+    bam.Bin        = reg2bin(bed.start, bed.end);
+
+    // hard-code the sequence and qualities.
+    int bedLength  = bed.end - bed.start;
+
+    // set dummy seq and qual strings.  the input is BED,
+    // so the sequence is inherently the same as it's
+    // reference genome.
+    // Thanks to James M. Ward for pointing this out.
+    bam.QueryBases = "";
+    bam.Qualities  = "";
+
+    // chrom and map quality
+    bam.RefID      = chromToId[bed.chrom];
+    bam.MapQuality = mapQual;
+
+    // set the BAM FLAG
+    bam.AlignmentFlag = 0;
+    if (bed.strand == "-")
+        bam.SetIsReverseStrand(true);
+
+    bam.MatePosition = -1;
+    bam.InsertSize   = 0;
+    bam.MateRefID    = -1;
+
+    bam.CigarData.clear();
+
+    if (isBED12 == false) {
+        CigarOp cOp;
+        cOp.Type = 'M';
+        cOp.Length = bedLength;
+        bam.CigarData.push_back(cOp);
+    }
+    // we're being told that the input is BED12.
+    else{
+
+        // does it smell like BED12?  if so, process it.
+        if (bed.otherFields.size() == 6) {
+
+            // extract the relevant BED fields to convert BED12 to BAM
+            // namely: blockCount, blockStarts, blockEnds
+            unsigned int blockCount = atoi(bed.otherFields[3].c_str());
+
+            vector<int> blockSizes, blockStarts;
+            Tokenize(bed.otherFields[4], blockSizes, ",");
+            Tokenize(bed.otherFields[5], blockStarts, ",");
+
+            // make sure this is a well-formed BED12 entry.
+            if (blockSizes.size() != blockCount) {
+                cerr << "Error: Number of BED blocks does not match blockCount at line: " << lineNum << ".  Exiting!" << endl;
+                exit (1);
+            }
+            else {
+                // does the first block start after the bed.start?
+                // if so, we need to do some "splicing"
+                if (blockStarts[0] > 0) {
+                    CigarOp cOp;
+                    cOp.Length = blockStarts[0];
+                    cOp.Type = 'N';
+                    bam.CigarData.push_back(cOp);
+                }
+                // handle the "middle" blocks
+                for (unsigned int i = 0; i < blockCount - 1; ++i) {
+                    CigarOp cOp;
+                    cOp.Length = blockSizes[i];
+                    cOp.Type = 'M';
+                    bam.CigarData.push_back(cOp);
+
+                    if (blockStarts[i+1] > (blockStarts[i] + blockSizes[i])) {
+                        CigarOp cOp;
+                        cOp.Length = (blockStarts[i+1] - (blockStarts[i] + blockSizes[i]));
+                        cOp.Type = 'N';
+                        bam.CigarData.push_back(cOp);
+                    }
+                }
+                // handle the last block.
+                CigarOp cOp;
+                cOp.Length = blockSizes[blockCount - 1];
+                cOp.Type = 'M';
+                bam.CigarData.push_back(cOp);
+            }
+        }
+        // it doesn't smell like BED12.  complain.
+        else {
+            cerr << "You've indicated that the input file is in BED12 format, yet the relevant fields cannot be found.  Exiting." << endl << endl;
+            exit(1);
+        }
+    }
+}
+
+
+void MakeBamHeader(const string &genomeFile, RefVector &refs, string &header,
+                   map<string, int, std::less<string> > &chromToId) {
+
+    // make a genome map of the genome file.
+    GenomeFile genome(genomeFile);
+
+    header += "@HD\tVN:1.0\tSO:unsorted\n";
+    header += "@PG\tID:BEDTools_bedToBam\tVN:V";
+    header += VERSION;
+    header += "\n";
+
+    int chromId = 0;
+    vector<string> chromList = genome.getChromList();
+    sort(chromList.begin(), chromList.end());
+
+    // create a BAM header (@SQ) entry for each chrom in the BEDTools genome file.
+    vector<string>::const_iterator genomeItr  = chromList.begin();
+    vector<string>::const_iterator genomeEnd  = chromList.end();
+    for (; genomeItr != genomeEnd; ++genomeItr) {
+        chromToId[*genomeItr] = chromId;
+        chromId++;
+
+        // add to the header text
+        int size = genome.getChromSize(*genomeItr);
+        string chromLine = "@SQ\tSN:" + *genomeItr + "\tAS:" + genomeFile + "\tLN:" + ToString(size) + "\n";
+        header += chromLine;
+
+        // create a chrom entry and add it to
+        // the RefVector
+        RefData chrom;
+        chrom.RefName            = *genomeItr;
+        chrom.RefLength          = size;
+        refs.push_back(chrom);
+    }
+}
+
+
+/* Taken directly from the SAMTools spec
+calculate bin given an alignment in [beg,end) (zero-based, half-close, half-open) */
+int reg2bin(int beg, int end) {
+    --end;
+    if (beg>>14 == end>>14) return ((1<<15)-1)/7 + (beg>>14);
+    if (beg>>17 == end>>17) return ((1<<12)-1)/7 + (beg>>17);
+    if (beg>>20 == end>>20) return ((1<<9)-1)/7 + (beg>>20);
+    if (beg>>23 == end>>23) return ((1<<6)-1)/7 + (beg>>23);
+    if (beg>>26 == end>>26) return ((1<<3)-1)/7 + (beg>>26);
+    return 0;
+}
+
+