diff BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,367 @@
+/*****************************************************************************
+  intersectBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "intersectBed.h"
+
+/************************************
+Helper functions
+************************************/
+bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
+
+    // how many overlaps are there b/w the bed and the set of hits?
+    CHRPOS s, e;
+    int overlapBases;
+    int  numOverlaps = 0;
+    bool hitsFound   = false;
+    int aLength      = (a.end - a.start);   // the length of a in b.p.
+
+    // loop through the hits and report those that meet the user's criteria
+    vector<BED>::const_iterator h       = hits.begin();
+    vector<BED>::const_iterator hitsEnd = hits.end();
+    for (; h != hitsEnd; ++h) {
+        s            = max(a.start, h->start);
+        e            = min(a.end, h->end);
+        overlapBases = (e - s);             // the number of overlapping bases b/w a and b
+
+        // is there enough overlap relative to the user's request? (default ~ 1bp)
+        if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
+            // Report the hit if the user doesn't care about reciprocal overlap between A and B.
+            if (_reciprocal == false) {
+                hitsFound = true;
+                numOverlaps++;
+                if (_printable == true)
+                    ReportOverlapDetail(overlapBases, a, *h, s, e);
+            }
+            // we require there to be sufficient __reciprocal__ overlap
+            else {
+                int bLength    = (h->end - h->start);
+                float bOverlap = ( (float) overlapBases / (float) bLength );
+                if (bOverlap >= _overlapFraction) {
+                    hitsFound = true;
+                    numOverlaps++;
+                    if (_printable == true)
+                        ReportOverlapDetail(overlapBases, a, *h, s, e);
+                }
+            }
+        }
+    }
+    // report the summary of the overlaps if requested.
+    ReportOverlapSummary(a, numOverlaps);
+    // were hits found for this BED feature?
+    return hitsFound;
+}
+
+
+/*
+    Constructor
+*/
+BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
+                           bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
+                           float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand,
+                           bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam,
+                           bool sortedInput) {
+
+    _bedAFile            = bedAFile;
+    _bedBFile            = bedBFile;
+    _anyHit              = anyHit;
+    _noHit               = noHit;
+    _writeA              = writeA;
+    _writeB              = writeB;
+    _writeOverlap        = writeOverlap;
+    _writeAllOverlap     = writeAllOverlap;
+    _writeCount          = writeCount;
+    _overlapFraction     = overlapFraction;
+    _sameStrand          = sameStrand;
+    _diffStrand          = diffStrand;
+    _reciprocal          = reciprocal;
+    _obeySplits          = obeySplits;
+    _bamInput            = bamInput;
+    _bamOutput           = bamOutput;
+    _isUncompressedBam   = isUncompressedBam;
+    _sortedInput         = sortedInput;
+
+    // should we print each overlap, or does the user want summary information?
+    _printable = true;
+    if (_anyHit || _noHit || _writeCount)
+        _printable = false;
+        
+    if (_bamInput == false)
+        IntersectBed();
+    else
+        IntersectBam(bedAFile);
+}
+
+
+/*
+    Destructor
+*/
+BedIntersect::~BedIntersect(void) {
+}
+
+
+bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
+    bool hitsFound = false;
+    
+    // collect and report the sufficient hits
+    _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
+    hitsFound = processHits(a, hits);
+    return hitsFound;
+}
+
+
+void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) {
+    // default. simple intersection only
+    if (_writeA == false && _writeB == false && _writeOverlap == false) {
+        _bedA->reportBedRangeNewLine(a,s,e);
+    }
+    //  -wa -wb write the original A and B
+    else if (_writeA == true && _writeB == true) {
+        _bedA->reportBedTab(a);
+        _bedB->reportBedNewLine(b);
+    }
+    // -wa write just the original A
+    else if (_writeA == true) {
+        _bedA->reportBedNewLine(a);
+    }
+    // -wb write the intersected portion of A and the original B
+    else if (_writeB == true) {
+        _bedA->reportBedRangeTab(a,s,e);
+        _bedB->reportBedNewLine(b);
+    }
+    // -wo write the original A and B plus the no. of overlapping bases.
+    else if (_writeOverlap == true) {
+        _bedA->reportBedTab(a);
+        _bedB->reportBedTab(b);
+        if (b.zeroLength == false) printf("%d\n", overlapBases);
+        else printf("0\n");
+    }
+}
+
+
+void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
+    // -u  just report the fact that there was >= 1 overlaps
+    if (_anyHit && (numOverlapsFound >= 1)) {
+        _bedA->reportBedNewLine(a);
+    }
+    // -c  report the total number of features overlapped in B
+    else if (_writeCount) {
+        _bedA->reportBedTab(a);
+        printf("%d\n", numOverlapsFound);
+    }
+    // -v  report iff there were no overlaps
+    else if (_noHit && (numOverlapsFound == 0)) {
+        _bedA->reportBedNewLine(a);
+    }
+    // -wao the user wants to force the reporting of 0 overlap
+    else if (_writeAllOverlap && (numOverlapsFound == 0)) {
+        _bedA->reportBedTab(a);
+        _bedB->reportNullBedTab();
+        printf("0\n");
+    }
+}
+
+
+bool BedIntersect::FindOneOrMoreOverlap(const BED &a) {
+    bool overlapsFound = false;
+    if (_reciprocal == false) {
+        overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
+                                                          _sameStrand, _diffStrand, _overlapFraction);
+    }
+    else {
+        overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
+                                                                    _sameStrand, _diffStrand, _overlapFraction);
+    }
+    return overlapsFound;
+}
+
+
+void BedIntersect::IntersectBed() {
+
+    // create new BED file objects for A and B
+    _bedA = new BedFile(_bedAFile);
+    _bedB = new BedFile(_bedBFile);
+
+    if (_sortedInput == false) {
+        // load the "B" file into a map in order to
+        // compare each entry in A to it in search of overlaps.
+        _bedB->loadBedFileIntoMap();
+
+        int lineNum = 0;
+        vector<BED> hits;
+        hits.reserve(100);
+        BED a, nullBed;
+        BedLineStatus bedStatus;
+
+        // open the "A" file, process each BED entry and searh for overlaps.
+        _bedA->Open();
+        while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
+            if (bedStatus == BED_VALID) {
+                // treat the BED as a single "block"
+                if (_obeySplits == false) {
+                    FindOverlaps(a, hits);
+                    hits.clear();
+                }
+                // split the BED12 into blocks and look for overlaps in each discrete block
+                else {
+                    bedVector bedBlocks;  // vec to store the discrete BED "blocks"
+                    splitBedIntoBlocks(a, lineNum, bedBlocks);
+
+                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
+                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
+                    for (; bedItr != bedEnd; ++bedItr) {
+                        FindOverlaps(*bedItr, hits);
+                        hits.clear();
+                    }
+                }
+            }
+            a = nullBed;
+        }
+        _bedA->Close();
+    }
+    else {
+        // use the chromsweep algorithm to detect overlaps on the fly.
+        ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand);
+
+        pair<BED, vector<BED> > hit_set;
+        hit_set.second.reserve(10000);
+        while (sweep.Next(hit_set)) {
+            processHits(hit_set.first, hit_set.second);
+        }
+    }
+}
+
+
+void BedIntersect::IntersectBam(string bamFile) {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB = new BedFile(_bedBFile);
+    _bedB->loadBedFileIntoMap();
+
+    // create a dummy BED A file for printing purposes if not
+    // using BAM output.
+    if (_bamOutput == false) {
+        _bedA = new BedFile(_bedAFile);
+        _bedA->bedType = 6;
+    }
+
+    // open the BAM file
+    BamReader reader;
+    BamWriter writer;
+    
+    reader.Open(bamFile);
+
+    // get header & reference information
+    string bamHeader  = reader.GetHeaderText();
+    RefVector refs    = reader.GetReferenceData();
+
+    // open a BAM output to stdout if we are writing BAM
+    if (_bamOutput == true) {
+        // set compression mode
+        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
+        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
+        writer.SetCompressionMode(compressionMode);
+        // open our BAM writer
+        writer.Open("stdout", bamHeader, refs);
+    }
+
+    vector<BED> hits;
+    // reserve some space
+    hits.reserve(100);
+
+    
+    BamAlignment bam;    
+    // get each set of alignments for each pair.
+    while (reader.GetNextAlignment(bam)) {
+
+        if (bam.IsMapped()) {
+            BED a;
+            a.chrom = refs.at(bam.RefID).RefName;
+            a.start = bam.Position;
+            a.end   = bam.GetEndPosition(false, false);
+
+            // build the name field from the BAM alignment.
+            a.name = bam.Name;
+            if (bam.IsFirstMate()) a.name += "/1";
+            if (bam.IsSecondMate()) a.name += "/2";
+
+            a.score  = ToString(bam.MapQuality);
+
+            a.strand = "+";
+            if (bam.IsReverseStrand()) a.strand = "-";
+
+            if (_bamOutput == true) {
+                bool overlapsFound = false;
+                // treat the BAM alignment as a single "block"
+                if (_obeySplits == false) {
+                    overlapsFound = FindOneOrMoreOverlap(a);
+                }
+                // split the BAM alignment into discrete blocks and
+                // look for overlaps only within each block.
+                else {
+                    bool overlapFoundForBlock;
+                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
+                    // we don't want to split on "D" ops, hence the "false"
+                    getBamBlocks(bam, refs, bedBlocks, false);
+
+                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
+                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
+                    for (; bedItr != bedEnd; ++bedItr) {
+                        overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr);
+                        if (overlapFoundForBlock == true)
+                            overlapsFound = true;
+                    }
+                }
+                if (overlapsFound == true) {
+                    if (_noHit == false)
+                        writer.SaveAlignment(bam);
+                }
+                else {
+                    if (_noHit == true) {
+                        writer.SaveAlignment(bam);
+                    }
+                }
+            }
+            else {
+                // treat the BAM alignment as a single BED "block"
+                if (_obeySplits == false) {
+                    FindOverlaps(a, hits);
+                    hits.clear();
+                }
+                // split the BAM alignment into discrete BED blocks and
+                // look for overlaps only within each block.
+                else {
+                    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
+                    getBamBlocks(bam, refs, bedBlocks, false);
+
+                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
+                    vector<BED>::const_iterator bedEnd  = bedBlocks.end();
+                    for (; bedItr != bedEnd; ++bedItr) {
+                        FindOverlaps(*bedItr, hits);
+                        hits.clear();
+                    }
+                }
+            }
+        }
+        // BAM IsMapped() is false
+        else if (_noHit == true) {
+            writer.SaveAlignment(bam);
+        }
+    }
+
+    // close the relevant BAM files.
+    reader.Close();
+    if (_bamOutput == true) {
+        writer.Close();
+    }
+}
+