diff BEDTools-Version-2.14.3/src/subtractBed/subtractBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/subtractBed/subtractBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,178 @@
+/*****************************************************************************
+  subtractBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "subtractBed.h"
+
+
+/*
+    Constructor
+*/
+BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float overlapFraction, bool sameStrand, bool diffStrand) {
+
+    _bedAFile = bedAFile;
+    _bedBFile = bedBFile;
+    _overlapFraction = overlapFraction;
+    _sameStrand = sameStrand;
+    _diffStrand = diffStrand;
+
+    _bedA = new BedFile(bedAFile);
+    _bedB = new BedFile(bedBFile);
+
+    SubtractBed();
+}
+
+
+/*
+    Destructor
+*/
+BedSubtract::~BedSubtract(void) {
+}
+
+
+void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
+
+    // find all of the overlaps between a and B.
+    _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
+
+    //  is A completely spanned by an entry in B?
+    //  if so, A should not be reported.
+    int numConsumedByB = 0;
+    int numOverlaps = 0;
+    vector<BED> bOverlaps;  // list of hits in B.  Special processing if there are multiple.
+
+    vector<BED>::const_iterator h = hits.begin();
+    vector<BED>::const_iterator hitsEnd = hits.end();
+    for (; h != hitsEnd; ++h) {
+
+        int s = max(a.start, h->start);
+        int e = min(a.end, h->end);
+        int overlapBases = (e - s);             // the number of overlapping bases b/w a and b
+        int aLength = (a.end - a.start);        // the length of a in b.p.
+
+        if (s < e) {
+
+            // is there enough overlap (default ~ 1bp)
+            float overlap = ((float) overlapBases / (float) aLength);
+
+            if (overlap >= 1.0) {
+                numOverlaps++;
+                numConsumedByB++;
+            }
+            else if ( overlap >= _overlapFraction ) {
+                numOverlaps++;
+                bOverlaps.push_back(*h);
+            }
+        }
+    }
+
+    if (numOverlaps == 0) {
+        // no overlap found, so just report A as-is.
+        _bedA->reportBedNewLine(a);
+    }
+    else if (numOverlaps == 1) {
+        // one overlap found.  only need to look at the single
+        // entry in bOverlaps.
+
+        // if A was not "consumed" by any entry in B
+        if (numConsumedByB == 0) {
+
+            BED theHit = bOverlaps[0];
+
+            // A    ++++++++++++
+            // B        ----
+            // Res. ====    ====
+            if ( (theHit.start > a.start) && (theHit.end < a.end) ) {
+                _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
+                _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
+            }
+            // A    ++++++++++++
+            // B    ----------
+            // Res.           ==
+            else if (theHit.start == a.start) {
+                _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
+            }
+            // A          ++++++++++++
+            // B    ----------
+            // Res.       ====
+            else if (theHit.start < a.start) {
+                _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
+            }
+            // A    ++++++++++++
+            // B           ----------
+            // Res. =======
+            else if (theHit.start > a.start) {
+                _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
+            }
+        }
+    }
+    else if (numOverlaps > 1) {
+        // multiple overlapz found.  look at all the hits
+        // and figure out which bases in A survived.  then
+        // report the contigous intervals that survived.
+
+        vector<bool> aKeep(a.end - a.start, true);
+
+        if (numConsumedByB == 0) {
+            // track the number of hit starts and ends at each position in A
+            for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) {
+                int s = max(a.start, h->start);
+                int e = min(a.end, h->end);
+
+                for (int i = s+1; i <= e; ++i) {
+                    aKeep[i-a.start-1] = false;
+                }
+            }
+            // report the remaining blocks.
+            for (unsigned int i = 0; i < aKeep.size(); ++i) {
+                if (aKeep[i] == true) {
+                    CHRPOS blockStart = i + a.start;
+                    while ((aKeep[i] == true) && (i < aKeep.size())) {
+                        i++;
+                    }
+                    CHRPOS blockEnd = i + a.start;
+                    blockEnd = min(a.end, blockEnd);
+                    _bedA->reportBedRangeNewLine(a,blockStart,blockEnd);
+                }
+            }
+        }
+    }
+}
+
+
+
+void BedSubtract::SubtractBed() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB->loadBedFileIntoMap();
+
+    BED a, nullBed;
+    BedLineStatus bedStatus;
+    int lineNum = 0;                    // current input line number
+    vector<BED> hits;                   // vector of potential hits
+    // reserve some space
+    hits.reserve(100);
+
+    _bedA->Open();
+    while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            FindAndSubtractOverlaps(a, hits);
+            hits.clear();
+            a = nullBed;
+        }
+    }
+    _bedA->Close();
+
+}
+// END Intersect
+
+