diff BEDTools-Version-2.14.3/src/utils/chromsweep/chromsweep.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/utils/chromsweep/chromsweep.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,180 @@
+/*****************************************************************************
+  chromsweep.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "chromsweep.h"
+#include <queue>
+
+bool after(const BED &a, const BED &b);
+void report_hits(const BED &curr_qy, const vector<BED> &hits);
+vector<BED> scan_cache(const BED &curr_qy, BedLineStatus qy_status, const vector<BED> &db_cache, vector<BED> &hits);
+
+
+/*
+    // constructor using existing BedFile pointers
+*/
+ChromSweep::ChromSweep(BedFile *bedA, BedFile *bedB, bool sameStrand, bool diffStrand)
+: _bedA(bedA)
+, _bedB(bedB)
+, _sameStrand(sameStrand)
+, _diffStrand(diffStrand)
+{
+    // prime the results pump.
+    _qy_lineNum = 0;
+    _db_lineNum = 0;
+    
+    _hits.reserve(1000);
+    _cache.reserve(1000);
+    
+    _bedA->Open();
+    _bedB->Open();
+    _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
+    _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
+}
+
+/*
+    Constructor with filenames
+*/
+ChromSweep::ChromSweep(string &bedAFile, string &bedBFile) 
+{
+    // prime the results pump.
+    _qy_lineNum = 0;
+    _db_lineNum = 0;
+    
+    _hits.reserve(100000);
+    _cache.reserve(100000);
+    
+    _bedA = new BedFile(bedAFile);
+    _bedB = new BedFile(bedBFile);
+    
+    _bedA->Open();
+    _bedB->Open();
+    
+    _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
+    _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
+}
+
+
+/*
+    Destructor
+*/
+ChromSweep::~ChromSweep(void) {
+}
+
+
+void ChromSweep::ScanCache() {
+    if (_qy_status != BED_INVALID) {
+        vector<BED>::iterator c = _cache.begin();
+        while (c != _cache.end())
+        {
+            if ((_curr_qy.chrom == c->chrom) && !(after(_curr_qy, *c))) {
+                if (IsValidHit(_curr_qy, *c)) {
+                    _hits.push_back(*c);
+                }
+                ++c;
+            }
+            else {
+                c = _cache.erase(c);
+            }
+        }
+    }
+}
+
+
+bool ChromSweep::ChromChange()
+{
+    // the files are on the same chrom
+    if ((_curr_qy.chrom == _curr_db.chrom) || (_db_status == BED_INVALID) || (_qy_status == BED_INVALID)) {
+        return false;
+    }
+    // the query is ahead of the database. fast-forward the database to catch-up.
+    else if (_curr_qy.chrom > _curr_db.chrom) {
+        while (!_bedB->Empty() && _curr_db.chrom < _curr_qy.chrom)
+        {
+            _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
+        }
+        _cache.clear();
+        return false;
+    }
+    // the database is ahead of the query.
+    else {
+        // 1. scan the cache for remaining hits on the query's current chrom.
+        if (_curr_qy.chrom == _curr_chrom)
+        {
+            ScanCache();
+            _results.push(make_pair(_curr_qy, _hits));
+            _hits.clear();
+        }
+        // 2. fast-forward until we catch up and report 0 hits until we do.
+        else if (_curr_qy.chrom < _curr_db.chrom)
+        {
+            _results.push(make_pair(_curr_qy, _no_hits));
+            _cache.clear();
+        }
+        _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
+        _curr_chrom = _curr_qy.chrom;
+        return true;
+    }
+}
+
+bool ChromSweep::IsValidHit(const BED &query, const BED &db) {
+    // do we have an overlap in the DB?
+    if (overlaps(query.start, query.end, db.start, db.end) > 0) {
+        // Now test for necessary strandedness.
+        bool strands_are_same = (query.strand == db.strand);
+        if ( (_sameStrand == false && _diffStrand == false)
+             ||
+             (_sameStrand == true && strands_are_same == true)
+             ||
+             (_diffStrand == true && strands_are_same == false)
+           )
+        {
+            return true;
+        }
+    }
+    return false;
+}
+
+
+bool ChromSweep::Next(pair<BED, vector<BED> > &next) {
+    if (!_bedA->Empty()) {
+        // have we changed chromosomes?
+        if (ChromChange() == false) {
+            // scan the database cache for hits
+            ScanCache();
+            // advance the db until we are ahead of the query. update hits and cache as necessary
+            while (!_bedB->Empty() && _curr_qy.chrom == _curr_db.chrom && !(after(_curr_db, _curr_qy)))
+            {
+                if (IsValidHit(_curr_qy, _curr_db)) {
+                    _hits.push_back(_curr_db);
+                }
+                _cache.push_back(_curr_db);
+                _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
+            }
+            // add the hits for this query to the pump
+            _results.push(make_pair(_curr_qy, _hits));
+            // reset for the next query
+            _hits.clear();
+            _curr_qy = _nullBed;
+            _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
+            _curr_chrom = _curr_qy.chrom;
+        }
+    }
+    // report the next set if hits if there are still overlaps in the pump
+    if (!_results.empty()) {
+        next = _results.front();
+        _results.pop();
+        return true;
+    }
+    // otherwise, the party is over.
+    else {return false;}
+}
+