view BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line source

/*****************************************************************************
  closestBed.cpp

  (c) 2009 - Aaron Quinlan
  Hall Laboratory
  Department of Biochemistry and Molecular Genetics
  University of Virginia
  aaronquinlan@gmail.com

  Licenced under the GNU General Public License 2.0 license.
******************************************************************************/
#include "lineFileUtilities.h"
#include "closestBed.h"

const int MAXSLOP = 256000000;  // 2*MAXSLOP = 512 megabases.
                                // We don't want to keep looking if we
                                // can't find a nearby feature within 512 Mb.
const int SLOPGROWTH = 2048000;


/*
    Constructor
*/
BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
                       string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode,
                       bool ignoreOverlaps) 
    : _bedAFile(bedAFile)
    , _bedBFile(bedBFile)
    , _tieMode(tieMode)
    , _sameStrand(sameStrand)
    , _diffStrand(diffStrand)
    , _reportDistance(reportDistance)
    , _signDistance(signDistance)
    , _strandedDistMode(_strandedDistMode)
    , _ignoreOverlaps(ignoreOverlaps)
{
    _bedA           = new BedFile(_bedAFile);
    _bedB           = new BedFile(_bedBFile);
    FindClosestBed();
}


/*
    Destructor
*/
BedClosest::~BedClosest(void) {
}


void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) {

    int slop = 0;  // start out just looking for overlaps
                   // within the current bin (~128Kb)

    // update the current feature's start and end

    CHRPOS aFudgeStart = 0;
    CHRPOS aFudgeEnd;
    int numOverlaps = 0;
    vector<BED> closestB;
    CHRPOS minDistance = INT_MAX;
    int32_t curDistance = INT_MAX;
    vector<int32_t> distances;

    // is there at least one feature in B on the same chrom
    // as the current A feature?
    if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) {

        while ((numOverlaps == 0) && (slop <= MAXSLOP)) {

            // add some slop (starting at 0 bases) to a in hopes
            // of finding a hit in B
            if ((static_cast<int>(a.start) - slop) > 0)
                aFudgeStart = a.start - slop;
            else
                aFudgeStart = 0;

            if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP))
                aFudgeEnd = a.end + slop;
            else
                aFudgeEnd = 2 * MAXSLOP;

            // THE HEAVY LIFTING
            // search for hits with the current slop added
            _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand);

            vector<BED>::const_iterator h = hits.begin();
            vector<BED>::const_iterator hitsEnd = hits.end();
            for (; h != hitsEnd; ++h) {

                // do the actual features overlap?
                int s = max(a.start, h->start);
                int e = min(a.end, h->end);
                int overlapBases = (e - s);             // the number of overlapping bases b/w a and b

                // make sure we allow overlapping features.
                if ((overlapBases > 0) && (_ignoreOverlaps == true))
                    continue;
                else
                    numOverlaps++;

                // there is overlap. make sure we allow overlapping features ()
                if (overlapBases > 0) {
                    closestB.push_back(*h);
                    distances.push_back(0);
                }
                // the hit is to the "left" of A
                else if (h->end <= a.start) {
                    curDistance = a.start - h->end;
                    if (_signDistance) {
                        if ((_strandedDistMode == "ref")
                                || (_strandedDistMode == "a" && a.strand != "-")
                                || (_strandedDistMode == "b" && h->strand == "-")) {
                            curDistance = -curDistance;
                        }
                    }
                    
                    if (abs(curDistance) < minDistance) {
                        minDistance = abs(curDistance);
                        
                        closestB.clear();
                        closestB.push_back(*h);
                        distances.clear();
                        distances.push_back(curDistance);
                    }
                    else if (abs(curDistance) == minDistance) {
                        minDistance = abs(curDistance);
                        closestB.push_back(*h);
                        distances.push_back(curDistance);
                    }
                }
                // the hit is to the "right" of A
                else if (h->start >= a.end) {
                    curDistance = h->start - a.end;
                    if (_signDistance) {
                        if ((_strandedDistMode == "a" && a.strand == "-")
                                || (_strandedDistMode == "b" && h->strand != "-")) {
                            curDistance = -curDistance;
                        }
                    }
                    if (abs(curDistance) < minDistance) {
                        minDistance = abs(curDistance);
                        closestB.clear();
                        closestB.push_back(*h);
                        distances.clear();
                        distances.push_back(curDistance);
                    }
                    else if (abs(curDistance) == minDistance) {
                        minDistance = abs(curDistance);
                        closestB.push_back(*h);
                        distances.push_back(curDistance);
                    }
                }
            }
            // if no overlaps were found, we'll widen the range
            // by SLOPGROWTH in each direction and search again.
            slop += SLOPGROWTH;
        }
    }
    // there is no feature in B on the same chromosome as A
    else {
        _bedA->reportBedTab(a);
        if (_reportDistance == true) {
            _bedB->reportNullBedTab();
            cout << -1 << endl;
        }
        else
            _bedB->reportNullBedNewLine();
    }

    // report the closest feature(s) in B to the current A feature.
    // obey the user's reporting request (_tieMode)
    if (numOverlaps > 0) {
        if (closestB.size() == 1 || _tieMode == "first") {
            _bedA->reportBedTab(a);
            if (_reportDistance == true) {
                _bedB->reportBedTab(closestB[0]);
                cout << distances[0] << endl;
            }
            else
                _bedB->reportBedNewLine(closestB[0]);
        }
        else {
            if (_tieMode == "all") {
                size_t i = 0;
                for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) {
                    _bedA->reportBedTab(a);
                    if (_reportDistance == true) {
                        _bedB->reportBedTab(*b);
                        cout << distances[i++] <<endl;
                    }
                    else
                        _bedB->reportBedNewLine(*b);
                }
            }
            else if (_tieMode == "last") {
                _bedA->reportBedTab(a);
                if (_reportDistance == true) {
                    _bedB->reportBedTab(closestB[closestB.size()-1]);
                    cout << distances[distances.size() - 1]<<endl;
                }
                else
                    _bedB->reportBedNewLine(closestB[closestB.size()-1]);
            }
        }
    }
}


void BedClosest::FindClosestBed() {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    BED a, nullBed;
    int lineNum = 0;                    // current input line number
    vector<BED> hits;                   // vector of potential hits
    hits.reserve(100);
    BedLineStatus bedStatus;

    _bedA->Open();
    // process each entry in A in search of the closest feature in B
    while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
        if (bedStatus == BED_VALID) {
            FindWindowOverlaps(a, hits);
            hits.clear();
            a = nullBed;
        }
    }
    _bedA->Close();
}
// END ClosestBed