Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,234 @@ +/***************************************************************************** + closestBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "closestBed.h" + +const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. + // We don't want to keep looking if we + // can't find a nearby feature within 512 Mb. +const int SLOPGROWTH = 2048000; + + +/* + Constructor +*/ +BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, + string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode, + bool ignoreOverlaps) + : _bedAFile(bedAFile) + , _bedBFile(bedBFile) + , _tieMode(tieMode) + , _sameStrand(sameStrand) + , _diffStrand(diffStrand) + , _reportDistance(reportDistance) + , _signDistance(signDistance) + , _strandedDistMode(_strandedDistMode) + , _ignoreOverlaps(ignoreOverlaps) +{ + _bedA = new BedFile(_bedAFile); + _bedB = new BedFile(_bedBFile); + FindClosestBed(); +} + + +/* + Destructor +*/ +BedClosest::~BedClosest(void) { +} + + +void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { + + int slop = 0; // start out just looking for overlaps + // within the current bin (~128Kb) + + // update the current feature's start and end + + CHRPOS aFudgeStart = 0; + CHRPOS aFudgeEnd; + int numOverlaps = 0; + vector<BED> closestB; + CHRPOS minDistance = INT_MAX; + int32_t curDistance = INT_MAX; + vector<int32_t> distances; + + // is there at least one feature in B on the same chrom + // as the current A feature? + if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { + + while ((numOverlaps == 0) && (slop <= MAXSLOP)) { + + // add some slop (starting at 0 bases) to a in hopes + // of finding a hit in B + if ((static_cast<int>(a.start) - slop) > 0) + aFudgeStart = a.start - slop; + else + aFudgeStart = 0; + + if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) + aFudgeEnd = a.end + slop; + else + aFudgeEnd = 2 * MAXSLOP; + + // THE HEAVY LIFTING + // search for hits with the current slop added + _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand); + + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + + // do the actual features overlap? + int s = max(a.start, h->start); + int e = min(a.end, h->end); + int overlapBases = (e - s); // the number of overlapping bases b/w a and b + + // make sure we allow overlapping features. + if ((overlapBases > 0) && (_ignoreOverlaps == true)) + continue; + else + numOverlaps++; + + // there is overlap. make sure we allow overlapping features () + if (overlapBases > 0) { + closestB.push_back(*h); + distances.push_back(0); + } + // the hit is to the "left" of A + else if (h->end <= a.start) { + curDistance = a.start - h->end; + if (_signDistance) { + if ((_strandedDistMode == "ref") + || (_strandedDistMode == "a" && a.strand != "-") + || (_strandedDistMode == "b" && h->strand == "-")) { + curDistance = -curDistance; + } + } + + if (abs(curDistance) < minDistance) { + minDistance = abs(curDistance); + + closestB.clear(); + closestB.push_back(*h); + distances.clear(); + distances.push_back(curDistance); + } + else if (abs(curDistance) == minDistance) { + minDistance = abs(curDistance); + closestB.push_back(*h); + distances.push_back(curDistance); + } + } + // the hit is to the "right" of A + else if (h->start >= a.end) { + curDistance = h->start - a.end; + if (_signDistance) { + if ((_strandedDistMode == "a" && a.strand == "-") + || (_strandedDistMode == "b" && h->strand != "-")) { + curDistance = -curDistance; + } + } + if (abs(curDistance) < minDistance) { + minDistance = abs(curDistance); + closestB.clear(); + closestB.push_back(*h); + distances.clear(); + distances.push_back(curDistance); + } + else if (abs(curDistance) == minDistance) { + minDistance = abs(curDistance); + closestB.push_back(*h); + distances.push_back(curDistance); + } + } + } + // if no overlaps were found, we'll widen the range + // by SLOPGROWTH in each direction and search again. + slop += SLOPGROWTH; + } + } + // there is no feature in B on the same chromosome as A + else { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportNullBedTab(); + cout << -1 << endl; + } + else + _bedB->reportNullBedNewLine(); + } + + // report the closest feature(s) in B to the current A feature. + // obey the user's reporting request (_tieMode) + if (numOverlaps > 0) { + if (closestB.size() == 1 || _tieMode == "first") { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(closestB[0]); + cout << distances[0] << endl; + } + else + _bedB->reportBedNewLine(closestB[0]); + } + else { + if (_tieMode == "all") { + size_t i = 0; + for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(*b); + cout << distances[i++] <<endl; + } + else + _bedB->reportBedNewLine(*b); + } + } + else if (_tieMode == "last") { + _bedA->reportBedTab(a); + if (_reportDistance == true) { + _bedB->reportBedTab(closestB[closestB.size()-1]); + cout << distances[distances.size() - 1]<<endl; + } + else + _bedB->reportBedNewLine(closestB[closestB.size()-1]); + } + } + } +} + + +void BedClosest::FindClosestBed() { + + // load the "B" bed file into a map so + // that we can easily compare "A" to it for overlaps + _bedB->loadBedFileIntoMap(); + + BED a, nullBed; + int lineNum = 0; // current input line number + vector<BED> hits; // vector of potential hits + hits.reserve(100); + BedLineStatus bedStatus; + + _bedA->Open(); + // process each entry in A in search of the closest feature in B + while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + FindWindowOverlaps(a, hits); + hits.clear(); + a = nullBed; + } + } + _bedA->Close(); +} +// END ClosestBed +