Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/fjoin/fjoin.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/fjoin/fjoin.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,350 @@ +/***************************************************************************** + intersectBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "fjoin.h" +#include <queue> +#include <set> + +bool leftOf(const BED &a, const BED &b); + + +bool BedIntersect::processHits(BED &a, vector<BED> &hits) { + // how many overlaps are there b/w the bed and the set of hits? + int s, e, overlapBases; + int numOverlaps = 0; + bool hitsFound = false; + int aLength = (a.end - a.start); // the length of a in b.p. + + // loop through the hits and report those that meet the user's criteria + vector<BED>::const_iterator h = hits.begin(); + vector<BED>::const_iterator hitsEnd = hits.end(); + for (; h != hitsEnd; ++h) { + s = max(a.start, h->start); + e = min(a.end, h->end); + overlapBases = (e - s); // the number of overlapping bases b/w a and b + + // is there enough overlap relative to the user's request? (default ~ 1bp) + if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { + // Report the hit if the user doesn't care about reciprocal overlap between A and B. + if (_reciprocal == false) { + hitsFound = true; + numOverlaps++; + if (_printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + // we require there to be sufficient __reciprocal__ overlap + else { + int bLength = (h->end - h->start); + float bOverlap = ( (float) overlapBases / (float) bLength ); + if (bOverlap >= _overlapFraction) { + hitsFound = true; + numOverlaps++; + if (_printable == true) + ReportOverlapDetail(overlapBases, a, *h, s, e); + } + } + } + } + // report the summary of the overlaps if requested. + ReportOverlapSummary(a, numOverlaps); + // were hits found for this BED feature? + return hitsFound; +} + +/* + Constructor +*/ +BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, + bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, + float overlapFraction, bool noHit, bool writeCount, bool forceStrand, + bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { + + _bedAFile = bedAFile; + _bedBFile = bedBFile; + _anyHit = anyHit; + _noHit = noHit; + _writeA = writeA; + _writeB = writeB; + _writeOverlap = writeOverlap; + _writeAllOverlap = writeAllOverlap; + _writeCount = writeCount; + _overlapFraction = overlapFraction; + _forceStrand = forceStrand; + _reciprocal = reciprocal; + _obeySplits = obeySplits; + _bamInput = bamInput; + _bamOutput = bamOutput; + + if (_anyHit || _noHit || _writeCount) + _printable = false; + else + _printable = true; + + // create new BED file objects for A and B + _bedA = new BedFile(bedAFile); + _bedB = new BedFile(bedBFile); + + IntersectBed(); +} + + +/* + Destructor +*/ +BedIntersect::~BedIntersect(void) { +} + + +bool leftOf(const BED &a, const BED &b) { + return (a.end <= b.start); +} + + +void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, + const CHRPOS &s, const CHRPOS &e) { + // default. simple intersection only + if (_writeA == false && _writeB == false && _writeOverlap == false) { + _bedA->reportBedRangeNewLine(a,s,e); + } + // -wa -wbwrite the original A and B + else if (_writeA == true && _writeB == true) { + _bedA->reportBedTab(a); + _bedB->reportBedNewLine(b); + } + // -wa write just the original A + else if (_writeA == true) { + _bedA->reportBedNewLine(a); + } + // -wb write the intersected portion of A and the original B + else if (_writeB == true) { + _bedA->reportBedRangeTab(a,s,e); + _bedB->reportBedNewLine(b); + } + // -wo write the original A and B plus the no. of overlapping bases. + else if (_writeOverlap == true) { + _bedA->reportBedTab(a); + _bedB->reportBedTab(b); + printf("%d\n", overlapBases); + } +} + + +void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { + // -u just report the fact that there was >= 1 overlaps + if (_anyHit && (numOverlapsFound >= 1)) { + _bedA->reportBedNewLine(a); + } + // -c report the total number of features overlapped in B + else if (_writeCount) { + _bedA->reportBedTab(a); + printf("%d\n", numOverlapsFound); + } + // -v report iff there were no overlaps + else if (_noHit && (numOverlapsFound == 0)) { + _bedA->reportBedNewLine(a); + } + // -wao the user wants to force the reporting of 0 overlap + else if (_writeAllOverlap && (numOverlapsFound == 0)) { + _bedA->reportBedTab(a); + _bedB->reportNullBedTab(); + printf("0\n"); + } +} + + + +void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, + const BED &y, vector<BED *> *windowY, BedLineStatus yStatus) { + + if (xStatus != BED_VALID) { + return; + } + + std::vector<BED *>::iterator wYIter = windowY->begin(); + while (wYIter != windowY->end()) { + if (leftOf(*(*wYIter), *x) == true) { + (*wYIter)->finished = true; + wYIter = windowY->erase(wYIter); // erase auto-increments to the next position + } + else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) { + if (_lastPick == 0) { + AddHits(x, *(*wYIter)); + } + else { + AddHits(*wYIter, *x); + } + ++wYIter; // force incrementing + } + } + if (leftOf(*x,y) == false) + windowX->push_back(x); + else { + x->finished = true; + } + // dump the buffered results (if any) + FlushOutputBuffer(); +} + + +void BedIntersect::AddHits(BED *x, const BED &y) { + if (x->overlaps.empty() == true) + _outputBuffer.push(x); + x->overlaps.push_back(y); +} + + +void BedIntersect::FlushOutputBuffer(bool final) { + while (_outputBuffer.empty() == false) + { + if (final == false && _outputBuffer.front()->finished == false) + break; + + processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps); + // remove the finished BED entry from the heap + delete _outputBuffer.front(); + _outputBuffer.pop(); + } +} + + +vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) { + + // iterator to test if a window for a given chrom exists. + map<string, vector<BED*> >::iterator it; + + // grab the current window for A or B, depending on + // the request. if a window hasn't yet been created + // for the requested chrom, create one. + + if (isA) { + it = _windowA.find(chrom); + if (it != _windowA.end()) { + return & _windowA[chrom]; + } + else { + _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); + return & _windowA[chrom]; + } + } + else { + it = _windowB.find(chrom); + if (it != _windowB.end()) { + return & _windowB[chrom]; + } + else { + _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); + return & _windowB[chrom]; + } + } +} + + +void BedIntersect::ChromSwitch(const string &chrom) { + + vector<BED*>::iterator windowAIter = _windowA[chrom].begin(); + vector<BED*>::iterator windowAEnd = _windowA[chrom].end(); + for (; windowAIter != windowAEnd; ++windowAIter) + (*windowAIter)->finished = true; + + vector<BED*>::iterator windowBIter = _windowB[chrom].begin(); + vector<BED*>::iterator windowBEnd = _windowB[chrom].end(); + for (; windowBIter != windowBEnd; ++windowBIter) + (*windowBIter)->finished = true; + + FlushOutputBuffer(); +} + + +void BedIntersect::IntersectBed() { + + int aLineNum = 0; + int bLineNum = 0; + + // current feature from each file + BED *a, *b, *prevA, *prevB; + + // status of the current lines + BedLineStatus aStatus, bStatus; + + // open the files; get the first line from each + _bedA->Open(); + _bedB->Open(); + + prevA = NULL; + prevB = NULL; + a = new BED(); + b = new BED(); + aStatus = _bedA->GetNextBed(*a, aLineNum); + bStatus = _bedB->GetNextBed(*b, bLineNum); + + cout << a->chrom << " " << a->start << " " << a->chrom << " " << b->start << endl; + while (aStatus != BED_INVALID || bStatus != BED_INVALID) { + + if ((a->start <= b->start) && (a->chrom == b->chrom)) { + prevA = a; + _lastPick = 0; + Scan(a, GetWindow(a->chrom, true), aStatus, + *b, GetWindow(a->chrom, false), bStatus); + + a = new BED(); + aStatus = _bedA->GetNextBed(*a, aLineNum); + } + else if ((a->start > b->start) && (a->chrom == b->chrom)) { + prevB = b; + _lastPick = 1; + Scan(b, GetWindow(b->chrom, false), bStatus, + *a, GetWindow(b->chrom, true), aStatus); + + b = new BED(); + bStatus = _bedB->GetNextBed(*b, bLineNum); + } + else if (a->chrom != b->chrom) { + // A was most recently read + if (_lastPick == 0) { + prevB = b; + while (b->chrom == prevA->chrom){ + _windowB[prevA->chrom].push_back(b); + b = new BED(); + bStatus = _bedB->GetNextBed(*b, bLineNum); + } + Scan(prevA, GetWindow(prevA->chrom, true), aStatus, + *prevB, GetWindow(prevA->chrom, false), bStatus); + } + // B was most recently read + else { + prevA = a; + while (a->chrom == prevB->chrom) { + _windowA[prevB->chrom].push_back(a); + a = new BED(); + aStatus = _bedA->GetNextBed(*a, aLineNum); + } + Scan(prevB, GetWindow(prevB->chrom, false), bStatus, + *prevA, GetWindow(prevB->chrom, true), aStatus); + } + FlushOutputBuffer(true); + } + if (prevA!=NULL&&prevB!=NULL) + //cout << prevA->chrom << " " << a->chrom << " " << a->start << " " + // << prevB->chrom << " " << b->chrom << " " << b->start << "\n"; + if (aStatus == BED_INVALID) a->start = INT_MAX; + if (bStatus == BED_INVALID) b->start = INT_MAX; + } + + // clear out the final bit of staged output + FlushOutputBuffer(true); + + // close the files + _bedA->Close(); + _bedB->Close(); +} + +