Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
line wrap: on
line diff
--- a/BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp Thu Nov 03 10:25:04 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,367 +0,0 @@ -/***************************************************************************** - intersectBed.cpp - - (c) 2009 - Aaron Quinlan - Hall Laboratory - Department of Biochemistry and Molecular Genetics - University of Virginia - aaronquinlan@gmail.com - - Licenced under the GNU General Public License 2.0 license. -******************************************************************************/ -#include "lineFileUtilities.h" -#include "intersectBed.h" - -/************************************ -Helper functions -************************************/ -bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { - - // how many overlaps are there b/w the bed and the set of hits? - CHRPOS s, e; - int overlapBases; - int numOverlaps = 0; - bool hitsFound = false; - int aLength = (a.end - a.start); // the length of a in b.p. - - // loop through the hits and report those that meet the user's criteria - vector<BED>::const_iterator h = hits.begin(); - vector<BED>::const_iterator hitsEnd = hits.end(); - for (; h != hitsEnd; ++h) { - s = max(a.start, h->start); - e = min(a.end, h->end); - overlapBases = (e - s); // the number of overlapping bases b/w a and b - - // is there enough overlap relative to the user's request? (default ~ 1bp) - if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { - // Report the hit if the user doesn't care about reciprocal overlap between A and B. - if (_reciprocal == false) { - hitsFound = true; - numOverlaps++; - if (_printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - // we require there to be sufficient __reciprocal__ overlap - else { - int bLength = (h->end - h->start); - float bOverlap = ( (float) overlapBases / (float) bLength ); - if (bOverlap >= _overlapFraction) { - hitsFound = true; - numOverlaps++; - if (_printable == true) - ReportOverlapDetail(overlapBases, a, *h, s, e); - } - } - } - } - // report the summary of the overlaps if requested. - ReportOverlapSummary(a, numOverlaps); - // were hits found for this BED feature? - return hitsFound; -} - - -/* - Constructor -*/ -BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, - bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, - float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, - bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, - bool sortedInput) { - - _bedAFile = bedAFile; - _bedBFile = bedBFile; - _anyHit = anyHit; - _noHit = noHit; - _writeA = writeA; - _writeB = writeB; - _writeOverlap = writeOverlap; - _writeAllOverlap = writeAllOverlap; - _writeCount = writeCount; - _overlapFraction = overlapFraction; - _sameStrand = sameStrand; - _diffStrand = diffStrand; - _reciprocal = reciprocal; - _obeySplits = obeySplits; - _bamInput = bamInput; - _bamOutput = bamOutput; - _isUncompressedBam = isUncompressedBam; - _sortedInput = sortedInput; - - // should we print each overlap, or does the user want summary information? - _printable = true; - if (_anyHit || _noHit || _writeCount) - _printable = false; - - if (_bamInput == false) - IntersectBed(); - else - IntersectBam(bedAFile); -} - - -/* - Destructor -*/ -BedIntersect::~BedIntersect(void) { -} - - -bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { - bool hitsFound = false; - - // collect and report the sufficient hits - _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); - hitsFound = processHits(a, hits); - return hitsFound; -} - - -void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) { - // default. simple intersection only - if (_writeA == false && _writeB == false && _writeOverlap == false) { - _bedA->reportBedRangeNewLine(a,s,e); - } - // -wa -wb write the original A and B - else if (_writeA == true && _writeB == true) { - _bedA->reportBedTab(a); - _bedB->reportBedNewLine(b); - } - // -wa write just the original A - else if (_writeA == true) { - _bedA->reportBedNewLine(a); - } - // -wb write the intersected portion of A and the original B - else if (_writeB == true) { - _bedA->reportBedRangeTab(a,s,e); - _bedB->reportBedNewLine(b); - } - // -wo write the original A and B plus the no. of overlapping bases. - else if (_writeOverlap == true) { - _bedA->reportBedTab(a); - _bedB->reportBedTab(b); - if (b.zeroLength == false) printf("%d\n", overlapBases); - else printf("0\n"); - } -} - - -void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { - // -u just report the fact that there was >= 1 overlaps - if (_anyHit && (numOverlapsFound >= 1)) { - _bedA->reportBedNewLine(a); - } - // -c report the total number of features overlapped in B - else if (_writeCount) { - _bedA->reportBedTab(a); - printf("%d\n", numOverlapsFound); - } - // -v report iff there were no overlaps - else if (_noHit && (numOverlapsFound == 0)) { - _bedA->reportBedNewLine(a); - } - // -wao the user wants to force the reporting of 0 overlap - else if (_writeAllOverlap && (numOverlapsFound == 0)) { - _bedA->reportBedTab(a); - _bedB->reportNullBedTab(); - printf("0\n"); - } -} - - -bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { - bool overlapsFound = false; - if (_reciprocal == false) { - overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - _sameStrand, _diffStrand, _overlapFraction); - } - else { - overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, - _sameStrand, _diffStrand, _overlapFraction); - } - return overlapsFound; -} - - -void BedIntersect::IntersectBed() { - - // create new BED file objects for A and B - _bedA = new BedFile(_bedAFile); - _bedB = new BedFile(_bedBFile); - - if (_sortedInput == false) { - // load the "B" file into a map in order to - // compare each entry in A to it in search of overlaps. - _bedB->loadBedFileIntoMap(); - - int lineNum = 0; - vector<BED> hits; - hits.reserve(100); - BED a, nullBed; - BedLineStatus bedStatus; - - // open the "A" file, process each BED entry and searh for overlaps. - _bedA->Open(); - while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { - if (bedStatus == BED_VALID) { - // treat the BED as a single "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - } - // split the BED12 into blocks and look for overlaps in each discrete block - else { - bedVector bedBlocks; // vec to store the discrete BED "blocks" - splitBedIntoBlocks(a, lineNum, bedBlocks); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); - hits.clear(); - } - } - } - a = nullBed; - } - _bedA->Close(); - } - else { - // use the chromsweep algorithm to detect overlaps on the fly. - ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand); - - pair<BED, vector<BED> > hit_set; - hit_set.second.reserve(10000); - while (sweep.Next(hit_set)) { - processHits(hit_set.first, hit_set.second); - } - } -} - - -void BedIntersect::IntersectBam(string bamFile) { - - // load the "B" bed file into a map so - // that we can easily compare "A" to it for overlaps - _bedB = new BedFile(_bedBFile); - _bedB->loadBedFileIntoMap(); - - // create a dummy BED A file for printing purposes if not - // using BAM output. - if (_bamOutput == false) { - _bedA = new BedFile(_bedAFile); - _bedA->bedType = 6; - } - - // open the BAM file - BamReader reader; - BamWriter writer; - - reader.Open(bamFile); - - // get header & reference information - string bamHeader = reader.GetHeaderText(); - RefVector refs = reader.GetReferenceData(); - - // open a BAM output to stdout if we are writing BAM - if (_bamOutput == true) { - // set compression mode - BamWriter::CompressionMode compressionMode = BamWriter::Compressed; - if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; - writer.SetCompressionMode(compressionMode); - // open our BAM writer - writer.Open("stdout", bamHeader, refs); - } - - vector<BED> hits; - // reserve some space - hits.reserve(100); - - - BamAlignment bam; - // get each set of alignments for each pair. - while (reader.GetNextAlignment(bam)) { - - if (bam.IsMapped()) { - BED a; - a.chrom = refs.at(bam.RefID).RefName; - a.start = bam.Position; - a.end = bam.GetEndPosition(false, false); - - // build the name field from the BAM alignment. - a.name = bam.Name; - if (bam.IsFirstMate()) a.name += "/1"; - if (bam.IsSecondMate()) a.name += "/2"; - - a.score = ToString(bam.MapQuality); - - a.strand = "+"; - if (bam.IsReverseStrand()) a.strand = "-"; - - if (_bamOutput == true) { - bool overlapsFound = false; - // treat the BAM alignment as a single "block" - if (_obeySplits == false) { - overlapsFound = FindOneOrMoreOverlap(a); - } - // split the BAM alignment into discrete blocks and - // look for overlaps only within each block. - else { - bool overlapFoundForBlock; - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - // we don't want to split on "D" ops, hence the "false" - getBamBlocks(bam, refs, bedBlocks, false); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr); - if (overlapFoundForBlock == true) - overlapsFound = true; - } - } - if (overlapsFound == true) { - if (_noHit == false) - writer.SaveAlignment(bam); - } - else { - if (_noHit == true) { - writer.SaveAlignment(bam); - } - } - } - else { - // treat the BAM alignment as a single BED "block" - if (_obeySplits == false) { - FindOverlaps(a, hits); - hits.clear(); - } - // split the BAM alignment into discrete BED blocks and - // look for overlaps only within each block. - else { - bedVector bedBlocks; // vec to store the discrete BED "blocks" from a - getBamBlocks(bam, refs, bedBlocks, false); - - vector<BED>::const_iterator bedItr = bedBlocks.begin(); - vector<BED>::const_iterator bedEnd = bedBlocks.end(); - for (; bedItr != bedEnd; ++bedItr) { - FindOverlaps(*bedItr, hits); - hits.clear(); - } - } - } - } - // BAM IsMapped() is false - else if (_noHit == true) { - writer.SaveAlignment(bam); - } - } - - // close the relevant BAM files. - reader.Close(); - if (_bamOutput == true) { - writer.Close(); - } -} -