Mercurial > repos > aaronquinlan > multi_intersect
view BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line source
/***************************************************************************** intersectBed.cpp (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include "lineFileUtilities.h" #include "intersectBed.h" /************************************ Helper functions ************************************/ bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { // how many overlaps are there b/w the bed and the set of hits? CHRPOS s, e; int overlapBases; int numOverlaps = 0; bool hitsFound = false; int aLength = (a.end - a.start); // the length of a in b.p. // loop through the hits and report those that meet the user's criteria vector<BED>::const_iterator h = hits.begin(); vector<BED>::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { s = max(a.start, h->start); e = min(a.end, h->end); overlapBases = (e - s); // the number of overlapping bases b/w a and b // is there enough overlap relative to the user's request? (default ~ 1bp) if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { // Report the hit if the user doesn't care about reciprocal overlap between A and B. if (_reciprocal == false) { hitsFound = true; numOverlaps++; if (_printable == true) ReportOverlapDetail(overlapBases, a, *h, s, e); } // we require there to be sufficient __reciprocal__ overlap else { int bLength = (h->end - h->start); float bOverlap = ( (float) overlapBases / (float) bLength ); if (bOverlap >= _overlapFraction) { hitsFound = true; numOverlaps++; if (_printable == true) ReportOverlapDetail(overlapBases, a, *h, s, e); } } } } // report the summary of the overlaps if requested. ReportOverlapSummary(a, numOverlaps); // were hits found for this BED feature? return hitsFound; } /* Constructor */ BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, bool sortedInput) { _bedAFile = bedAFile; _bedBFile = bedBFile; _anyHit = anyHit; _noHit = noHit; _writeA = writeA; _writeB = writeB; _writeOverlap = writeOverlap; _writeAllOverlap = writeAllOverlap; _writeCount = writeCount; _overlapFraction = overlapFraction; _sameStrand = sameStrand; _diffStrand = diffStrand; _reciprocal = reciprocal; _obeySplits = obeySplits; _bamInput = bamInput; _bamOutput = bamOutput; _isUncompressedBam = isUncompressedBam; _sortedInput = sortedInput; // should we print each overlap, or does the user want summary information? _printable = true; if (_anyHit || _noHit || _writeCount) _printable = false; if (_bamInput == false) IntersectBed(); else IntersectBam(bedAFile); } /* Destructor */ BedIntersect::~BedIntersect(void) { } bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { bool hitsFound = false; // collect and report the sufficient hits _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); hitsFound = processHits(a, hits); return hitsFound; } void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) { // default. simple intersection only if (_writeA == false && _writeB == false && _writeOverlap == false) { _bedA->reportBedRangeNewLine(a,s,e); } // -wa -wb write the original A and B else if (_writeA == true && _writeB == true) { _bedA->reportBedTab(a); _bedB->reportBedNewLine(b); } // -wa write just the original A else if (_writeA == true) { _bedA->reportBedNewLine(a); } // -wb write the intersected portion of A and the original B else if (_writeB == true) { _bedA->reportBedRangeTab(a,s,e); _bedB->reportBedNewLine(b); } // -wo write the original A and B plus the no. of overlapping bases. else if (_writeOverlap == true) { _bedA->reportBedTab(a); _bedB->reportBedTab(b); if (b.zeroLength == false) printf("%d\n", overlapBases); else printf("0\n"); } } void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { // -u just report the fact that there was >= 1 overlaps if (_anyHit && (numOverlapsFound >= 1)) { _bedA->reportBedNewLine(a); } // -c report the total number of features overlapped in B else if (_writeCount) { _bedA->reportBedTab(a); printf("%d\n", numOverlapsFound); } // -v report iff there were no overlaps else if (_noHit && (numOverlapsFound == 0)) { _bedA->reportBedNewLine(a); } // -wao the user wants to force the reporting of 0 overlap else if (_writeAllOverlap && (numOverlapsFound == 0)) { _bedA->reportBedTab(a); _bedB->reportNullBedTab(); printf("0\n"); } } bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { bool overlapsFound = false; if (_reciprocal == false) { overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction); } else { overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction); } return overlapsFound; } void BedIntersect::IntersectBed() { // create new BED file objects for A and B _bedA = new BedFile(_bedAFile); _bedB = new BedFile(_bedBFile); if (_sortedInput == false) { // load the "B" file into a map in order to // compare each entry in A to it in search of overlaps. _bedB->loadBedFileIntoMap(); int lineNum = 0; vector<BED> hits; hits.reserve(100); BED a, nullBed; BedLineStatus bedStatus; // open the "A" file, process each BED entry and searh for overlaps. _bedA->Open(); while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { // treat the BED as a single "block" if (_obeySplits == false) { FindOverlaps(a, hits); hits.clear(); } // split the BED12 into blocks and look for overlaps in each discrete block else { bedVector bedBlocks; // vec to store the discrete BED "blocks" splitBedIntoBlocks(a, lineNum, bedBlocks); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); for (; bedItr != bedEnd; ++bedItr) { FindOverlaps(*bedItr, hits); hits.clear(); } } } a = nullBed; } _bedA->Close(); } else { // use the chromsweep algorithm to detect overlaps on the fly. ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand); pair<BED, vector<BED> > hit_set; hit_set.second.reserve(10000); while (sweep.Next(hit_set)) { processHits(hit_set.first, hit_set.second); } } } void BedIntersect::IntersectBam(string bamFile) { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bedB = new BedFile(_bedBFile); _bedB->loadBedFileIntoMap(); // create a dummy BED A file for printing purposes if not // using BAM output. if (_bamOutput == false) { _bedA = new BedFile(_bedAFile); _bedA->bedType = 6; } // open the BAM file BamReader reader; BamWriter writer; reader.Open(bamFile); // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // open a BAM output to stdout if we are writing BAM if (_bamOutput == true) { // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); } vector<BED> hits; // reserve some space hits.reserve(100); BamAlignment bam; // get each set of alignments for each pair. while (reader.GetNextAlignment(bam)) { if (bam.IsMapped()) { BED a; a.chrom = refs.at(bam.RefID).RefName; a.start = bam.Position; a.end = bam.GetEndPosition(false, false); // build the name field from the BAM alignment. a.name = bam.Name; if (bam.IsFirstMate()) a.name += "/1"; if (bam.IsSecondMate()) a.name += "/2"; a.score = ToString(bam.MapQuality); a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-"; if (_bamOutput == true) { bool overlapsFound = false; // treat the BAM alignment as a single "block" if (_obeySplits == false) { overlapsFound = FindOneOrMoreOverlap(a); } // split the BAM alignment into discrete blocks and // look for overlaps only within each block. else { bool overlapFoundForBlock; bedVector bedBlocks; // vec to store the discrete BED "blocks" from a // we don't want to split on "D" ops, hence the "false" getBamBlocks(bam, refs, bedBlocks, false); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); for (; bedItr != bedEnd; ++bedItr) { overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr); if (overlapFoundForBlock == true) overlapsFound = true; } } if (overlapsFound == true) { if (_noHit == false) writer.SaveAlignment(bam); } else { if (_noHit == true) { writer.SaveAlignment(bam); } } } else { // treat the BAM alignment as a single BED "block" if (_obeySplits == false) { FindOverlaps(a, hits); hits.clear(); } // split the BAM alignment into discrete BED blocks and // look for overlaps only within each block. else { bedVector bedBlocks; // vec to store the discrete BED "blocks" from a getBamBlocks(bam, refs, bedBlocks, false); vector<BED>::const_iterator bedItr = bedBlocks.begin(); vector<BED>::const_iterator bedEnd = bedBlocks.end(); for (; bedItr != bedEnd; ++bedItr) { FindOverlaps(*bedItr, hits); hits.clear(); } } } } // BAM IsMapped() is false else if (_noHit == true) { writer.SaveAlignment(bam); } } // close the relevant BAM files. reader.Close(); if (_bamOutput == true) { writer.Close(); } }