comparison BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 intersectBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "intersectBed.h"
14
15 /************************************
16 Helper functions
17 ************************************/
18 bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
19
20 // how many overlaps are there b/w the bed and the set of hits?
21 CHRPOS s, e;
22 int overlapBases;
23 int numOverlaps = 0;
24 bool hitsFound = false;
25 int aLength = (a.end - a.start); // the length of a in b.p.
26
27 // loop through the hits and report those that meet the user's criteria
28 vector<BED>::const_iterator h = hits.begin();
29 vector<BED>::const_iterator hitsEnd = hits.end();
30 for (; h != hitsEnd; ++h) {
31 s = max(a.start, h->start);
32 e = min(a.end, h->end);
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b
34
35 // is there enough overlap relative to the user's request? (default ~ 1bp)
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B.
38 if (_reciprocal == false) {
39 hitsFound = true;
40 numOverlaps++;
41 if (_printable == true)
42 ReportOverlapDetail(overlapBases, a, *h, s, e);
43 }
44 // we require there to be sufficient __reciprocal__ overlap
45 else {
46 int bLength = (h->end - h->start);
47 float bOverlap = ( (float) overlapBases / (float) bLength );
48 if (bOverlap >= _overlapFraction) {
49 hitsFound = true;
50 numOverlaps++;
51 if (_printable == true)
52 ReportOverlapDetail(overlapBases, a, *h, s, e);
53 }
54 }
55 }
56 }
57 // report the summary of the overlaps if requested.
58 ReportOverlapSummary(a, numOverlaps);
59 // were hits found for this BED feature?
60 return hitsFound;
61 }
62
63
64 /*
65 Constructor
66 */
67 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
68 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
69 float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand,
70 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam,
71 bool sortedInput) {
72
73 _bedAFile = bedAFile;
74 _bedBFile = bedBFile;
75 _anyHit = anyHit;
76 _noHit = noHit;
77 _writeA = writeA;
78 _writeB = writeB;
79 _writeOverlap = writeOverlap;
80 _writeAllOverlap = writeAllOverlap;
81 _writeCount = writeCount;
82 _overlapFraction = overlapFraction;
83 _sameStrand = sameStrand;
84 _diffStrand = diffStrand;
85 _reciprocal = reciprocal;
86 _obeySplits = obeySplits;
87 _bamInput = bamInput;
88 _bamOutput = bamOutput;
89 _isUncompressedBam = isUncompressedBam;
90 _sortedInput = sortedInput;
91
92 // should we print each overlap, or does the user want summary information?
93 _printable = true;
94 if (_anyHit || _noHit || _writeCount)
95 _printable = false;
96
97 if (_bamInput == false)
98 IntersectBed();
99 else
100 IntersectBam(bedAFile);
101 }
102
103
104 /*
105 Destructor
106 */
107 BedIntersect::~BedIntersect(void) {
108 }
109
110
111 bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
112 bool hitsFound = false;
113
114 // collect and report the sufficient hits
115 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
116 hitsFound = processHits(a, hits);
117 return hitsFound;
118 }
119
120
121 void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) {
122 // default. simple intersection only
123 if (_writeA == false && _writeB == false && _writeOverlap == false) {
124 _bedA->reportBedRangeNewLine(a,s,e);
125 }
126 // -wa -wb write the original A and B
127 else if (_writeA == true && _writeB == true) {
128 _bedA->reportBedTab(a);
129 _bedB->reportBedNewLine(b);
130 }
131 // -wa write just the original A
132 else if (_writeA == true) {
133 _bedA->reportBedNewLine(a);
134 }
135 // -wb write the intersected portion of A and the original B
136 else if (_writeB == true) {
137 _bedA->reportBedRangeTab(a,s,e);
138 _bedB->reportBedNewLine(b);
139 }
140 // -wo write the original A and B plus the no. of overlapping bases.
141 else if (_writeOverlap == true) {
142 _bedA->reportBedTab(a);
143 _bedB->reportBedTab(b);
144 if (b.zeroLength == false) printf("%d\n", overlapBases);
145 else printf("0\n");
146 }
147 }
148
149
150 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
151 // -u just report the fact that there was >= 1 overlaps
152 if (_anyHit && (numOverlapsFound >= 1)) {
153 _bedA->reportBedNewLine(a);
154 }
155 // -c report the total number of features overlapped in B
156 else if (_writeCount) {
157 _bedA->reportBedTab(a);
158 printf("%d\n", numOverlapsFound);
159 }
160 // -v report iff there were no overlaps
161 else if (_noHit && (numOverlapsFound == 0)) {
162 _bedA->reportBedNewLine(a);
163 }
164 // -wao the user wants to force the reporting of 0 overlap
165 else if (_writeAllOverlap && (numOverlapsFound == 0)) {
166 _bedA->reportBedTab(a);
167 _bedB->reportNullBedTab();
168 printf("0\n");
169 }
170 }
171
172
173 bool BedIntersect::FindOneOrMoreOverlap(const BED &a) {
174 bool overlapsFound = false;
175 if (_reciprocal == false) {
176 overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
177 _sameStrand, _diffStrand, _overlapFraction);
178 }
179 else {
180 overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
181 _sameStrand, _diffStrand, _overlapFraction);
182 }
183 return overlapsFound;
184 }
185
186
187 void BedIntersect::IntersectBed() {
188
189 // create new BED file objects for A and B
190 _bedA = new BedFile(_bedAFile);
191 _bedB = new BedFile(_bedBFile);
192
193 if (_sortedInput == false) {
194 // load the "B" file into a map in order to
195 // compare each entry in A to it in search of overlaps.
196 _bedB->loadBedFileIntoMap();
197
198 int lineNum = 0;
199 vector<BED> hits;
200 hits.reserve(100);
201 BED a, nullBed;
202 BedLineStatus bedStatus;
203
204 // open the "A" file, process each BED entry and searh for overlaps.
205 _bedA->Open();
206 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
207 if (bedStatus == BED_VALID) {
208 // treat the BED as a single "block"
209 if (_obeySplits == false) {
210 FindOverlaps(a, hits);
211 hits.clear();
212 }
213 // split the BED12 into blocks and look for overlaps in each discrete block
214 else {
215 bedVector bedBlocks; // vec to store the discrete BED "blocks"
216 splitBedIntoBlocks(a, lineNum, bedBlocks);
217
218 vector<BED>::const_iterator bedItr = bedBlocks.begin();
219 vector<BED>::const_iterator bedEnd = bedBlocks.end();
220 for (; bedItr != bedEnd; ++bedItr) {
221 FindOverlaps(*bedItr, hits);
222 hits.clear();
223 }
224 }
225 }
226 a = nullBed;
227 }
228 _bedA->Close();
229 }
230 else {
231 // use the chromsweep algorithm to detect overlaps on the fly.
232 ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand);
233
234 pair<BED, vector<BED> > hit_set;
235 hit_set.second.reserve(10000);
236 while (sweep.Next(hit_set)) {
237 processHits(hit_set.first, hit_set.second);
238 }
239 }
240 }
241
242
243 void BedIntersect::IntersectBam(string bamFile) {
244
245 // load the "B" bed file into a map so
246 // that we can easily compare "A" to it for overlaps
247 _bedB = new BedFile(_bedBFile);
248 _bedB->loadBedFileIntoMap();
249
250 // create a dummy BED A file for printing purposes if not
251 // using BAM output.
252 if (_bamOutput == false) {
253 _bedA = new BedFile(_bedAFile);
254 _bedA->bedType = 6;
255 }
256
257 // open the BAM file
258 BamReader reader;
259 BamWriter writer;
260
261 reader.Open(bamFile);
262
263 // get header & reference information
264 string bamHeader = reader.GetHeaderText();
265 RefVector refs = reader.GetReferenceData();
266
267 // open a BAM output to stdout if we are writing BAM
268 if (_bamOutput == true) {
269 // set compression mode
270 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
271 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
272 writer.SetCompressionMode(compressionMode);
273 // open our BAM writer
274 writer.Open("stdout", bamHeader, refs);
275 }
276
277 vector<BED> hits;
278 // reserve some space
279 hits.reserve(100);
280
281
282 BamAlignment bam;
283 // get each set of alignments for each pair.
284 while (reader.GetNextAlignment(bam)) {
285
286 if (bam.IsMapped()) {
287 BED a;
288 a.chrom = refs.at(bam.RefID).RefName;
289 a.start = bam.Position;
290 a.end = bam.GetEndPosition(false, false);
291
292 // build the name field from the BAM alignment.
293 a.name = bam.Name;
294 if (bam.IsFirstMate()) a.name += "/1";
295 if (bam.IsSecondMate()) a.name += "/2";
296
297 a.score = ToString(bam.MapQuality);
298
299 a.strand = "+";
300 if (bam.IsReverseStrand()) a.strand = "-";
301
302 if (_bamOutput == true) {
303 bool overlapsFound = false;
304 // treat the BAM alignment as a single "block"
305 if (_obeySplits == false) {
306 overlapsFound = FindOneOrMoreOverlap(a);
307 }
308 // split the BAM alignment into discrete blocks and
309 // look for overlaps only within each block.
310 else {
311 bool overlapFoundForBlock;
312 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
313 // we don't want to split on "D" ops, hence the "false"
314 getBamBlocks(bam, refs, bedBlocks, false);
315
316 vector<BED>::const_iterator bedItr = bedBlocks.begin();
317 vector<BED>::const_iterator bedEnd = bedBlocks.end();
318 for (; bedItr != bedEnd; ++bedItr) {
319 overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr);
320 if (overlapFoundForBlock == true)
321 overlapsFound = true;
322 }
323 }
324 if (overlapsFound == true) {
325 if (_noHit == false)
326 writer.SaveAlignment(bam);
327 }
328 else {
329 if (_noHit == true) {
330 writer.SaveAlignment(bam);
331 }
332 }
333 }
334 else {
335 // treat the BAM alignment as a single BED "block"
336 if (_obeySplits == false) {
337 FindOverlaps(a, hits);
338 hits.clear();
339 }
340 // split the BAM alignment into discrete BED blocks and
341 // look for overlaps only within each block.
342 else {
343 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
344 getBamBlocks(bam, refs, bedBlocks, false);
345
346 vector<BED>::const_iterator bedItr = bedBlocks.begin();
347 vector<BED>::const_iterator bedEnd = bedBlocks.end();
348 for (; bedItr != bedEnd; ++bedItr) {
349 FindOverlaps(*bedItr, hits);
350 hits.clear();
351 }
352 }
353 }
354 }
355 // BAM IsMapped() is false
356 else if (_noHit == true) {
357 writer.SaveAlignment(bam);
358 }
359 }
360
361 // close the relevant BAM files.
362 reader.Close();
363 if (_bamOutput == true) {
364 writer.Close();
365 }
366 }
367