Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/pairToBed/pairToBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 pairToBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "pairToBed.h" | |
14 | |
15 | |
16 bool IsCorrectMappingForBEDPE (const BamAlignment &bam) { | |
17 | |
18 if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) { | |
19 return true; | |
20 } | |
21 else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) { | |
22 return true; | |
23 } | |
24 else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) { | |
25 return true; | |
26 } | |
27 else return false; | |
28 } | |
29 | |
30 | |
31 /* | |
32 Constructor | |
33 */ | |
34 | |
35 | |
36 BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction, | |
37 string searchType, bool sameStrand, bool diffStrand, bool bamInput, | |
38 bool bamOutput, bool uncompressedBam, bool useEditDistance) { | |
39 | |
40 _bedAFilePE = bedAFilePE; | |
41 _bedBFile = bedBFile; | |
42 _overlapFraction = overlapFraction; | |
43 _sameStrand = sameStrand; | |
44 _diffStrand = diffStrand; | |
45 _useEditDistance = useEditDistance; | |
46 _searchType = searchType; | |
47 _bamInput = bamInput; | |
48 _bamOutput = bamOutput; | |
49 _isUncompressedBam = uncompressedBam; | |
50 | |
51 _bedA = new BedFilePE(bedAFilePE); | |
52 _bedB = new BedFile(bedBFile); | |
53 | |
54 if (_bamInput == false) | |
55 IntersectBedPE(); | |
56 else | |
57 IntersectBamPE(_bedAFilePE); | |
58 } | |
59 | |
60 | |
61 /* | |
62 Destructor | |
63 */ | |
64 | |
65 BedIntersectPE::~BedIntersectPE(void) { | |
66 } | |
67 | |
68 | |
69 | |
70 void BedIntersectPE::FindOverlaps(const BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, const string &type) { | |
71 | |
72 // list of hits on each end of BEDPE | |
73 // that exceed the requested overlap fraction | |
74 vector<BED> qualityHits1; | |
75 vector<BED> qualityHits2; | |
76 | |
77 // count of hits on each end of BEDPE | |
78 // that exceed the requested overlap fraction | |
79 int numOverlapsEnd1 = 0; | |
80 int numOverlapsEnd2 = 0; | |
81 | |
82 // make sure we have a valid chromosome before we search | |
83 if (a.chrom1 != ".") { | |
84 // Find the quality hits between ***end1*** of the BEDPE and the B BED file | |
85 _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _sameStrand, _diffStrand); | |
86 | |
87 vector<BED>::const_iterator h = hits1.begin(); | |
88 vector<BED>::const_iterator hitsEnd = hits1.end(); | |
89 for (; h != hitsEnd; ++h) { | |
90 | |
91 int s = max(a.start1, h->start); | |
92 int e = min(a.end1, h->end); | |
93 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
94 int aLength = (a.end1 - a.start1); // the length of a in b.p. | |
95 | |
96 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
97 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
98 numOverlapsEnd1++; | |
99 | |
100 if (type == "either") { | |
101 _bedA->reportBedPETab(a); | |
102 _bedB->reportBedNewLine(*h); | |
103 } | |
104 else { | |
105 qualityHits1.push_back(*h); | |
106 } | |
107 } | |
108 } | |
109 } | |
110 | |
111 | |
112 // make sure we have a valid chromosome before we search | |
113 if (a.chrom2 != ".") { | |
114 // Now find the quality hits between ***end2*** of the BEDPE and the B BED file | |
115 _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _sameStrand, _diffStrand); | |
116 | |
117 vector<BED>::const_iterator h = hits2.begin(); | |
118 vector<BED>::const_iterator hitsEnd = hits2.end(); | |
119 for (; h != hitsEnd; ++h) { | |
120 | |
121 int s = max(a.start2, h->start); | |
122 int e = min(a.end2, h->end); | |
123 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
124 int aLength = (a.end2 - a.start2); // the length of a in b.p. | |
125 | |
126 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
127 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
128 numOverlapsEnd2++; | |
129 | |
130 if (type == "either") { | |
131 _bedA->reportBedPETab(a); | |
132 _bedB->reportBedNewLine(*h); | |
133 } | |
134 else { | |
135 qualityHits2.push_back(*h); | |
136 } | |
137 } | |
138 } | |
139 } | |
140 | |
141 // Now report the hits depending on what the user has requested. | |
142 if (type == "neither") { | |
143 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { | |
144 _bedA->reportBedPENewLine(a); | |
145 } | |
146 } | |
147 else if (type == "notboth") { | |
148 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) { | |
149 _bedA->reportBedPENewLine(a); | |
150 } | |
151 else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { | |
152 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { | |
153 _bedA->reportBedPETab(a); | |
154 _bedB->reportBedNewLine(*q); | |
155 } | |
156 } | |
157 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { | |
158 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { | |
159 _bedA->reportBedPETab(a); | |
160 _bedB->reportBedNewLine(*q); | |
161 } | |
162 } | |
163 } | |
164 else if (type == "xor") { | |
165 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) { | |
166 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { | |
167 _bedA->reportBedPETab(a); | |
168 _bedB->reportBedNewLine(*q); | |
169 } | |
170 } | |
171 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) { | |
172 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { | |
173 _bedA->reportBedPETab(a); | |
174 _bedB->reportBedNewLine(*q); | |
175 } | |
176 } | |
177 } | |
178 else if (type == "both") { | |
179 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) { | |
180 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) { | |
181 _bedA->reportBedPETab(a); | |
182 _bedB->reportBedNewLine(*q); | |
183 } | |
184 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) { | |
185 _bedA->reportBedPETab(a); | |
186 _bedB->reportBedNewLine(*q); | |
187 } | |
188 } | |
189 } | |
190 } | |
191 | |
192 | |
193 bool BedIntersectPE::FindOneOrMoreOverlaps(const BEDPE &a, const string &type) { | |
194 | |
195 // flags for the existence of hits on each end of BEDPE | |
196 // that exceed the requested overlap fraction | |
197 bool end1Found = false; | |
198 bool end2Found = false; | |
199 | |
200 // Look for overlaps in end 1 assuming we have an aligned chromosome. | |
201 if (a.chrom1 != ".") { | |
202 end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, | |
203 _sameStrand, _diffStrand, _overlapFraction); | |
204 | |
205 // can we bail out without checking end2? | |
206 if ((type == "either") && (end1Found == true)) return true; | |
207 else if ((type == "neither") && (end1Found == true)) return false; | |
208 else if ((type == "notboth") && (end1Found == false)) return true; | |
209 else if ((type == "both") && (end1Found == false)) return false; | |
210 } | |
211 | |
212 // Now look for overlaps in end 2 assuming we have an aligned chromosome. | |
213 if (a.chrom2 != ".") { | |
214 end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, | |
215 _sameStrand, _diffStrand, _overlapFraction); | |
216 | |
217 if ((type == "either") && (end2Found == true)) return true; | |
218 else if ((type == "neither") && (end2Found == true)) return false; | |
219 else if ((type == "notboth") && (end2Found == false)) return true; | |
220 else if ((type == "both") && (end2Found == false)) return false; | |
221 } | |
222 | |
223 // Now report the hits depending on what the user has requested. | |
224 if (type == "notboth") { | |
225 if ( (end1Found == false) || (end2Found == false) ) return true; | |
226 else return false; | |
227 } | |
228 else if (type == "either") { | |
229 if ( (end1Found == false) && (end2Found == false) ) return false; | |
230 } | |
231 else if (type == "neither") { | |
232 if ( (end1Found == false) && (end2Found == false) ) return true; | |
233 else return false; | |
234 } | |
235 else if (type == "xor") { | |
236 if ( (end1Found == true) && (end2Found == false) ) return true; | |
237 else if ( (end1Found == false) && (end2Found == true) ) return true; | |
238 else return false; | |
239 } | |
240 else if (type == "both") { | |
241 if ( (end1Found == true) && (end2Found == true) ) return true; | |
242 return false; | |
243 } | |
244 return false; | |
245 } | |
246 | |
247 | |
248 void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type) { | |
249 | |
250 // count of hits on _between_ end of BEDPE | |
251 // that exceed the requested overlap fraction | |
252 int numOverlaps = 0; | |
253 CHRPOS spanStart = 0; | |
254 CHRPOS spanEnd = 0; | |
255 CHRPOS spanLength = 0; | |
256 | |
257 if ((type == "ispan") || (type == "notispan")) { | |
258 spanStart = a.end1; | |
259 spanEnd = a.start2; | |
260 if (a.end1 > a.start2) { | |
261 spanStart = a.end2; | |
262 spanEnd = a.start1; | |
263 } | |
264 } | |
265 else if ((type == "ospan") || (type == "notospan")) { | |
266 spanStart = a.start1; | |
267 spanEnd = a.end2; | |
268 if (a.start1 > a.start2) { | |
269 spanStart = a.start2; | |
270 spanEnd = a.end1; | |
271 } | |
272 } | |
273 spanLength = spanEnd - spanStart; | |
274 | |
275 // get the hits for the span | |
276 _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _sameStrand, _diffStrand); | |
277 | |
278 vector<BED>::const_iterator h = hits.begin(); | |
279 vector<BED>::const_iterator hitsEnd = hits.end(); | |
280 for (; h != hitsEnd; ++h) { | |
281 | |
282 int s = max(spanStart, h->start); | |
283 int e = min(spanEnd, h->end); | |
284 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
285 int spanLength = (spanEnd - spanStart); // the length of a in b.p. | |
286 | |
287 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
288 if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) { | |
289 numOverlaps++; | |
290 if ((type == "ispan") || (type == "ospan")) { | |
291 _bedA->reportBedPETab(a); | |
292 _bedB->reportBedNewLine(*h); | |
293 } | |
294 } | |
295 } | |
296 | |
297 if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) { | |
298 _bedA->reportBedPENewLine(a); | |
299 } | |
300 } | |
301 | |
302 | |
303 bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type) { | |
304 | |
305 int spanStart = 0; | |
306 int spanEnd = 0; | |
307 int spanLength = 0; | |
308 bool overlapFound; | |
309 | |
310 if ((type == "ispan") || (type == "notispan")) { | |
311 spanStart = a.end1; | |
312 spanEnd = a.start2; | |
313 if (a.end1 > a.start2) { | |
314 spanStart = a.end2; | |
315 spanEnd = a.start1; | |
316 } | |
317 } | |
318 else if ((type == "ospan") || (type == "notospan")) { | |
319 spanStart = a.start1; | |
320 spanEnd = a.end2; | |
321 if (a.start1 > a.start2) { | |
322 spanStart = a.start2; | |
323 spanEnd = a.end1; | |
324 } | |
325 } | |
326 spanLength = spanEnd - spanStart; | |
327 | |
328 overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, | |
329 _sameStrand, _diffStrand, _overlapFraction); | |
330 | |
331 return overlapFound; | |
332 } | |
333 | |
334 | |
335 void BedIntersectPE::IntersectBedPE() { | |
336 | |
337 // load the "B" bed file into a map so | |
338 // that we can easily compare "A" to it for overlaps | |
339 _bedB->loadBedFileIntoMap(); | |
340 | |
341 int lineNum = 0; // current input line number | |
342 vector<BED> hits, hits1, hits2; // vector of potential hits | |
343 | |
344 // reserve some space | |
345 hits.reserve(100); | |
346 hits1.reserve(100); | |
347 hits2.reserve(100); | |
348 | |
349 BEDPE a, nullBedPE; | |
350 BedLineStatus bedStatus; | |
351 | |
352 _bedA->Open(); | |
353 while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) { | |
354 if (bedStatus == BED_VALID) { | |
355 if ( (_searchType == "ispan") || (_searchType == "ospan") || | |
356 (_searchType == "notispan") || (_searchType == "notospan") ) { | |
357 if (a.chrom1 == a.chrom2) { | |
358 FindSpanningOverlaps(a, hits, _searchType); | |
359 hits.clear(); | |
360 } | |
361 } | |
362 else { | |
363 FindOverlaps(a, hits1, hits2, _searchType); | |
364 hits1.clear(); | |
365 hits2.clear(); | |
366 } | |
367 a = nullBedPE; | |
368 } | |
369 } | |
370 _bedA->Close(); | |
371 } | |
372 | |
373 | |
374 void BedIntersectPE::IntersectBamPE(string bamFile) { | |
375 | |
376 // load the "B" bed file into a map so | |
377 // that we can easily compare "A" to it for overlaps | |
378 _bedB->loadBedFileIntoMap(); | |
379 | |
380 // open the BAM file | |
381 BamReader reader; | |
382 BamWriter writer; | |
383 reader.Open(bamFile); | |
384 | |
385 // get header & reference information | |
386 string bamHeader = reader.GetHeaderText(); | |
387 RefVector refs = reader.GetReferenceData(); | |
388 | |
389 // open a BAM output to stdout if we are writing BAM | |
390 if (_bamOutput == true) { | |
391 // set compression mode | |
392 BamWriter::CompressionMode compressionMode = BamWriter::Compressed; | |
393 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; | |
394 writer.SetCompressionMode(compressionMode); | |
395 // open our BAM writer | |
396 writer.Open("stdout", bamHeader, refs); | |
397 } | |
398 | |
399 // track the previous and current sequence | |
400 // names so that we can identify blocks of | |
401 // alignments for a given read ID. | |
402 string prevName, currName; | |
403 prevName = currName = ""; | |
404 | |
405 vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file. | |
406 alignments.reserve(100); | |
407 | |
408 _bedA->bedType = 10; // it's a full BEDPE given it's BAM | |
409 | |
410 // rip through the BAM file and convert each mapped entry to BEDPE | |
411 BamAlignment bam1, bam2; | |
412 while (reader.GetNextAlignment(bam1)) { | |
413 // the alignment must be paired | |
414 if (bam1.IsPaired() == true) { | |
415 // grab the second alignment for the pair. | |
416 reader.GetNextAlignment(bam2); | |
417 | |
418 // require that the alignments are from the same query | |
419 if (bam1.Name == bam2.Name) { | |
420 ProcessBamBlock(bam1, bam2, refs, writer); | |
421 } | |
422 else { | |
423 cerr << "*****ERROR: BAM must be sorted by query name. " << endl; | |
424 exit(1); | |
425 } | |
426 } | |
427 } | |
428 // close up | |
429 reader.Close(); | |
430 if (_bamOutput == true) { | |
431 writer.Close(); | |
432 } | |
433 } | |
434 | |
435 | |
436 void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2, | |
437 const RefVector &refs, BamWriter &writer) { | |
438 | |
439 vector<BED> hits, hits1, hits2; // vector of potential hits | |
440 hits.reserve(1000); // reserve some space | |
441 hits1.reserve(1000); | |
442 hits2.reserve(1000); | |
443 | |
444 bool overlapsFound; // flag to indicate if overlaps were found | |
445 | |
446 if ( (_searchType == "either") || (_searchType == "xor") || | |
447 (_searchType == "both") || (_searchType == "notboth") || | |
448 (_searchType == "neither") ) { | |
449 | |
450 // create a new BEDPE feature from the BAM alignments. | |
451 BEDPE a; | |
452 ConvertBamToBedPE(bam1, bam2, refs, a); | |
453 if (_bamOutput == true) { // BAM output | |
454 // write to BAM if correct hits found | |
455 overlapsFound = FindOneOrMoreOverlaps(a, _searchType); | |
456 if (overlapsFound == true) { | |
457 writer.SaveAlignment(bam1); | |
458 writer.SaveAlignment(bam2); | |
459 } | |
460 } | |
461 else { // BEDPE output | |
462 FindOverlaps(a, hits1, hits2, _searchType); | |
463 hits1.clear(); | |
464 hits2.clear(); | |
465 } | |
466 } | |
467 else if ( (_searchType == "ispan") || (_searchType == "ospan") ) { | |
468 // only look for ispan and ospan when both ends are mapped. | |
469 if (bam1.IsMapped() && bam2.IsMapped()) { | |
470 // only do an inspan or outspan check if the alignment is intrachromosomal | |
471 if (bam1.RefID == bam2.RefID) { | |
472 // create a new BEDPE feature from the BAM alignments. | |
473 BEDPE a; | |
474 ConvertBamToBedPE(bam1, bam2, refs, a); | |
475 if (_bamOutput == true) { // BAM output | |
476 // look for overlaps, and write to BAM if >=1 were found | |
477 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); | |
478 if (overlapsFound == true) { | |
479 writer.SaveAlignment(bam1); | |
480 writer.SaveAlignment(bam2); | |
481 } | |
482 } | |
483 else { // BEDPE output | |
484 FindSpanningOverlaps(a, hits, _searchType); | |
485 hits.clear(); | |
486 } | |
487 } | |
488 } | |
489 } | |
490 else if ( (_searchType == "notispan") || (_searchType == "notospan") ) { | |
491 // only look for notispan and notospan when both ends are mapped. | |
492 if (bam1.IsMapped() && bam2.IsMapped()) { | |
493 // only do an inspan or outspan check if the alignment is intrachromosomal | |
494 if (bam1.RefID == bam2.RefID) { | |
495 // create a new BEDPE feature from the BAM alignments. | |
496 BEDPE a; | |
497 ConvertBamToBedPE(bam1, bam2, refs, a); | |
498 if (_bamOutput == true) { // BAM output | |
499 // write to BAM if there were no overlaps | |
500 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType); | |
501 if (overlapsFound == false) { | |
502 writer.SaveAlignment(bam1); | |
503 writer.SaveAlignment(bam2); | |
504 } | |
505 } | |
506 else { // BEDPE output | |
507 FindSpanningOverlaps(a, hits, _searchType); | |
508 hits.clear(); | |
509 } | |
510 } | |
511 // if inter-chromosomal or orphaned, we know it's not ispan and not ospan | |
512 else if (_bamOutput == true) { | |
513 writer.SaveAlignment(bam1); | |
514 writer.SaveAlignment(bam2); | |
515 } | |
516 } | |
517 // if both ends aren't mapped, we know that it's notispan and not ospan | |
518 else if (_bamOutput == true) { | |
519 writer.SaveAlignment(bam1); | |
520 writer.SaveAlignment(bam2); | |
521 } | |
522 } | |
523 } | |
524 | |
525 |