comparison BEDTools-Version-2.14.3/src/pairToBed/pairToBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 pairToBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "pairToBed.h"
14
15
16 bool IsCorrectMappingForBEDPE (const BamAlignment &bam) {
17
18 if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) {
19 return true;
20 }
21 else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) {
22 return true;
23 }
24 else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) {
25 return true;
26 }
27 else return false;
28 }
29
30
31 /*
32 Constructor
33 */
34
35
36 BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction,
37 string searchType, bool sameStrand, bool diffStrand, bool bamInput,
38 bool bamOutput, bool uncompressedBam, bool useEditDistance) {
39
40 _bedAFilePE = bedAFilePE;
41 _bedBFile = bedBFile;
42 _overlapFraction = overlapFraction;
43 _sameStrand = sameStrand;
44 _diffStrand = diffStrand;
45 _useEditDistance = useEditDistance;
46 _searchType = searchType;
47 _bamInput = bamInput;
48 _bamOutput = bamOutput;
49 _isUncompressedBam = uncompressedBam;
50
51 _bedA = new BedFilePE(bedAFilePE);
52 _bedB = new BedFile(bedBFile);
53
54 if (_bamInput == false)
55 IntersectBedPE();
56 else
57 IntersectBamPE(_bedAFilePE);
58 }
59
60
61 /*
62 Destructor
63 */
64
65 BedIntersectPE::~BedIntersectPE(void) {
66 }
67
68
69
70 void BedIntersectPE::FindOverlaps(const BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, const string &type) {
71
72 // list of hits on each end of BEDPE
73 // that exceed the requested overlap fraction
74 vector<BED> qualityHits1;
75 vector<BED> qualityHits2;
76
77 // count of hits on each end of BEDPE
78 // that exceed the requested overlap fraction
79 int numOverlapsEnd1 = 0;
80 int numOverlapsEnd2 = 0;
81
82 // make sure we have a valid chromosome before we search
83 if (a.chrom1 != ".") {
84 // Find the quality hits between ***end1*** of the BEDPE and the B BED file
85 _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _sameStrand, _diffStrand);
86
87 vector<BED>::const_iterator h = hits1.begin();
88 vector<BED>::const_iterator hitsEnd = hits1.end();
89 for (; h != hitsEnd; ++h) {
90
91 int s = max(a.start1, h->start);
92 int e = min(a.end1, h->end);
93 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
94 int aLength = (a.end1 - a.start1); // the length of a in b.p.
95
96 // is there enough overlap relative to the user's request? (default ~ 1bp)
97 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
98 numOverlapsEnd1++;
99
100 if (type == "either") {
101 _bedA->reportBedPETab(a);
102 _bedB->reportBedNewLine(*h);
103 }
104 else {
105 qualityHits1.push_back(*h);
106 }
107 }
108 }
109 }
110
111
112 // make sure we have a valid chromosome before we search
113 if (a.chrom2 != ".") {
114 // Now find the quality hits between ***end2*** of the BEDPE and the B BED file
115 _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _sameStrand, _diffStrand);
116
117 vector<BED>::const_iterator h = hits2.begin();
118 vector<BED>::const_iterator hitsEnd = hits2.end();
119 for (; h != hitsEnd; ++h) {
120
121 int s = max(a.start2, h->start);
122 int e = min(a.end2, h->end);
123 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
124 int aLength = (a.end2 - a.start2); // the length of a in b.p.
125
126 // is there enough overlap relative to the user's request? (default ~ 1bp)
127 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
128 numOverlapsEnd2++;
129
130 if (type == "either") {
131 _bedA->reportBedPETab(a);
132 _bedB->reportBedNewLine(*h);
133 }
134 else {
135 qualityHits2.push_back(*h);
136 }
137 }
138 }
139 }
140
141 // Now report the hits depending on what the user has requested.
142 if (type == "neither") {
143 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
144 _bedA->reportBedPENewLine(a);
145 }
146 }
147 else if (type == "notboth") {
148 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
149 _bedA->reportBedPENewLine(a);
150 }
151 else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
152 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
153 _bedA->reportBedPETab(a);
154 _bedB->reportBedNewLine(*q);
155 }
156 }
157 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
158 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
159 _bedA->reportBedPETab(a);
160 _bedB->reportBedNewLine(*q);
161 }
162 }
163 }
164 else if (type == "xor") {
165 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
166 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
167 _bedA->reportBedPETab(a);
168 _bedB->reportBedNewLine(*q);
169 }
170 }
171 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
172 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
173 _bedA->reportBedPETab(a);
174 _bedB->reportBedNewLine(*q);
175 }
176 }
177 }
178 else if (type == "both") {
179 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) {
180 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
181 _bedA->reportBedPETab(a);
182 _bedB->reportBedNewLine(*q);
183 }
184 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
185 _bedA->reportBedPETab(a);
186 _bedB->reportBedNewLine(*q);
187 }
188 }
189 }
190 }
191
192
193 bool BedIntersectPE::FindOneOrMoreOverlaps(const BEDPE &a, const string &type) {
194
195 // flags for the existence of hits on each end of BEDPE
196 // that exceed the requested overlap fraction
197 bool end1Found = false;
198 bool end2Found = false;
199
200 // Look for overlaps in end 1 assuming we have an aligned chromosome.
201 if (a.chrom1 != ".") {
202 end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1,
203 _sameStrand, _diffStrand, _overlapFraction);
204
205 // can we bail out without checking end2?
206 if ((type == "either") && (end1Found == true)) return true;
207 else if ((type == "neither") && (end1Found == true)) return false;
208 else if ((type == "notboth") && (end1Found == false)) return true;
209 else if ((type == "both") && (end1Found == false)) return false;
210 }
211
212 // Now look for overlaps in end 2 assuming we have an aligned chromosome.
213 if (a.chrom2 != ".") {
214 end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2,
215 _sameStrand, _diffStrand, _overlapFraction);
216
217 if ((type == "either") && (end2Found == true)) return true;
218 else if ((type == "neither") && (end2Found == true)) return false;
219 else if ((type == "notboth") && (end2Found == false)) return true;
220 else if ((type == "both") && (end2Found == false)) return false;
221 }
222
223 // Now report the hits depending on what the user has requested.
224 if (type == "notboth") {
225 if ( (end1Found == false) || (end2Found == false) ) return true;
226 else return false;
227 }
228 else if (type == "either") {
229 if ( (end1Found == false) && (end2Found == false) ) return false;
230 }
231 else if (type == "neither") {
232 if ( (end1Found == false) && (end2Found == false) ) return true;
233 else return false;
234 }
235 else if (type == "xor") {
236 if ( (end1Found == true) && (end2Found == false) ) return true;
237 else if ( (end1Found == false) && (end2Found == true) ) return true;
238 else return false;
239 }
240 else if (type == "both") {
241 if ( (end1Found == true) && (end2Found == true) ) return true;
242 return false;
243 }
244 return false;
245 }
246
247
248 void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type) {
249
250 // count of hits on _between_ end of BEDPE
251 // that exceed the requested overlap fraction
252 int numOverlaps = 0;
253 CHRPOS spanStart = 0;
254 CHRPOS spanEnd = 0;
255 CHRPOS spanLength = 0;
256
257 if ((type == "ispan") || (type == "notispan")) {
258 spanStart = a.end1;
259 spanEnd = a.start2;
260 if (a.end1 > a.start2) {
261 spanStart = a.end2;
262 spanEnd = a.start1;
263 }
264 }
265 else if ((type == "ospan") || (type == "notospan")) {
266 spanStart = a.start1;
267 spanEnd = a.end2;
268 if (a.start1 > a.start2) {
269 spanStart = a.start2;
270 spanEnd = a.end1;
271 }
272 }
273 spanLength = spanEnd - spanStart;
274
275 // get the hits for the span
276 _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _sameStrand, _diffStrand);
277
278 vector<BED>::const_iterator h = hits.begin();
279 vector<BED>::const_iterator hitsEnd = hits.end();
280 for (; h != hitsEnd; ++h) {
281
282 int s = max(spanStart, h->start);
283 int e = min(spanEnd, h->end);
284 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
285 int spanLength = (spanEnd - spanStart); // the length of a in b.p.
286
287 // is there enough overlap relative to the user's request? (default ~ 1bp)
288 if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) {
289 numOverlaps++;
290 if ((type == "ispan") || (type == "ospan")) {
291 _bedA->reportBedPETab(a);
292 _bedB->reportBedNewLine(*h);
293 }
294 }
295 }
296
297 if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) {
298 _bedA->reportBedPENewLine(a);
299 }
300 }
301
302
303 bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type) {
304
305 int spanStart = 0;
306 int spanEnd = 0;
307 int spanLength = 0;
308 bool overlapFound;
309
310 if ((type == "ispan") || (type == "notispan")) {
311 spanStart = a.end1;
312 spanEnd = a.start2;
313 if (a.end1 > a.start2) {
314 spanStart = a.end2;
315 spanEnd = a.start1;
316 }
317 }
318 else if ((type == "ospan") || (type == "notospan")) {
319 spanStart = a.start1;
320 spanEnd = a.end2;
321 if (a.start1 > a.start2) {
322 spanStart = a.start2;
323 spanEnd = a.end1;
324 }
325 }
326 spanLength = spanEnd - spanStart;
327
328 overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1,
329 _sameStrand, _diffStrand, _overlapFraction);
330
331 return overlapFound;
332 }
333
334
335 void BedIntersectPE::IntersectBedPE() {
336
337 // load the "B" bed file into a map so
338 // that we can easily compare "A" to it for overlaps
339 _bedB->loadBedFileIntoMap();
340
341 int lineNum = 0; // current input line number
342 vector<BED> hits, hits1, hits2; // vector of potential hits
343
344 // reserve some space
345 hits.reserve(100);
346 hits1.reserve(100);
347 hits2.reserve(100);
348
349 BEDPE a, nullBedPE;
350 BedLineStatus bedStatus;
351
352 _bedA->Open();
353 while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) {
354 if (bedStatus == BED_VALID) {
355 if ( (_searchType == "ispan") || (_searchType == "ospan") ||
356 (_searchType == "notispan") || (_searchType == "notospan") ) {
357 if (a.chrom1 == a.chrom2) {
358 FindSpanningOverlaps(a, hits, _searchType);
359 hits.clear();
360 }
361 }
362 else {
363 FindOverlaps(a, hits1, hits2, _searchType);
364 hits1.clear();
365 hits2.clear();
366 }
367 a = nullBedPE;
368 }
369 }
370 _bedA->Close();
371 }
372
373
374 void BedIntersectPE::IntersectBamPE(string bamFile) {
375
376 // load the "B" bed file into a map so
377 // that we can easily compare "A" to it for overlaps
378 _bedB->loadBedFileIntoMap();
379
380 // open the BAM file
381 BamReader reader;
382 BamWriter writer;
383 reader.Open(bamFile);
384
385 // get header & reference information
386 string bamHeader = reader.GetHeaderText();
387 RefVector refs = reader.GetReferenceData();
388
389 // open a BAM output to stdout if we are writing BAM
390 if (_bamOutput == true) {
391 // set compression mode
392 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
393 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
394 writer.SetCompressionMode(compressionMode);
395 // open our BAM writer
396 writer.Open("stdout", bamHeader, refs);
397 }
398
399 // track the previous and current sequence
400 // names so that we can identify blocks of
401 // alignments for a given read ID.
402 string prevName, currName;
403 prevName = currName = "";
404
405 vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file.
406 alignments.reserve(100);
407
408 _bedA->bedType = 10; // it's a full BEDPE given it's BAM
409
410 // rip through the BAM file and convert each mapped entry to BEDPE
411 BamAlignment bam1, bam2;
412 while (reader.GetNextAlignment(bam1)) {
413 // the alignment must be paired
414 if (bam1.IsPaired() == true) {
415 // grab the second alignment for the pair.
416 reader.GetNextAlignment(bam2);
417
418 // require that the alignments are from the same query
419 if (bam1.Name == bam2.Name) {
420 ProcessBamBlock(bam1, bam2, refs, writer);
421 }
422 else {
423 cerr << "*****ERROR: BAM must be sorted by query name. " << endl;
424 exit(1);
425 }
426 }
427 }
428 // close up
429 reader.Close();
430 if (_bamOutput == true) {
431 writer.Close();
432 }
433 }
434
435
436 void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2,
437 const RefVector &refs, BamWriter &writer) {
438
439 vector<BED> hits, hits1, hits2; // vector of potential hits
440 hits.reserve(1000); // reserve some space
441 hits1.reserve(1000);
442 hits2.reserve(1000);
443
444 bool overlapsFound; // flag to indicate if overlaps were found
445
446 if ( (_searchType == "either") || (_searchType == "xor") ||
447 (_searchType == "both") || (_searchType == "notboth") ||
448 (_searchType == "neither") ) {
449
450 // create a new BEDPE feature from the BAM alignments.
451 BEDPE a;
452 ConvertBamToBedPE(bam1, bam2, refs, a);
453 if (_bamOutput == true) { // BAM output
454 // write to BAM if correct hits found
455 overlapsFound = FindOneOrMoreOverlaps(a, _searchType);
456 if (overlapsFound == true) {
457 writer.SaveAlignment(bam1);
458 writer.SaveAlignment(bam2);
459 }
460 }
461 else { // BEDPE output
462 FindOverlaps(a, hits1, hits2, _searchType);
463 hits1.clear();
464 hits2.clear();
465 }
466 }
467 else if ( (_searchType == "ispan") || (_searchType == "ospan") ) {
468 // only look for ispan and ospan when both ends are mapped.
469 if (bam1.IsMapped() && bam2.IsMapped()) {
470 // only do an inspan or outspan check if the alignment is intrachromosomal
471 if (bam1.RefID == bam2.RefID) {
472 // create a new BEDPE feature from the BAM alignments.
473 BEDPE a;
474 ConvertBamToBedPE(bam1, bam2, refs, a);
475 if (_bamOutput == true) { // BAM output
476 // look for overlaps, and write to BAM if >=1 were found
477 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
478 if (overlapsFound == true) {
479 writer.SaveAlignment(bam1);
480 writer.SaveAlignment(bam2);
481 }
482 }
483 else { // BEDPE output
484 FindSpanningOverlaps(a, hits, _searchType);
485 hits.clear();
486 }
487 }
488 }
489 }
490 else if ( (_searchType == "notispan") || (_searchType == "notospan") ) {
491 // only look for notispan and notospan when both ends are mapped.
492 if (bam1.IsMapped() && bam2.IsMapped()) {
493 // only do an inspan or outspan check if the alignment is intrachromosomal
494 if (bam1.RefID == bam2.RefID) {
495 // create a new BEDPE feature from the BAM alignments.
496 BEDPE a;
497 ConvertBamToBedPE(bam1, bam2, refs, a);
498 if (_bamOutput == true) { // BAM output
499 // write to BAM if there were no overlaps
500 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
501 if (overlapsFound == false) {
502 writer.SaveAlignment(bam1);
503 writer.SaveAlignment(bam2);
504 }
505 }
506 else { // BEDPE output
507 FindSpanningOverlaps(a, hits, _searchType);
508 hits.clear();
509 }
510 }
511 // if inter-chromosomal or orphaned, we know it's not ispan and not ospan
512 else if (_bamOutput == true) {
513 writer.SaveAlignment(bam1);
514 writer.SaveAlignment(bam2);
515 }
516 }
517 // if both ends aren't mapped, we know that it's notispan and not ospan
518 else if (_bamOutput == true) {
519 writer.SaveAlignment(bam1);
520 writer.SaveAlignment(bam2);
521 }
522 }
523 }
524
525