0
|
1 /*****************************************************************************
|
|
2 pairToBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "pairToBed.h"
|
|
14
|
|
15
|
|
16 bool IsCorrectMappingForBEDPE (const BamAlignment &bam) {
|
|
17
|
|
18 if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize > 0) ) {
|
|
19 return true;
|
|
20 }
|
|
21 else if ( (bam.RefID == bam.MateRefID) && (bam.InsertSize == 0) && bam.IsFirstMate() ) {
|
|
22 return true;
|
|
23 }
|
|
24 else if ( (bam.RefID != bam.MateRefID) && bam.IsFirstMate() ) {
|
|
25 return true;
|
|
26 }
|
|
27 else return false;
|
|
28 }
|
|
29
|
|
30
|
|
31 /*
|
|
32 Constructor
|
|
33 */
|
|
34
|
|
35
|
|
36 BedIntersectPE::BedIntersectPE(string bedAFilePE, string bedBFile, float overlapFraction,
|
|
37 string searchType, bool sameStrand, bool diffStrand, bool bamInput,
|
|
38 bool bamOutput, bool uncompressedBam, bool useEditDistance) {
|
|
39
|
|
40 _bedAFilePE = bedAFilePE;
|
|
41 _bedBFile = bedBFile;
|
|
42 _overlapFraction = overlapFraction;
|
|
43 _sameStrand = sameStrand;
|
|
44 _diffStrand = diffStrand;
|
|
45 _useEditDistance = useEditDistance;
|
|
46 _searchType = searchType;
|
|
47 _bamInput = bamInput;
|
|
48 _bamOutput = bamOutput;
|
|
49 _isUncompressedBam = uncompressedBam;
|
|
50
|
|
51 _bedA = new BedFilePE(bedAFilePE);
|
|
52 _bedB = new BedFile(bedBFile);
|
|
53
|
|
54 if (_bamInput == false)
|
|
55 IntersectBedPE();
|
|
56 else
|
|
57 IntersectBamPE(_bedAFilePE);
|
|
58 }
|
|
59
|
|
60
|
|
61 /*
|
|
62 Destructor
|
|
63 */
|
|
64
|
|
65 BedIntersectPE::~BedIntersectPE(void) {
|
|
66 }
|
|
67
|
|
68
|
|
69
|
|
70 void BedIntersectPE::FindOverlaps(const BEDPE &a, vector<BED> &hits1, vector<BED> &hits2, const string &type) {
|
|
71
|
|
72 // list of hits on each end of BEDPE
|
|
73 // that exceed the requested overlap fraction
|
|
74 vector<BED> qualityHits1;
|
|
75 vector<BED> qualityHits2;
|
|
76
|
|
77 // count of hits on each end of BEDPE
|
|
78 // that exceed the requested overlap fraction
|
|
79 int numOverlapsEnd1 = 0;
|
|
80 int numOverlapsEnd2 = 0;
|
|
81
|
|
82 // make sure we have a valid chromosome before we search
|
|
83 if (a.chrom1 != ".") {
|
|
84 // Find the quality hits between ***end1*** of the BEDPE and the B BED file
|
|
85 _bedB->FindOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1, hits1, _sameStrand, _diffStrand);
|
|
86
|
|
87 vector<BED>::const_iterator h = hits1.begin();
|
|
88 vector<BED>::const_iterator hitsEnd = hits1.end();
|
|
89 for (; h != hitsEnd; ++h) {
|
|
90
|
|
91 int s = max(a.start1, h->start);
|
|
92 int e = min(a.end1, h->end);
|
|
93 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
94 int aLength = (a.end1 - a.start1); // the length of a in b.p.
|
|
95
|
|
96 // is there enough overlap relative to the user's request? (default ~ 1bp)
|
|
97 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
|
|
98 numOverlapsEnd1++;
|
|
99
|
|
100 if (type == "either") {
|
|
101 _bedA->reportBedPETab(a);
|
|
102 _bedB->reportBedNewLine(*h);
|
|
103 }
|
|
104 else {
|
|
105 qualityHits1.push_back(*h);
|
|
106 }
|
|
107 }
|
|
108 }
|
|
109 }
|
|
110
|
|
111
|
|
112 // make sure we have a valid chromosome before we search
|
|
113 if (a.chrom2 != ".") {
|
|
114 // Now find the quality hits between ***end2*** of the BEDPE and the B BED file
|
|
115 _bedB->FindOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2, hits2, _sameStrand, _diffStrand);
|
|
116
|
|
117 vector<BED>::const_iterator h = hits2.begin();
|
|
118 vector<BED>::const_iterator hitsEnd = hits2.end();
|
|
119 for (; h != hitsEnd; ++h) {
|
|
120
|
|
121 int s = max(a.start2, h->start);
|
|
122 int e = min(a.end2, h->end);
|
|
123 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
124 int aLength = (a.end2 - a.start2); // the length of a in b.p.
|
|
125
|
|
126 // is there enough overlap relative to the user's request? (default ~ 1bp)
|
|
127 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
|
|
128 numOverlapsEnd2++;
|
|
129
|
|
130 if (type == "either") {
|
|
131 _bedA->reportBedPETab(a);
|
|
132 _bedB->reportBedNewLine(*h);
|
|
133 }
|
|
134 else {
|
|
135 qualityHits2.push_back(*h);
|
|
136 }
|
|
137 }
|
|
138 }
|
|
139 }
|
|
140
|
|
141 // Now report the hits depending on what the user has requested.
|
|
142 if (type == "neither") {
|
|
143 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
|
|
144 _bedA->reportBedPENewLine(a);
|
|
145 }
|
|
146 }
|
|
147 else if (type == "notboth") {
|
|
148 if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 == 0) ) {
|
|
149 _bedA->reportBedPENewLine(a);
|
|
150 }
|
|
151 else if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
|
|
152 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
|
|
153 _bedA->reportBedPETab(a);
|
|
154 _bedB->reportBedNewLine(*q);
|
|
155 }
|
|
156 }
|
|
157 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
|
|
158 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
|
|
159 _bedA->reportBedPETab(a);
|
|
160 _bedB->reportBedNewLine(*q);
|
|
161 }
|
|
162 }
|
|
163 }
|
|
164 else if (type == "xor") {
|
|
165 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 == 0) ) {
|
|
166 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
|
|
167 _bedA->reportBedPETab(a);
|
|
168 _bedB->reportBedNewLine(*q);
|
|
169 }
|
|
170 }
|
|
171 else if ( (numOverlapsEnd1 == 0) && (numOverlapsEnd2 > 0) ) {
|
|
172 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
|
|
173 _bedA->reportBedPETab(a);
|
|
174 _bedB->reportBedNewLine(*q);
|
|
175 }
|
|
176 }
|
|
177 }
|
|
178 else if (type == "both") {
|
|
179 if ( (numOverlapsEnd1 > 0) && (numOverlapsEnd2 > 0) ) {
|
|
180 for (vector<BED>::iterator q = qualityHits1.begin(); q != qualityHits1.end(); ++q) {
|
|
181 _bedA->reportBedPETab(a);
|
|
182 _bedB->reportBedNewLine(*q);
|
|
183 }
|
|
184 for (vector<BED>::iterator q = qualityHits2.begin(); q != qualityHits2.end(); ++q) {
|
|
185 _bedA->reportBedPETab(a);
|
|
186 _bedB->reportBedNewLine(*q);
|
|
187 }
|
|
188 }
|
|
189 }
|
|
190 }
|
|
191
|
|
192
|
|
193 bool BedIntersectPE::FindOneOrMoreOverlaps(const BEDPE &a, const string &type) {
|
|
194
|
|
195 // flags for the existence of hits on each end of BEDPE
|
|
196 // that exceed the requested overlap fraction
|
|
197 bool end1Found = false;
|
|
198 bool end2Found = false;
|
|
199
|
|
200 // Look for overlaps in end 1 assuming we have an aligned chromosome.
|
|
201 if (a.chrom1 != ".") {
|
|
202 end1Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, a.start1, a.end1, a.strand1,
|
|
203 _sameStrand, _diffStrand, _overlapFraction);
|
|
204
|
|
205 // can we bail out without checking end2?
|
|
206 if ((type == "either") && (end1Found == true)) return true;
|
|
207 else if ((type == "neither") && (end1Found == true)) return false;
|
|
208 else if ((type == "notboth") && (end1Found == false)) return true;
|
|
209 else if ((type == "both") && (end1Found == false)) return false;
|
|
210 }
|
|
211
|
|
212 // Now look for overlaps in end 2 assuming we have an aligned chromosome.
|
|
213 if (a.chrom2 != ".") {
|
|
214 end2Found = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom2, a.start2, a.end2, a.strand2,
|
|
215 _sameStrand, _diffStrand, _overlapFraction);
|
|
216
|
|
217 if ((type == "either") && (end2Found == true)) return true;
|
|
218 else if ((type == "neither") && (end2Found == true)) return false;
|
|
219 else if ((type == "notboth") && (end2Found == false)) return true;
|
|
220 else if ((type == "both") && (end2Found == false)) return false;
|
|
221 }
|
|
222
|
|
223 // Now report the hits depending on what the user has requested.
|
|
224 if (type == "notboth") {
|
|
225 if ( (end1Found == false) || (end2Found == false) ) return true;
|
|
226 else return false;
|
|
227 }
|
|
228 else if (type == "either") {
|
|
229 if ( (end1Found == false) && (end2Found == false) ) return false;
|
|
230 }
|
|
231 else if (type == "neither") {
|
|
232 if ( (end1Found == false) && (end2Found == false) ) return true;
|
|
233 else return false;
|
|
234 }
|
|
235 else if (type == "xor") {
|
|
236 if ( (end1Found == true) && (end2Found == false) ) return true;
|
|
237 else if ( (end1Found == false) && (end2Found == true) ) return true;
|
|
238 else return false;
|
|
239 }
|
|
240 else if (type == "both") {
|
|
241 if ( (end1Found == true) && (end2Found == true) ) return true;
|
|
242 return false;
|
|
243 }
|
|
244 return false;
|
|
245 }
|
|
246
|
|
247
|
|
248 void BedIntersectPE::FindSpanningOverlaps(const BEDPE &a, vector<BED> &hits, const string &type) {
|
|
249
|
|
250 // count of hits on _between_ end of BEDPE
|
|
251 // that exceed the requested overlap fraction
|
|
252 int numOverlaps = 0;
|
|
253 CHRPOS spanStart = 0;
|
|
254 CHRPOS spanEnd = 0;
|
|
255 CHRPOS spanLength = 0;
|
|
256
|
|
257 if ((type == "ispan") || (type == "notispan")) {
|
|
258 spanStart = a.end1;
|
|
259 spanEnd = a.start2;
|
|
260 if (a.end1 > a.start2) {
|
|
261 spanStart = a.end2;
|
|
262 spanEnd = a.start1;
|
|
263 }
|
|
264 }
|
|
265 else if ((type == "ospan") || (type == "notospan")) {
|
|
266 spanStart = a.start1;
|
|
267 spanEnd = a.end2;
|
|
268 if (a.start1 > a.start2) {
|
|
269 spanStart = a.start2;
|
|
270 spanEnd = a.end1;
|
|
271 }
|
|
272 }
|
|
273 spanLength = spanEnd - spanStart;
|
|
274
|
|
275 // get the hits for the span
|
|
276 _bedB->FindOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1, hits, _sameStrand, _diffStrand);
|
|
277
|
|
278 vector<BED>::const_iterator h = hits.begin();
|
|
279 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
280 for (; h != hitsEnd; ++h) {
|
|
281
|
|
282 int s = max(spanStart, h->start);
|
|
283 int e = min(spanEnd, h->end);
|
|
284 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
285 int spanLength = (spanEnd - spanStart); // the length of a in b.p.
|
|
286
|
|
287 // is there enough overlap relative to the user's request? (default ~ 1bp)
|
|
288 if ( ( (float) overlapBases / (float) spanLength ) >= _overlapFraction ) {
|
|
289 numOverlaps++;
|
|
290 if ((type == "ispan") || (type == "ospan")) {
|
|
291 _bedA->reportBedPETab(a);
|
|
292 _bedB->reportBedNewLine(*h);
|
|
293 }
|
|
294 }
|
|
295 }
|
|
296
|
|
297 if ( ( (type == "notispan") || (type == "notospan") ) && numOverlaps == 0 ) {
|
|
298 _bedA->reportBedPENewLine(a);
|
|
299 }
|
|
300 }
|
|
301
|
|
302
|
|
303 bool BedIntersectPE::FindOneOrMoreSpanningOverlaps(const BEDPE &a, const string &type) {
|
|
304
|
|
305 int spanStart = 0;
|
|
306 int spanEnd = 0;
|
|
307 int spanLength = 0;
|
|
308 bool overlapFound;
|
|
309
|
|
310 if ((type == "ispan") || (type == "notispan")) {
|
|
311 spanStart = a.end1;
|
|
312 spanEnd = a.start2;
|
|
313 if (a.end1 > a.start2) {
|
|
314 spanStart = a.end2;
|
|
315 spanEnd = a.start1;
|
|
316 }
|
|
317 }
|
|
318 else if ((type == "ospan") || (type == "notospan")) {
|
|
319 spanStart = a.start1;
|
|
320 spanEnd = a.end2;
|
|
321 if (a.start1 > a.start2) {
|
|
322 spanStart = a.start2;
|
|
323 spanEnd = a.end1;
|
|
324 }
|
|
325 }
|
|
326 spanLength = spanEnd - spanStart;
|
|
327
|
|
328 overlapFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom1, spanStart, spanEnd, a.strand1,
|
|
329 _sameStrand, _diffStrand, _overlapFraction);
|
|
330
|
|
331 return overlapFound;
|
|
332 }
|
|
333
|
|
334
|
|
335 void BedIntersectPE::IntersectBedPE() {
|
|
336
|
|
337 // load the "B" bed file into a map so
|
|
338 // that we can easily compare "A" to it for overlaps
|
|
339 _bedB->loadBedFileIntoMap();
|
|
340
|
|
341 int lineNum = 0; // current input line number
|
|
342 vector<BED> hits, hits1, hits2; // vector of potential hits
|
|
343
|
|
344 // reserve some space
|
|
345 hits.reserve(100);
|
|
346 hits1.reserve(100);
|
|
347 hits2.reserve(100);
|
|
348
|
|
349 BEDPE a, nullBedPE;
|
|
350 BedLineStatus bedStatus;
|
|
351
|
|
352 _bedA->Open();
|
|
353 while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) {
|
|
354 if (bedStatus == BED_VALID) {
|
|
355 if ( (_searchType == "ispan") || (_searchType == "ospan") ||
|
|
356 (_searchType == "notispan") || (_searchType == "notospan") ) {
|
|
357 if (a.chrom1 == a.chrom2) {
|
|
358 FindSpanningOverlaps(a, hits, _searchType);
|
|
359 hits.clear();
|
|
360 }
|
|
361 }
|
|
362 else {
|
|
363 FindOverlaps(a, hits1, hits2, _searchType);
|
|
364 hits1.clear();
|
|
365 hits2.clear();
|
|
366 }
|
|
367 a = nullBedPE;
|
|
368 }
|
|
369 }
|
|
370 _bedA->Close();
|
|
371 }
|
|
372
|
|
373
|
|
374 void BedIntersectPE::IntersectBamPE(string bamFile) {
|
|
375
|
|
376 // load the "B" bed file into a map so
|
|
377 // that we can easily compare "A" to it for overlaps
|
|
378 _bedB->loadBedFileIntoMap();
|
|
379
|
|
380 // open the BAM file
|
|
381 BamReader reader;
|
|
382 BamWriter writer;
|
|
383 reader.Open(bamFile);
|
|
384
|
|
385 // get header & reference information
|
|
386 string bamHeader = reader.GetHeaderText();
|
|
387 RefVector refs = reader.GetReferenceData();
|
|
388
|
|
389 // open a BAM output to stdout if we are writing BAM
|
|
390 if (_bamOutput == true) {
|
|
391 // set compression mode
|
|
392 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
|
|
393 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
|
|
394 writer.SetCompressionMode(compressionMode);
|
|
395 // open our BAM writer
|
|
396 writer.Open("stdout", bamHeader, refs);
|
|
397 }
|
|
398
|
|
399 // track the previous and current sequence
|
|
400 // names so that we can identify blocks of
|
|
401 // alignments for a given read ID.
|
|
402 string prevName, currName;
|
|
403 prevName = currName = "";
|
|
404
|
|
405 vector<BamAlignment> alignments; // vector of BAM alignments for a given ID in a BAM file.
|
|
406 alignments.reserve(100);
|
|
407
|
|
408 _bedA->bedType = 10; // it's a full BEDPE given it's BAM
|
|
409
|
|
410 // rip through the BAM file and convert each mapped entry to BEDPE
|
|
411 BamAlignment bam1, bam2;
|
|
412 while (reader.GetNextAlignment(bam1)) {
|
|
413 // the alignment must be paired
|
|
414 if (bam1.IsPaired() == true) {
|
|
415 // grab the second alignment for the pair.
|
|
416 reader.GetNextAlignment(bam2);
|
|
417
|
|
418 // require that the alignments are from the same query
|
|
419 if (bam1.Name == bam2.Name) {
|
|
420 ProcessBamBlock(bam1, bam2, refs, writer);
|
|
421 }
|
|
422 else {
|
|
423 cerr << "*****ERROR: BAM must be sorted by query name. " << endl;
|
|
424 exit(1);
|
|
425 }
|
|
426 }
|
|
427 }
|
|
428 // close up
|
|
429 reader.Close();
|
|
430 if (_bamOutput == true) {
|
|
431 writer.Close();
|
|
432 }
|
|
433 }
|
|
434
|
|
435
|
|
436 void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2,
|
|
437 const RefVector &refs, BamWriter &writer) {
|
|
438
|
|
439 vector<BED> hits, hits1, hits2; // vector of potential hits
|
|
440 hits.reserve(1000); // reserve some space
|
|
441 hits1.reserve(1000);
|
|
442 hits2.reserve(1000);
|
|
443
|
|
444 bool overlapsFound; // flag to indicate if overlaps were found
|
|
445
|
|
446 if ( (_searchType == "either") || (_searchType == "xor") ||
|
|
447 (_searchType == "both") || (_searchType == "notboth") ||
|
|
448 (_searchType == "neither") ) {
|
|
449
|
|
450 // create a new BEDPE feature from the BAM alignments.
|
|
451 BEDPE a;
|
|
452 ConvertBamToBedPE(bam1, bam2, refs, a);
|
|
453 if (_bamOutput == true) { // BAM output
|
|
454 // write to BAM if correct hits found
|
|
455 overlapsFound = FindOneOrMoreOverlaps(a, _searchType);
|
|
456 if (overlapsFound == true) {
|
|
457 writer.SaveAlignment(bam1);
|
|
458 writer.SaveAlignment(bam2);
|
|
459 }
|
|
460 }
|
|
461 else { // BEDPE output
|
|
462 FindOverlaps(a, hits1, hits2, _searchType);
|
|
463 hits1.clear();
|
|
464 hits2.clear();
|
|
465 }
|
|
466 }
|
|
467 else if ( (_searchType == "ispan") || (_searchType == "ospan") ) {
|
|
468 // only look for ispan and ospan when both ends are mapped.
|
|
469 if (bam1.IsMapped() && bam2.IsMapped()) {
|
|
470 // only do an inspan or outspan check if the alignment is intrachromosomal
|
|
471 if (bam1.RefID == bam2.RefID) {
|
|
472 // create a new BEDPE feature from the BAM alignments.
|
|
473 BEDPE a;
|
|
474 ConvertBamToBedPE(bam1, bam2, refs, a);
|
|
475 if (_bamOutput == true) { // BAM output
|
|
476 // look for overlaps, and write to BAM if >=1 were found
|
|
477 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
|
|
478 if (overlapsFound == true) {
|
|
479 writer.SaveAlignment(bam1);
|
|
480 writer.SaveAlignment(bam2);
|
|
481 }
|
|
482 }
|
|
483 else { // BEDPE output
|
|
484 FindSpanningOverlaps(a, hits, _searchType);
|
|
485 hits.clear();
|
|
486 }
|
|
487 }
|
|
488 }
|
|
489 }
|
|
490 else if ( (_searchType == "notispan") || (_searchType == "notospan") ) {
|
|
491 // only look for notispan and notospan when both ends are mapped.
|
|
492 if (bam1.IsMapped() && bam2.IsMapped()) {
|
|
493 // only do an inspan or outspan check if the alignment is intrachromosomal
|
|
494 if (bam1.RefID == bam2.RefID) {
|
|
495 // create a new BEDPE feature from the BAM alignments.
|
|
496 BEDPE a;
|
|
497 ConvertBamToBedPE(bam1, bam2, refs, a);
|
|
498 if (_bamOutput == true) { // BAM output
|
|
499 // write to BAM if there were no overlaps
|
|
500 overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
|
|
501 if (overlapsFound == false) {
|
|
502 writer.SaveAlignment(bam1);
|
|
503 writer.SaveAlignment(bam2);
|
|
504 }
|
|
505 }
|
|
506 else { // BEDPE output
|
|
507 FindSpanningOverlaps(a, hits, _searchType);
|
|
508 hits.clear();
|
|
509 }
|
|
510 }
|
|
511 // if inter-chromosomal or orphaned, we know it's not ispan and not ospan
|
|
512 else if (_bamOutput == true) {
|
|
513 writer.SaveAlignment(bam1);
|
|
514 writer.SaveAlignment(bam2);
|
|
515 }
|
|
516 }
|
|
517 // if both ends aren't mapped, we know that it's notispan and not ospan
|
|
518 else if (_bamOutput == true) {
|
|
519 writer.SaveAlignment(bam1);
|
|
520 writer.SaveAlignment(bam2);
|
|
521 }
|
|
522 }
|
|
523 }
|
|
524
|
|
525
|