0
|
1 /*****************************************************************************
|
|
2 intersectBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "intersectBed.h"
|
|
14
|
|
15 /************************************
|
|
16 Helper functions
|
|
17 ************************************/
|
|
18 bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) {
|
|
19
|
|
20 // how many overlaps are there b/w the bed and the set of hits?
|
|
21 CHRPOS s, e;
|
|
22 int overlapBases;
|
|
23 int numOverlaps = 0;
|
|
24 bool hitsFound = false;
|
|
25 int aLength = (a.end - a.start); // the length of a in b.p.
|
|
26
|
|
27 // loop through the hits and report those that meet the user's criteria
|
|
28 vector<BED>::const_iterator h = hits.begin();
|
|
29 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
30 for (; h != hitsEnd; ++h) {
|
|
31 s = max(a.start, h->start);
|
|
32 e = min(a.end, h->end);
|
|
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
34
|
|
35 // is there enough overlap relative to the user's request? (default ~ 1bp)
|
|
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
|
|
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B.
|
|
38 if (_reciprocal == false) {
|
|
39 hitsFound = true;
|
|
40 numOverlaps++;
|
|
41 if (_printable == true)
|
|
42 ReportOverlapDetail(overlapBases, a, *h, s, e);
|
|
43 }
|
|
44 // we require there to be sufficient __reciprocal__ overlap
|
|
45 else {
|
|
46 int bLength = (h->end - h->start);
|
|
47 float bOverlap = ( (float) overlapBases / (float) bLength );
|
|
48 if (bOverlap >= _overlapFraction) {
|
|
49 hitsFound = true;
|
|
50 numOverlaps++;
|
|
51 if (_printable == true)
|
|
52 ReportOverlapDetail(overlapBases, a, *h, s, e);
|
|
53 }
|
|
54 }
|
|
55 }
|
|
56 }
|
|
57 // report the summary of the overlaps if requested.
|
|
58 ReportOverlapSummary(a, numOverlaps);
|
|
59 // were hits found for this BED feature?
|
|
60 return hitsFound;
|
|
61 }
|
|
62
|
|
63
|
|
64 /*
|
|
65 Constructor
|
|
66 */
|
|
67 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
|
|
68 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
|
|
69 float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand,
|
|
70 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam,
|
|
71 bool sortedInput) {
|
|
72
|
|
73 _bedAFile = bedAFile;
|
|
74 _bedBFile = bedBFile;
|
|
75 _anyHit = anyHit;
|
|
76 _noHit = noHit;
|
|
77 _writeA = writeA;
|
|
78 _writeB = writeB;
|
|
79 _writeOverlap = writeOverlap;
|
|
80 _writeAllOverlap = writeAllOverlap;
|
|
81 _writeCount = writeCount;
|
|
82 _overlapFraction = overlapFraction;
|
|
83 _sameStrand = sameStrand;
|
|
84 _diffStrand = diffStrand;
|
|
85 _reciprocal = reciprocal;
|
|
86 _obeySplits = obeySplits;
|
|
87 _bamInput = bamInput;
|
|
88 _bamOutput = bamOutput;
|
|
89 _isUncompressedBam = isUncompressedBam;
|
|
90 _sortedInput = sortedInput;
|
|
91
|
|
92 // should we print each overlap, or does the user want summary information?
|
|
93 _printable = true;
|
|
94 if (_anyHit || _noHit || _writeCount)
|
|
95 _printable = false;
|
|
96
|
|
97 if (_bamInput == false)
|
|
98 IntersectBed();
|
|
99 else
|
|
100 IntersectBam(bedAFile);
|
|
101 }
|
|
102
|
|
103
|
|
104 /*
|
|
105 Destructor
|
|
106 */
|
|
107 BedIntersect::~BedIntersect(void) {
|
|
108 }
|
|
109
|
|
110
|
|
111 bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) {
|
|
112 bool hitsFound = false;
|
|
113
|
|
114 // collect and report the sufficient hits
|
|
115 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
|
|
116 hitsFound = processHits(a, hits);
|
|
117 return hitsFound;
|
|
118 }
|
|
119
|
|
120
|
|
121 void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) {
|
|
122 // default. simple intersection only
|
|
123 if (_writeA == false && _writeB == false && _writeOverlap == false) {
|
|
124 _bedA->reportBedRangeNewLine(a,s,e);
|
|
125 }
|
|
126 // -wa -wb write the original A and B
|
|
127 else if (_writeA == true && _writeB == true) {
|
|
128 _bedA->reportBedTab(a);
|
|
129 _bedB->reportBedNewLine(b);
|
|
130 }
|
|
131 // -wa write just the original A
|
|
132 else if (_writeA == true) {
|
|
133 _bedA->reportBedNewLine(a);
|
|
134 }
|
|
135 // -wb write the intersected portion of A and the original B
|
|
136 else if (_writeB == true) {
|
|
137 _bedA->reportBedRangeTab(a,s,e);
|
|
138 _bedB->reportBedNewLine(b);
|
|
139 }
|
|
140 // -wo write the original A and B plus the no. of overlapping bases.
|
|
141 else if (_writeOverlap == true) {
|
|
142 _bedA->reportBedTab(a);
|
|
143 _bedB->reportBedTab(b);
|
|
144 if (b.zeroLength == false) printf("%d\n", overlapBases);
|
|
145 else printf("0\n");
|
|
146 }
|
|
147 }
|
|
148
|
|
149
|
|
150 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
|
|
151 // -u just report the fact that there was >= 1 overlaps
|
|
152 if (_anyHit && (numOverlapsFound >= 1)) {
|
|
153 _bedA->reportBedNewLine(a);
|
|
154 }
|
|
155 // -c report the total number of features overlapped in B
|
|
156 else if (_writeCount) {
|
|
157 _bedA->reportBedTab(a);
|
|
158 printf("%d\n", numOverlapsFound);
|
|
159 }
|
|
160 // -v report iff there were no overlaps
|
|
161 else if (_noHit && (numOverlapsFound == 0)) {
|
|
162 _bedA->reportBedNewLine(a);
|
|
163 }
|
|
164 // -wao the user wants to force the reporting of 0 overlap
|
|
165 else if (_writeAllOverlap && (numOverlapsFound == 0)) {
|
|
166 _bedA->reportBedTab(a);
|
|
167 _bedB->reportNullBedTab();
|
|
168 printf("0\n");
|
|
169 }
|
|
170 }
|
|
171
|
|
172
|
|
173 bool BedIntersect::FindOneOrMoreOverlap(const BED &a) {
|
|
174 bool overlapsFound = false;
|
|
175 if (_reciprocal == false) {
|
|
176 overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
|
|
177 _sameStrand, _diffStrand, _overlapFraction);
|
|
178 }
|
|
179 else {
|
|
180 overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand,
|
|
181 _sameStrand, _diffStrand, _overlapFraction);
|
|
182 }
|
|
183 return overlapsFound;
|
|
184 }
|
|
185
|
|
186
|
|
187 void BedIntersect::IntersectBed() {
|
|
188
|
|
189 // create new BED file objects for A and B
|
|
190 _bedA = new BedFile(_bedAFile);
|
|
191 _bedB = new BedFile(_bedBFile);
|
|
192
|
|
193 if (_sortedInput == false) {
|
|
194 // load the "B" file into a map in order to
|
|
195 // compare each entry in A to it in search of overlaps.
|
|
196 _bedB->loadBedFileIntoMap();
|
|
197
|
|
198 int lineNum = 0;
|
|
199 vector<BED> hits;
|
|
200 hits.reserve(100);
|
|
201 BED a, nullBed;
|
|
202 BedLineStatus bedStatus;
|
|
203
|
|
204 // open the "A" file, process each BED entry and searh for overlaps.
|
|
205 _bedA->Open();
|
|
206 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
207 if (bedStatus == BED_VALID) {
|
|
208 // treat the BED as a single "block"
|
|
209 if (_obeySplits == false) {
|
|
210 FindOverlaps(a, hits);
|
|
211 hits.clear();
|
|
212 }
|
|
213 // split the BED12 into blocks and look for overlaps in each discrete block
|
|
214 else {
|
|
215 bedVector bedBlocks; // vec to store the discrete BED "blocks"
|
|
216 splitBedIntoBlocks(a, lineNum, bedBlocks);
|
|
217
|
|
218 vector<BED>::const_iterator bedItr = bedBlocks.begin();
|
|
219 vector<BED>::const_iterator bedEnd = bedBlocks.end();
|
|
220 for (; bedItr != bedEnd; ++bedItr) {
|
|
221 FindOverlaps(*bedItr, hits);
|
|
222 hits.clear();
|
|
223 }
|
|
224 }
|
|
225 }
|
|
226 a = nullBed;
|
|
227 }
|
|
228 _bedA->Close();
|
|
229 }
|
|
230 else {
|
|
231 // use the chromsweep algorithm to detect overlaps on the fly.
|
|
232 ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand);
|
|
233
|
|
234 pair<BED, vector<BED> > hit_set;
|
|
235 hit_set.second.reserve(10000);
|
|
236 while (sweep.Next(hit_set)) {
|
|
237 processHits(hit_set.first, hit_set.second);
|
|
238 }
|
|
239 }
|
|
240 }
|
|
241
|
|
242
|
|
243 void BedIntersect::IntersectBam(string bamFile) {
|
|
244
|
|
245 // load the "B" bed file into a map so
|
|
246 // that we can easily compare "A" to it for overlaps
|
|
247 _bedB = new BedFile(_bedBFile);
|
|
248 _bedB->loadBedFileIntoMap();
|
|
249
|
|
250 // create a dummy BED A file for printing purposes if not
|
|
251 // using BAM output.
|
|
252 if (_bamOutput == false) {
|
|
253 _bedA = new BedFile(_bedAFile);
|
|
254 _bedA->bedType = 6;
|
|
255 }
|
|
256
|
|
257 // open the BAM file
|
|
258 BamReader reader;
|
|
259 BamWriter writer;
|
|
260
|
|
261 reader.Open(bamFile);
|
|
262
|
|
263 // get header & reference information
|
|
264 string bamHeader = reader.GetHeaderText();
|
|
265 RefVector refs = reader.GetReferenceData();
|
|
266
|
|
267 // open a BAM output to stdout if we are writing BAM
|
|
268 if (_bamOutput == true) {
|
|
269 // set compression mode
|
|
270 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
|
|
271 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
|
|
272 writer.SetCompressionMode(compressionMode);
|
|
273 // open our BAM writer
|
|
274 writer.Open("stdout", bamHeader, refs);
|
|
275 }
|
|
276
|
|
277 vector<BED> hits;
|
|
278 // reserve some space
|
|
279 hits.reserve(100);
|
|
280
|
|
281
|
|
282 BamAlignment bam;
|
|
283 // get each set of alignments for each pair.
|
|
284 while (reader.GetNextAlignment(bam)) {
|
|
285
|
|
286 if (bam.IsMapped()) {
|
|
287 BED a;
|
|
288 a.chrom = refs.at(bam.RefID).RefName;
|
|
289 a.start = bam.Position;
|
|
290 a.end = bam.GetEndPosition(false, false);
|
|
291
|
|
292 // build the name field from the BAM alignment.
|
|
293 a.name = bam.Name;
|
|
294 if (bam.IsFirstMate()) a.name += "/1";
|
|
295 if (bam.IsSecondMate()) a.name += "/2";
|
|
296
|
|
297 a.score = ToString(bam.MapQuality);
|
|
298
|
|
299 a.strand = "+";
|
|
300 if (bam.IsReverseStrand()) a.strand = "-";
|
|
301
|
|
302 if (_bamOutput == true) {
|
|
303 bool overlapsFound = false;
|
|
304 // treat the BAM alignment as a single "block"
|
|
305 if (_obeySplits == false) {
|
|
306 overlapsFound = FindOneOrMoreOverlap(a);
|
|
307 }
|
|
308 // split the BAM alignment into discrete blocks and
|
|
309 // look for overlaps only within each block.
|
|
310 else {
|
|
311 bool overlapFoundForBlock;
|
|
312 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
|
|
313 // we don't want to split on "D" ops, hence the "false"
|
|
314 getBamBlocks(bam, refs, bedBlocks, false);
|
|
315
|
|
316 vector<BED>::const_iterator bedItr = bedBlocks.begin();
|
|
317 vector<BED>::const_iterator bedEnd = bedBlocks.end();
|
|
318 for (; bedItr != bedEnd; ++bedItr) {
|
|
319 overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr);
|
|
320 if (overlapFoundForBlock == true)
|
|
321 overlapsFound = true;
|
|
322 }
|
|
323 }
|
|
324 if (overlapsFound == true) {
|
|
325 if (_noHit == false)
|
|
326 writer.SaveAlignment(bam);
|
|
327 }
|
|
328 else {
|
|
329 if (_noHit == true) {
|
|
330 writer.SaveAlignment(bam);
|
|
331 }
|
|
332 }
|
|
333 }
|
|
334 else {
|
|
335 // treat the BAM alignment as a single BED "block"
|
|
336 if (_obeySplits == false) {
|
|
337 FindOverlaps(a, hits);
|
|
338 hits.clear();
|
|
339 }
|
|
340 // split the BAM alignment into discrete BED blocks and
|
|
341 // look for overlaps only within each block.
|
|
342 else {
|
|
343 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a
|
|
344 getBamBlocks(bam, refs, bedBlocks, false);
|
|
345
|
|
346 vector<BED>::const_iterator bedItr = bedBlocks.begin();
|
|
347 vector<BED>::const_iterator bedEnd = bedBlocks.end();
|
|
348 for (; bedItr != bedEnd; ++bedItr) {
|
|
349 FindOverlaps(*bedItr, hits);
|
|
350 hits.clear();
|
|
351 }
|
|
352 }
|
|
353 }
|
|
354 }
|
|
355 // BAM IsMapped() is false
|
|
356 else if (_noHit == true) {
|
|
357 writer.SaveAlignment(bam);
|
|
358 }
|
|
359 }
|
|
360
|
|
361 // close the relevant BAM files.
|
|
362 reader.Close();
|
|
363 if (_bamOutput == true) {
|
|
364 writer.Close();
|
|
365 }
|
|
366 }
|
|
367
|