Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/intersectBed/intersectBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 intersectBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "intersectBed.h" | |
14 | |
15 /************************************ | |
16 Helper functions | |
17 ************************************/ | |
18 bool BedIntersect::processHits(const BED &a, const vector<BED> &hits) { | |
19 | |
20 // how many overlaps are there b/w the bed and the set of hits? | |
21 CHRPOS s, e; | |
22 int overlapBases; | |
23 int numOverlaps = 0; | |
24 bool hitsFound = false; | |
25 int aLength = (a.end - a.start); // the length of a in b.p. | |
26 | |
27 // loop through the hits and report those that meet the user's criteria | |
28 vector<BED>::const_iterator h = hits.begin(); | |
29 vector<BED>::const_iterator hitsEnd = hits.end(); | |
30 for (; h != hitsEnd; ++h) { | |
31 s = max(a.start, h->start); | |
32 e = min(a.end, h->end); | |
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
34 | |
35 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B. | |
38 if (_reciprocal == false) { | |
39 hitsFound = true; | |
40 numOverlaps++; | |
41 if (_printable == true) | |
42 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
43 } | |
44 // we require there to be sufficient __reciprocal__ overlap | |
45 else { | |
46 int bLength = (h->end - h->start); | |
47 float bOverlap = ( (float) overlapBases / (float) bLength ); | |
48 if (bOverlap >= _overlapFraction) { | |
49 hitsFound = true; | |
50 numOverlaps++; | |
51 if (_printable == true) | |
52 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
53 } | |
54 } | |
55 } | |
56 } | |
57 // report the summary of the overlaps if requested. | |
58 ReportOverlapSummary(a, numOverlaps); | |
59 // were hits found for this BED feature? | |
60 return hitsFound; | |
61 } | |
62 | |
63 | |
64 /* | |
65 Constructor | |
66 */ | |
67 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, | |
68 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, | |
69 float overlapFraction, bool noHit, bool writeCount, bool sameStrand, bool diffStrand, | |
70 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput, bool isUncompressedBam, | |
71 bool sortedInput) { | |
72 | |
73 _bedAFile = bedAFile; | |
74 _bedBFile = bedBFile; | |
75 _anyHit = anyHit; | |
76 _noHit = noHit; | |
77 _writeA = writeA; | |
78 _writeB = writeB; | |
79 _writeOverlap = writeOverlap; | |
80 _writeAllOverlap = writeAllOverlap; | |
81 _writeCount = writeCount; | |
82 _overlapFraction = overlapFraction; | |
83 _sameStrand = sameStrand; | |
84 _diffStrand = diffStrand; | |
85 _reciprocal = reciprocal; | |
86 _obeySplits = obeySplits; | |
87 _bamInput = bamInput; | |
88 _bamOutput = bamOutput; | |
89 _isUncompressedBam = isUncompressedBam; | |
90 _sortedInput = sortedInput; | |
91 | |
92 // should we print each overlap, or does the user want summary information? | |
93 _printable = true; | |
94 if (_anyHit || _noHit || _writeCount) | |
95 _printable = false; | |
96 | |
97 if (_bamInput == false) | |
98 IntersectBed(); | |
99 else | |
100 IntersectBam(bedAFile); | |
101 } | |
102 | |
103 | |
104 /* | |
105 Destructor | |
106 */ | |
107 BedIntersect::~BedIntersect(void) { | |
108 } | |
109 | |
110 | |
111 bool BedIntersect::FindOverlaps(const BED &a, vector<BED> &hits) { | |
112 bool hitsFound = false; | |
113 | |
114 // collect and report the sufficient hits | |
115 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); | |
116 hitsFound = processHits(a, hits); | |
117 return hitsFound; | |
118 } | |
119 | |
120 | |
121 void BedIntersect::ReportOverlapDetail(int overlapBases, const BED &a, const BED &b, CHRPOS s, CHRPOS e) { | |
122 // default. simple intersection only | |
123 if (_writeA == false && _writeB == false && _writeOverlap == false) { | |
124 _bedA->reportBedRangeNewLine(a,s,e); | |
125 } | |
126 // -wa -wb write the original A and B | |
127 else if (_writeA == true && _writeB == true) { | |
128 _bedA->reportBedTab(a); | |
129 _bedB->reportBedNewLine(b); | |
130 } | |
131 // -wa write just the original A | |
132 else if (_writeA == true) { | |
133 _bedA->reportBedNewLine(a); | |
134 } | |
135 // -wb write the intersected portion of A and the original B | |
136 else if (_writeB == true) { | |
137 _bedA->reportBedRangeTab(a,s,e); | |
138 _bedB->reportBedNewLine(b); | |
139 } | |
140 // -wo write the original A and B plus the no. of overlapping bases. | |
141 else if (_writeOverlap == true) { | |
142 _bedA->reportBedTab(a); | |
143 _bedB->reportBedTab(b); | |
144 if (b.zeroLength == false) printf("%d\n", overlapBases); | |
145 else printf("0\n"); | |
146 } | |
147 } | |
148 | |
149 | |
150 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { | |
151 // -u just report the fact that there was >= 1 overlaps | |
152 if (_anyHit && (numOverlapsFound >= 1)) { | |
153 _bedA->reportBedNewLine(a); | |
154 } | |
155 // -c report the total number of features overlapped in B | |
156 else if (_writeCount) { | |
157 _bedA->reportBedTab(a); | |
158 printf("%d\n", numOverlapsFound); | |
159 } | |
160 // -v report iff there were no overlaps | |
161 else if (_noHit && (numOverlapsFound == 0)) { | |
162 _bedA->reportBedNewLine(a); | |
163 } | |
164 // -wao the user wants to force the reporting of 0 overlap | |
165 else if (_writeAllOverlap && (numOverlapsFound == 0)) { | |
166 _bedA->reportBedTab(a); | |
167 _bedB->reportNullBedTab(); | |
168 printf("0\n"); | |
169 } | |
170 } | |
171 | |
172 | |
173 bool BedIntersect::FindOneOrMoreOverlap(const BED &a) { | |
174 bool overlapsFound = false; | |
175 if (_reciprocal == false) { | |
176 overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, | |
177 _sameStrand, _diffStrand, _overlapFraction); | |
178 } | |
179 else { | |
180 overlapsFound = _bedB->FindOneOrMoreReciprocalOverlapsPerBin(a.chrom, a.start, a.end, a.strand, | |
181 _sameStrand, _diffStrand, _overlapFraction); | |
182 } | |
183 return overlapsFound; | |
184 } | |
185 | |
186 | |
187 void BedIntersect::IntersectBed() { | |
188 | |
189 // create new BED file objects for A and B | |
190 _bedA = new BedFile(_bedAFile); | |
191 _bedB = new BedFile(_bedBFile); | |
192 | |
193 if (_sortedInput == false) { | |
194 // load the "B" file into a map in order to | |
195 // compare each entry in A to it in search of overlaps. | |
196 _bedB->loadBedFileIntoMap(); | |
197 | |
198 int lineNum = 0; | |
199 vector<BED> hits; | |
200 hits.reserve(100); | |
201 BED a, nullBed; | |
202 BedLineStatus bedStatus; | |
203 | |
204 // open the "A" file, process each BED entry and searh for overlaps. | |
205 _bedA->Open(); | |
206 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
207 if (bedStatus == BED_VALID) { | |
208 // treat the BED as a single "block" | |
209 if (_obeySplits == false) { | |
210 FindOverlaps(a, hits); | |
211 hits.clear(); | |
212 } | |
213 // split the BED12 into blocks and look for overlaps in each discrete block | |
214 else { | |
215 bedVector bedBlocks; // vec to store the discrete BED "blocks" | |
216 splitBedIntoBlocks(a, lineNum, bedBlocks); | |
217 | |
218 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
219 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
220 for (; bedItr != bedEnd; ++bedItr) { | |
221 FindOverlaps(*bedItr, hits); | |
222 hits.clear(); | |
223 } | |
224 } | |
225 } | |
226 a = nullBed; | |
227 } | |
228 _bedA->Close(); | |
229 } | |
230 else { | |
231 // use the chromsweep algorithm to detect overlaps on the fly. | |
232 ChromSweep sweep = ChromSweep(_bedA, _bedB, _sameStrand, _diffStrand); | |
233 | |
234 pair<BED, vector<BED> > hit_set; | |
235 hit_set.second.reserve(10000); | |
236 while (sweep.Next(hit_set)) { | |
237 processHits(hit_set.first, hit_set.second); | |
238 } | |
239 } | |
240 } | |
241 | |
242 | |
243 void BedIntersect::IntersectBam(string bamFile) { | |
244 | |
245 // load the "B" bed file into a map so | |
246 // that we can easily compare "A" to it for overlaps | |
247 _bedB = new BedFile(_bedBFile); | |
248 _bedB->loadBedFileIntoMap(); | |
249 | |
250 // create a dummy BED A file for printing purposes if not | |
251 // using BAM output. | |
252 if (_bamOutput == false) { | |
253 _bedA = new BedFile(_bedAFile); | |
254 _bedA->bedType = 6; | |
255 } | |
256 | |
257 // open the BAM file | |
258 BamReader reader; | |
259 BamWriter writer; | |
260 | |
261 reader.Open(bamFile); | |
262 | |
263 // get header & reference information | |
264 string bamHeader = reader.GetHeaderText(); | |
265 RefVector refs = reader.GetReferenceData(); | |
266 | |
267 // open a BAM output to stdout if we are writing BAM | |
268 if (_bamOutput == true) { | |
269 // set compression mode | |
270 BamWriter::CompressionMode compressionMode = BamWriter::Compressed; | |
271 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; | |
272 writer.SetCompressionMode(compressionMode); | |
273 // open our BAM writer | |
274 writer.Open("stdout", bamHeader, refs); | |
275 } | |
276 | |
277 vector<BED> hits; | |
278 // reserve some space | |
279 hits.reserve(100); | |
280 | |
281 | |
282 BamAlignment bam; | |
283 // get each set of alignments for each pair. | |
284 while (reader.GetNextAlignment(bam)) { | |
285 | |
286 if (bam.IsMapped()) { | |
287 BED a; | |
288 a.chrom = refs.at(bam.RefID).RefName; | |
289 a.start = bam.Position; | |
290 a.end = bam.GetEndPosition(false, false); | |
291 | |
292 // build the name field from the BAM alignment. | |
293 a.name = bam.Name; | |
294 if (bam.IsFirstMate()) a.name += "/1"; | |
295 if (bam.IsSecondMate()) a.name += "/2"; | |
296 | |
297 a.score = ToString(bam.MapQuality); | |
298 | |
299 a.strand = "+"; | |
300 if (bam.IsReverseStrand()) a.strand = "-"; | |
301 | |
302 if (_bamOutput == true) { | |
303 bool overlapsFound = false; | |
304 // treat the BAM alignment as a single "block" | |
305 if (_obeySplits == false) { | |
306 overlapsFound = FindOneOrMoreOverlap(a); | |
307 } | |
308 // split the BAM alignment into discrete blocks and | |
309 // look for overlaps only within each block. | |
310 else { | |
311 bool overlapFoundForBlock; | |
312 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a | |
313 // we don't want to split on "D" ops, hence the "false" | |
314 getBamBlocks(bam, refs, bedBlocks, false); | |
315 | |
316 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
317 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
318 for (; bedItr != bedEnd; ++bedItr) { | |
319 overlapFoundForBlock = FindOneOrMoreOverlap(*bedItr); | |
320 if (overlapFoundForBlock == true) | |
321 overlapsFound = true; | |
322 } | |
323 } | |
324 if (overlapsFound == true) { | |
325 if (_noHit == false) | |
326 writer.SaveAlignment(bam); | |
327 } | |
328 else { | |
329 if (_noHit == true) { | |
330 writer.SaveAlignment(bam); | |
331 } | |
332 } | |
333 } | |
334 else { | |
335 // treat the BAM alignment as a single BED "block" | |
336 if (_obeySplits == false) { | |
337 FindOverlaps(a, hits); | |
338 hits.clear(); | |
339 } | |
340 // split the BAM alignment into discrete BED blocks and | |
341 // look for overlaps only within each block. | |
342 else { | |
343 bedVector bedBlocks; // vec to store the discrete BED "blocks" from a | |
344 getBamBlocks(bam, refs, bedBlocks, false); | |
345 | |
346 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
347 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
348 for (; bedItr != bedEnd; ++bedItr) { | |
349 FindOverlaps(*bedItr, hits); | |
350 hits.clear(); | |
351 } | |
352 } | |
353 } | |
354 } | |
355 // BAM IsMapped() is false | |
356 else if (_noHit == true) { | |
357 writer.SaveAlignment(bam); | |
358 } | |
359 } | |
360 | |
361 // close the relevant BAM files. | |
362 reader.Close(); | |
363 if (_bamOutput == true) { | |
364 writer.Close(); | |
365 } | |
366 } | |
367 |