Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/fjoin/fjoin.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 intersectBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "fjoin.h" | |
14 #include <queue> | |
15 #include <set> | |
16 | |
17 bool leftOf(const BED &a, const BED &b); | |
18 | |
19 | |
20 bool BedIntersect::processHits(BED &a, vector<BED> &hits) { | |
21 // how many overlaps are there b/w the bed and the set of hits? | |
22 int s, e, overlapBases; | |
23 int numOverlaps = 0; | |
24 bool hitsFound = false; | |
25 int aLength = (a.end - a.start); // the length of a in b.p. | |
26 | |
27 // loop through the hits and report those that meet the user's criteria | |
28 vector<BED>::const_iterator h = hits.begin(); | |
29 vector<BED>::const_iterator hitsEnd = hits.end(); | |
30 for (; h != hitsEnd; ++h) { | |
31 s = max(a.start, h->start); | |
32 e = min(a.end, h->end); | |
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
34 | |
35 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B. | |
38 if (_reciprocal == false) { | |
39 hitsFound = true; | |
40 numOverlaps++; | |
41 if (_printable == true) | |
42 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
43 } | |
44 // we require there to be sufficient __reciprocal__ overlap | |
45 else { | |
46 int bLength = (h->end - h->start); | |
47 float bOverlap = ( (float) overlapBases / (float) bLength ); | |
48 if (bOverlap >= _overlapFraction) { | |
49 hitsFound = true; | |
50 numOverlaps++; | |
51 if (_printable == true) | |
52 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
53 } | |
54 } | |
55 } | |
56 } | |
57 // report the summary of the overlaps if requested. | |
58 ReportOverlapSummary(a, numOverlaps); | |
59 // were hits found for this BED feature? | |
60 return hitsFound; | |
61 } | |
62 | |
63 /* | |
64 Constructor | |
65 */ | |
66 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, | |
67 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, | |
68 float overlapFraction, bool noHit, bool writeCount, bool forceStrand, | |
69 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { | |
70 | |
71 _bedAFile = bedAFile; | |
72 _bedBFile = bedBFile; | |
73 _anyHit = anyHit; | |
74 _noHit = noHit; | |
75 _writeA = writeA; | |
76 _writeB = writeB; | |
77 _writeOverlap = writeOverlap; | |
78 _writeAllOverlap = writeAllOverlap; | |
79 _writeCount = writeCount; | |
80 _overlapFraction = overlapFraction; | |
81 _forceStrand = forceStrand; | |
82 _reciprocal = reciprocal; | |
83 _obeySplits = obeySplits; | |
84 _bamInput = bamInput; | |
85 _bamOutput = bamOutput; | |
86 | |
87 if (_anyHit || _noHit || _writeCount) | |
88 _printable = false; | |
89 else | |
90 _printable = true; | |
91 | |
92 // create new BED file objects for A and B | |
93 _bedA = new BedFile(bedAFile); | |
94 _bedB = new BedFile(bedBFile); | |
95 | |
96 IntersectBed(); | |
97 } | |
98 | |
99 | |
100 /* | |
101 Destructor | |
102 */ | |
103 BedIntersect::~BedIntersect(void) { | |
104 } | |
105 | |
106 | |
107 bool leftOf(const BED &a, const BED &b) { | |
108 return (a.end <= b.start); | |
109 } | |
110 | |
111 | |
112 void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, | |
113 const CHRPOS &s, const CHRPOS &e) { | |
114 // default. simple intersection only | |
115 if (_writeA == false && _writeB == false && _writeOverlap == false) { | |
116 _bedA->reportBedRangeNewLine(a,s,e); | |
117 } | |
118 // -wa -wbwrite the original A and B | |
119 else if (_writeA == true && _writeB == true) { | |
120 _bedA->reportBedTab(a); | |
121 _bedB->reportBedNewLine(b); | |
122 } | |
123 // -wa write just the original A | |
124 else if (_writeA == true) { | |
125 _bedA->reportBedNewLine(a); | |
126 } | |
127 // -wb write the intersected portion of A and the original B | |
128 else if (_writeB == true) { | |
129 _bedA->reportBedRangeTab(a,s,e); | |
130 _bedB->reportBedNewLine(b); | |
131 } | |
132 // -wo write the original A and B plus the no. of overlapping bases. | |
133 else if (_writeOverlap == true) { | |
134 _bedA->reportBedTab(a); | |
135 _bedB->reportBedTab(b); | |
136 printf("%d\n", overlapBases); | |
137 } | |
138 } | |
139 | |
140 | |
141 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { | |
142 // -u just report the fact that there was >= 1 overlaps | |
143 if (_anyHit && (numOverlapsFound >= 1)) { | |
144 _bedA->reportBedNewLine(a); | |
145 } | |
146 // -c report the total number of features overlapped in B | |
147 else if (_writeCount) { | |
148 _bedA->reportBedTab(a); | |
149 printf("%d\n", numOverlapsFound); | |
150 } | |
151 // -v report iff there were no overlaps | |
152 else if (_noHit && (numOverlapsFound == 0)) { | |
153 _bedA->reportBedNewLine(a); | |
154 } | |
155 // -wao the user wants to force the reporting of 0 overlap | |
156 else if (_writeAllOverlap && (numOverlapsFound == 0)) { | |
157 _bedA->reportBedTab(a); | |
158 _bedB->reportNullBedTab(); | |
159 printf("0\n"); | |
160 } | |
161 } | |
162 | |
163 | |
164 | |
165 void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, | |
166 const BED &y, vector<BED *> *windowY, BedLineStatus yStatus) { | |
167 | |
168 if (xStatus != BED_VALID) { | |
169 return; | |
170 } | |
171 | |
172 std::vector<BED *>::iterator wYIter = windowY->begin(); | |
173 while (wYIter != windowY->end()) { | |
174 if (leftOf(*(*wYIter), *x) == true) { | |
175 (*wYIter)->finished = true; | |
176 wYIter = windowY->erase(wYIter); // erase auto-increments to the next position | |
177 } | |
178 else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) { | |
179 if (_lastPick == 0) { | |
180 AddHits(x, *(*wYIter)); | |
181 } | |
182 else { | |
183 AddHits(*wYIter, *x); | |
184 } | |
185 ++wYIter; // force incrementing | |
186 } | |
187 } | |
188 if (leftOf(*x,y) == false) | |
189 windowX->push_back(x); | |
190 else { | |
191 x->finished = true; | |
192 } | |
193 // dump the buffered results (if any) | |
194 FlushOutputBuffer(); | |
195 } | |
196 | |
197 | |
198 void BedIntersect::AddHits(BED *x, const BED &y) { | |
199 if (x->overlaps.empty() == true) | |
200 _outputBuffer.push(x); | |
201 x->overlaps.push_back(y); | |
202 } | |
203 | |
204 | |
205 void BedIntersect::FlushOutputBuffer(bool final) { | |
206 while (_outputBuffer.empty() == false) | |
207 { | |
208 if (final == false && _outputBuffer.front()->finished == false) | |
209 break; | |
210 | |
211 processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps); | |
212 // remove the finished BED entry from the heap | |
213 delete _outputBuffer.front(); | |
214 _outputBuffer.pop(); | |
215 } | |
216 } | |
217 | |
218 | |
219 vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) { | |
220 | |
221 // iterator to test if a window for a given chrom exists. | |
222 map<string, vector<BED*> >::iterator it; | |
223 | |
224 // grab the current window for A or B, depending on | |
225 // the request. if a window hasn't yet been created | |
226 // for the requested chrom, create one. | |
227 | |
228 if (isA) { | |
229 it = _windowA.find(chrom); | |
230 if (it != _windowA.end()) { | |
231 return & _windowA[chrom]; | |
232 } | |
233 else { | |
234 _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); | |
235 return & _windowA[chrom]; | |
236 } | |
237 } | |
238 else { | |
239 it = _windowB.find(chrom); | |
240 if (it != _windowB.end()) { | |
241 return & _windowB[chrom]; | |
242 } | |
243 else { | |
244 _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); | |
245 return & _windowB[chrom]; | |
246 } | |
247 } | |
248 } | |
249 | |
250 | |
251 void BedIntersect::ChromSwitch(const string &chrom) { | |
252 | |
253 vector<BED*>::iterator windowAIter = _windowA[chrom].begin(); | |
254 vector<BED*>::iterator windowAEnd = _windowA[chrom].end(); | |
255 for (; windowAIter != windowAEnd; ++windowAIter) | |
256 (*windowAIter)->finished = true; | |
257 | |
258 vector<BED*>::iterator windowBIter = _windowB[chrom].begin(); | |
259 vector<BED*>::iterator windowBEnd = _windowB[chrom].end(); | |
260 for (; windowBIter != windowBEnd; ++windowBIter) | |
261 (*windowBIter)->finished = true; | |
262 | |
263 FlushOutputBuffer(); | |
264 } | |
265 | |
266 | |
267 void BedIntersect::IntersectBed() { | |
268 | |
269 int aLineNum = 0; | |
270 int bLineNum = 0; | |
271 | |
272 // current feature from each file | |
273 BED *a, *b, *prevA, *prevB; | |
274 | |
275 // status of the current lines | |
276 BedLineStatus aStatus, bStatus; | |
277 | |
278 // open the files; get the first line from each | |
279 _bedA->Open(); | |
280 _bedB->Open(); | |
281 | |
282 prevA = NULL; | |
283 prevB = NULL; | |
284 a = new BED(); | |
285 b = new BED(); | |
286 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
287 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
288 | |
289 cout << a->chrom << " " << a->start << " " << a->chrom << " " << b->start << endl; | |
290 while (aStatus != BED_INVALID || bStatus != BED_INVALID) { | |
291 | |
292 if ((a->start <= b->start) && (a->chrom == b->chrom)) { | |
293 prevA = a; | |
294 _lastPick = 0; | |
295 Scan(a, GetWindow(a->chrom, true), aStatus, | |
296 *b, GetWindow(a->chrom, false), bStatus); | |
297 | |
298 a = new BED(); | |
299 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
300 } | |
301 else if ((a->start > b->start) && (a->chrom == b->chrom)) { | |
302 prevB = b; | |
303 _lastPick = 1; | |
304 Scan(b, GetWindow(b->chrom, false), bStatus, | |
305 *a, GetWindow(b->chrom, true), aStatus); | |
306 | |
307 b = new BED(); | |
308 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
309 } | |
310 else if (a->chrom != b->chrom) { | |
311 // A was most recently read | |
312 if (_lastPick == 0) { | |
313 prevB = b; | |
314 while (b->chrom == prevA->chrom){ | |
315 _windowB[prevA->chrom].push_back(b); | |
316 b = new BED(); | |
317 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
318 } | |
319 Scan(prevA, GetWindow(prevA->chrom, true), aStatus, | |
320 *prevB, GetWindow(prevA->chrom, false), bStatus); | |
321 } | |
322 // B was most recently read | |
323 else { | |
324 prevA = a; | |
325 while (a->chrom == prevB->chrom) { | |
326 _windowA[prevB->chrom].push_back(a); | |
327 a = new BED(); | |
328 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
329 } | |
330 Scan(prevB, GetWindow(prevB->chrom, false), bStatus, | |
331 *prevA, GetWindow(prevB->chrom, true), aStatus); | |
332 } | |
333 FlushOutputBuffer(true); | |
334 } | |
335 if (prevA!=NULL&&prevB!=NULL) | |
336 //cout << prevA->chrom << " " << a->chrom << " " << a->start << " " | |
337 // << prevB->chrom << " " << b->chrom << " " << b->start << "\n"; | |
338 if (aStatus == BED_INVALID) a->start = INT_MAX; | |
339 if (bStatus == BED_INVALID) b->start = INT_MAX; | |
340 } | |
341 | |
342 // clear out the final bit of staged output | |
343 FlushOutputBuffer(true); | |
344 | |
345 // close the files | |
346 _bedA->Close(); | |
347 _bedB->Close(); | |
348 } | |
349 | |
350 |