0
|
1 /*****************************************************************************
|
|
2 intersectBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "fjoin.h"
|
|
14 #include <queue>
|
|
15 #include <set>
|
|
16
|
|
17 bool leftOf(const BED &a, const BED &b);
|
|
18
|
|
19
|
|
20 bool BedIntersect::processHits(BED &a, vector<BED> &hits) {
|
|
21 // how many overlaps are there b/w the bed and the set of hits?
|
|
22 int s, e, overlapBases;
|
|
23 int numOverlaps = 0;
|
|
24 bool hitsFound = false;
|
|
25 int aLength = (a.end - a.start); // the length of a in b.p.
|
|
26
|
|
27 // loop through the hits and report those that meet the user's criteria
|
|
28 vector<BED>::const_iterator h = hits.begin();
|
|
29 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
30 for (; h != hitsEnd; ++h) {
|
|
31 s = max(a.start, h->start);
|
|
32 e = min(a.end, h->end);
|
|
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
34
|
|
35 // is there enough overlap relative to the user's request? (default ~ 1bp)
|
|
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
|
|
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B.
|
|
38 if (_reciprocal == false) {
|
|
39 hitsFound = true;
|
|
40 numOverlaps++;
|
|
41 if (_printable == true)
|
|
42 ReportOverlapDetail(overlapBases, a, *h, s, e);
|
|
43 }
|
|
44 // we require there to be sufficient __reciprocal__ overlap
|
|
45 else {
|
|
46 int bLength = (h->end - h->start);
|
|
47 float bOverlap = ( (float) overlapBases / (float) bLength );
|
|
48 if (bOverlap >= _overlapFraction) {
|
|
49 hitsFound = true;
|
|
50 numOverlaps++;
|
|
51 if (_printable == true)
|
|
52 ReportOverlapDetail(overlapBases, a, *h, s, e);
|
|
53 }
|
|
54 }
|
|
55 }
|
|
56 }
|
|
57 // report the summary of the overlaps if requested.
|
|
58 ReportOverlapSummary(a, numOverlaps);
|
|
59 // were hits found for this BED feature?
|
|
60 return hitsFound;
|
|
61 }
|
|
62
|
|
63 /*
|
|
64 Constructor
|
|
65 */
|
|
66 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
|
|
67 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
|
|
68 float overlapFraction, bool noHit, bool writeCount, bool forceStrand,
|
|
69 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) {
|
|
70
|
|
71 _bedAFile = bedAFile;
|
|
72 _bedBFile = bedBFile;
|
|
73 _anyHit = anyHit;
|
|
74 _noHit = noHit;
|
|
75 _writeA = writeA;
|
|
76 _writeB = writeB;
|
|
77 _writeOverlap = writeOverlap;
|
|
78 _writeAllOverlap = writeAllOverlap;
|
|
79 _writeCount = writeCount;
|
|
80 _overlapFraction = overlapFraction;
|
|
81 _forceStrand = forceStrand;
|
|
82 _reciprocal = reciprocal;
|
|
83 _obeySplits = obeySplits;
|
|
84 _bamInput = bamInput;
|
|
85 _bamOutput = bamOutput;
|
|
86
|
|
87 if (_anyHit || _noHit || _writeCount)
|
|
88 _printable = false;
|
|
89 else
|
|
90 _printable = true;
|
|
91
|
|
92 // create new BED file objects for A and B
|
|
93 _bedA = new BedFile(bedAFile);
|
|
94 _bedB = new BedFile(bedBFile);
|
|
95
|
|
96 IntersectBed();
|
|
97 }
|
|
98
|
|
99
|
|
100 /*
|
|
101 Destructor
|
|
102 */
|
|
103 BedIntersect::~BedIntersect(void) {
|
|
104 }
|
|
105
|
|
106
|
|
107 bool leftOf(const BED &a, const BED &b) {
|
|
108 return (a.end <= b.start);
|
|
109 }
|
|
110
|
|
111
|
|
112 void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
|
|
113 const CHRPOS &s, const CHRPOS &e) {
|
|
114 // default. simple intersection only
|
|
115 if (_writeA == false && _writeB == false && _writeOverlap == false) {
|
|
116 _bedA->reportBedRangeNewLine(a,s,e);
|
|
117 }
|
|
118 // -wa -wbwrite the original A and B
|
|
119 else if (_writeA == true && _writeB == true) {
|
|
120 _bedA->reportBedTab(a);
|
|
121 _bedB->reportBedNewLine(b);
|
|
122 }
|
|
123 // -wa write just the original A
|
|
124 else if (_writeA == true) {
|
|
125 _bedA->reportBedNewLine(a);
|
|
126 }
|
|
127 // -wb write the intersected portion of A and the original B
|
|
128 else if (_writeB == true) {
|
|
129 _bedA->reportBedRangeTab(a,s,e);
|
|
130 _bedB->reportBedNewLine(b);
|
|
131 }
|
|
132 // -wo write the original A and B plus the no. of overlapping bases.
|
|
133 else if (_writeOverlap == true) {
|
|
134 _bedA->reportBedTab(a);
|
|
135 _bedB->reportBedTab(b);
|
|
136 printf("%d\n", overlapBases);
|
|
137 }
|
|
138 }
|
|
139
|
|
140
|
|
141 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
|
|
142 // -u just report the fact that there was >= 1 overlaps
|
|
143 if (_anyHit && (numOverlapsFound >= 1)) {
|
|
144 _bedA->reportBedNewLine(a);
|
|
145 }
|
|
146 // -c report the total number of features overlapped in B
|
|
147 else if (_writeCount) {
|
|
148 _bedA->reportBedTab(a);
|
|
149 printf("%d\n", numOverlapsFound);
|
|
150 }
|
|
151 // -v report iff there were no overlaps
|
|
152 else if (_noHit && (numOverlapsFound == 0)) {
|
|
153 _bedA->reportBedNewLine(a);
|
|
154 }
|
|
155 // -wao the user wants to force the reporting of 0 overlap
|
|
156 else if (_writeAllOverlap && (numOverlapsFound == 0)) {
|
|
157 _bedA->reportBedTab(a);
|
|
158 _bedB->reportNullBedTab();
|
|
159 printf("0\n");
|
|
160 }
|
|
161 }
|
|
162
|
|
163
|
|
164
|
|
165 void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus,
|
|
166 const BED &y, vector<BED *> *windowY, BedLineStatus yStatus) {
|
|
167
|
|
168 if (xStatus != BED_VALID) {
|
|
169 return;
|
|
170 }
|
|
171
|
|
172 std::vector<BED *>::iterator wYIter = windowY->begin();
|
|
173 while (wYIter != windowY->end()) {
|
|
174 if (leftOf(*(*wYIter), *x) == true) {
|
|
175 (*wYIter)->finished = true;
|
|
176 wYIter = windowY->erase(wYIter); // erase auto-increments to the next position
|
|
177 }
|
|
178 else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) {
|
|
179 if (_lastPick == 0) {
|
|
180 AddHits(x, *(*wYIter));
|
|
181 }
|
|
182 else {
|
|
183 AddHits(*wYIter, *x);
|
|
184 }
|
|
185 ++wYIter; // force incrementing
|
|
186 }
|
|
187 }
|
|
188 if (leftOf(*x,y) == false)
|
|
189 windowX->push_back(x);
|
|
190 else {
|
|
191 x->finished = true;
|
|
192 }
|
|
193 // dump the buffered results (if any)
|
|
194 FlushOutputBuffer();
|
|
195 }
|
|
196
|
|
197
|
|
198 void BedIntersect::AddHits(BED *x, const BED &y) {
|
|
199 if (x->overlaps.empty() == true)
|
|
200 _outputBuffer.push(x);
|
|
201 x->overlaps.push_back(y);
|
|
202 }
|
|
203
|
|
204
|
|
205 void BedIntersect::FlushOutputBuffer(bool final) {
|
|
206 while (_outputBuffer.empty() == false)
|
|
207 {
|
|
208 if (final == false && _outputBuffer.front()->finished == false)
|
|
209 break;
|
|
210
|
|
211 processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps);
|
|
212 // remove the finished BED entry from the heap
|
|
213 delete _outputBuffer.front();
|
|
214 _outputBuffer.pop();
|
|
215 }
|
|
216 }
|
|
217
|
|
218
|
|
219 vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) {
|
|
220
|
|
221 // iterator to test if a window for a given chrom exists.
|
|
222 map<string, vector<BED*> >::iterator it;
|
|
223
|
|
224 // grab the current window for A or B, depending on
|
|
225 // the request. if a window hasn't yet been created
|
|
226 // for the requested chrom, create one.
|
|
227
|
|
228 if (isA) {
|
|
229 it = _windowA.find(chrom);
|
|
230 if (it != _windowA.end()) {
|
|
231 return & _windowA[chrom];
|
|
232 }
|
|
233 else {
|
|
234 _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>()));
|
|
235 return & _windowA[chrom];
|
|
236 }
|
|
237 }
|
|
238 else {
|
|
239 it = _windowB.find(chrom);
|
|
240 if (it != _windowB.end()) {
|
|
241 return & _windowB[chrom];
|
|
242 }
|
|
243 else {
|
|
244 _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>()));
|
|
245 return & _windowB[chrom];
|
|
246 }
|
|
247 }
|
|
248 }
|
|
249
|
|
250
|
|
251 void BedIntersect::ChromSwitch(const string &chrom) {
|
|
252
|
|
253 vector<BED*>::iterator windowAIter = _windowA[chrom].begin();
|
|
254 vector<BED*>::iterator windowAEnd = _windowA[chrom].end();
|
|
255 for (; windowAIter != windowAEnd; ++windowAIter)
|
|
256 (*windowAIter)->finished = true;
|
|
257
|
|
258 vector<BED*>::iterator windowBIter = _windowB[chrom].begin();
|
|
259 vector<BED*>::iterator windowBEnd = _windowB[chrom].end();
|
|
260 for (; windowBIter != windowBEnd; ++windowBIter)
|
|
261 (*windowBIter)->finished = true;
|
|
262
|
|
263 FlushOutputBuffer();
|
|
264 }
|
|
265
|
|
266
|
|
267 void BedIntersect::IntersectBed() {
|
|
268
|
|
269 int aLineNum = 0;
|
|
270 int bLineNum = 0;
|
|
271
|
|
272 // current feature from each file
|
|
273 BED *a, *b, *prevA, *prevB;
|
|
274
|
|
275 // status of the current lines
|
|
276 BedLineStatus aStatus, bStatus;
|
|
277
|
|
278 // open the files; get the first line from each
|
|
279 _bedA->Open();
|
|
280 _bedB->Open();
|
|
281
|
|
282 prevA = NULL;
|
|
283 prevB = NULL;
|
|
284 a = new BED();
|
|
285 b = new BED();
|
|
286 aStatus = _bedA->GetNextBed(*a, aLineNum);
|
|
287 bStatus = _bedB->GetNextBed(*b, bLineNum);
|
|
288
|
|
289 cout << a->chrom << " " << a->start << " " << a->chrom << " " << b->start << endl;
|
|
290 while (aStatus != BED_INVALID || bStatus != BED_INVALID) {
|
|
291
|
|
292 if ((a->start <= b->start) && (a->chrom == b->chrom)) {
|
|
293 prevA = a;
|
|
294 _lastPick = 0;
|
|
295 Scan(a, GetWindow(a->chrom, true), aStatus,
|
|
296 *b, GetWindow(a->chrom, false), bStatus);
|
|
297
|
|
298 a = new BED();
|
|
299 aStatus = _bedA->GetNextBed(*a, aLineNum);
|
|
300 }
|
|
301 else if ((a->start > b->start) && (a->chrom == b->chrom)) {
|
|
302 prevB = b;
|
|
303 _lastPick = 1;
|
|
304 Scan(b, GetWindow(b->chrom, false), bStatus,
|
|
305 *a, GetWindow(b->chrom, true), aStatus);
|
|
306
|
|
307 b = new BED();
|
|
308 bStatus = _bedB->GetNextBed(*b, bLineNum);
|
|
309 }
|
|
310 else if (a->chrom != b->chrom) {
|
|
311 // A was most recently read
|
|
312 if (_lastPick == 0) {
|
|
313 prevB = b;
|
|
314 while (b->chrom == prevA->chrom){
|
|
315 _windowB[prevA->chrom].push_back(b);
|
|
316 b = new BED();
|
|
317 bStatus = _bedB->GetNextBed(*b, bLineNum);
|
|
318 }
|
|
319 Scan(prevA, GetWindow(prevA->chrom, true), aStatus,
|
|
320 *prevB, GetWindow(prevA->chrom, false), bStatus);
|
|
321 }
|
|
322 // B was most recently read
|
|
323 else {
|
|
324 prevA = a;
|
|
325 while (a->chrom == prevB->chrom) {
|
|
326 _windowA[prevB->chrom].push_back(a);
|
|
327 a = new BED();
|
|
328 aStatus = _bedA->GetNextBed(*a, aLineNum);
|
|
329 }
|
|
330 Scan(prevB, GetWindow(prevB->chrom, false), bStatus,
|
|
331 *prevA, GetWindow(prevB->chrom, true), aStatus);
|
|
332 }
|
|
333 FlushOutputBuffer(true);
|
|
334 }
|
|
335 if (prevA!=NULL&&prevB!=NULL)
|
|
336 //cout << prevA->chrom << " " << a->chrom << " " << a->start << " "
|
|
337 // << prevB->chrom << " " << b->chrom << " " << b->start << "\n";
|
|
338 if (aStatus == BED_INVALID) a->start = INT_MAX;
|
|
339 if (bStatus == BED_INVALID) b->start = INT_MAX;
|
|
340 }
|
|
341
|
|
342 // clear out the final bit of staged output
|
|
343 FlushOutputBuffer(true);
|
|
344
|
|
345 // close the files
|
|
346 _bedA->Close();
|
|
347 _bedB->Close();
|
|
348 }
|
|
349
|
|
350
|