comparison BEDTools-Version-2.14.3/src/fjoin/fjoin.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 intersectBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "fjoin.h"
14 #include <queue>
15 #include <set>
16
17 bool leftOf(const BED &a, const BED &b);
18
19
20 bool BedIntersect::processHits(BED &a, vector<BED> &hits) {
21 // how many overlaps are there b/w the bed and the set of hits?
22 int s, e, overlapBases;
23 int numOverlaps = 0;
24 bool hitsFound = false;
25 int aLength = (a.end - a.start); // the length of a in b.p.
26
27 // loop through the hits and report those that meet the user's criteria
28 vector<BED>::const_iterator h = hits.begin();
29 vector<BED>::const_iterator hitsEnd = hits.end();
30 for (; h != hitsEnd; ++h) {
31 s = max(a.start, h->start);
32 e = min(a.end, h->end);
33 overlapBases = (e - s); // the number of overlapping bases b/w a and b
34
35 // is there enough overlap relative to the user's request? (default ~ 1bp)
36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) {
37 // Report the hit if the user doesn't care about reciprocal overlap between A and B.
38 if (_reciprocal == false) {
39 hitsFound = true;
40 numOverlaps++;
41 if (_printable == true)
42 ReportOverlapDetail(overlapBases, a, *h, s, e);
43 }
44 // we require there to be sufficient __reciprocal__ overlap
45 else {
46 int bLength = (h->end - h->start);
47 float bOverlap = ( (float) overlapBases / (float) bLength );
48 if (bOverlap >= _overlapFraction) {
49 hitsFound = true;
50 numOverlaps++;
51 if (_printable == true)
52 ReportOverlapDetail(overlapBases, a, *h, s, e);
53 }
54 }
55 }
56 }
57 // report the summary of the overlaps if requested.
58 ReportOverlapSummary(a, numOverlaps);
59 // were hits found for this BED feature?
60 return hitsFound;
61 }
62
63 /*
64 Constructor
65 */
66 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit,
67 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap,
68 float overlapFraction, bool noHit, bool writeCount, bool forceStrand,
69 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) {
70
71 _bedAFile = bedAFile;
72 _bedBFile = bedBFile;
73 _anyHit = anyHit;
74 _noHit = noHit;
75 _writeA = writeA;
76 _writeB = writeB;
77 _writeOverlap = writeOverlap;
78 _writeAllOverlap = writeAllOverlap;
79 _writeCount = writeCount;
80 _overlapFraction = overlapFraction;
81 _forceStrand = forceStrand;
82 _reciprocal = reciprocal;
83 _obeySplits = obeySplits;
84 _bamInput = bamInput;
85 _bamOutput = bamOutput;
86
87 if (_anyHit || _noHit || _writeCount)
88 _printable = false;
89 else
90 _printable = true;
91
92 // create new BED file objects for A and B
93 _bedA = new BedFile(bedAFile);
94 _bedB = new BedFile(bedBFile);
95
96 IntersectBed();
97 }
98
99
100 /*
101 Destructor
102 */
103 BedIntersect::~BedIntersect(void) {
104 }
105
106
107 bool leftOf(const BED &a, const BED &b) {
108 return (a.end <= b.start);
109 }
110
111
112 void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b,
113 const CHRPOS &s, const CHRPOS &e) {
114 // default. simple intersection only
115 if (_writeA == false && _writeB == false && _writeOverlap == false) {
116 _bedA->reportBedRangeNewLine(a,s,e);
117 }
118 // -wa -wbwrite the original A and B
119 else if (_writeA == true && _writeB == true) {
120 _bedA->reportBedTab(a);
121 _bedB->reportBedNewLine(b);
122 }
123 // -wa write just the original A
124 else if (_writeA == true) {
125 _bedA->reportBedNewLine(a);
126 }
127 // -wb write the intersected portion of A and the original B
128 else if (_writeB == true) {
129 _bedA->reportBedRangeTab(a,s,e);
130 _bedB->reportBedNewLine(b);
131 }
132 // -wo write the original A and B plus the no. of overlapping bases.
133 else if (_writeOverlap == true) {
134 _bedA->reportBedTab(a);
135 _bedB->reportBedTab(b);
136 printf("%d\n", overlapBases);
137 }
138 }
139
140
141 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) {
142 // -u just report the fact that there was >= 1 overlaps
143 if (_anyHit && (numOverlapsFound >= 1)) {
144 _bedA->reportBedNewLine(a);
145 }
146 // -c report the total number of features overlapped in B
147 else if (_writeCount) {
148 _bedA->reportBedTab(a);
149 printf("%d\n", numOverlapsFound);
150 }
151 // -v report iff there were no overlaps
152 else if (_noHit && (numOverlapsFound == 0)) {
153 _bedA->reportBedNewLine(a);
154 }
155 // -wao the user wants to force the reporting of 0 overlap
156 else if (_writeAllOverlap && (numOverlapsFound == 0)) {
157 _bedA->reportBedTab(a);
158 _bedB->reportNullBedTab();
159 printf("0\n");
160 }
161 }
162
163
164
165 void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus,
166 const BED &y, vector<BED *> *windowY, BedLineStatus yStatus) {
167
168 if (xStatus != BED_VALID) {
169 return;
170 }
171
172 std::vector<BED *>::iterator wYIter = windowY->begin();
173 while (wYIter != windowY->end()) {
174 if (leftOf(*(*wYIter), *x) == true) {
175 (*wYIter)->finished = true;
176 wYIter = windowY->erase(wYIter); // erase auto-increments to the next position
177 }
178 else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) {
179 if (_lastPick == 0) {
180 AddHits(x, *(*wYIter));
181 }
182 else {
183 AddHits(*wYIter, *x);
184 }
185 ++wYIter; // force incrementing
186 }
187 }
188 if (leftOf(*x,y) == false)
189 windowX->push_back(x);
190 else {
191 x->finished = true;
192 }
193 // dump the buffered results (if any)
194 FlushOutputBuffer();
195 }
196
197
198 void BedIntersect::AddHits(BED *x, const BED &y) {
199 if (x->overlaps.empty() == true)
200 _outputBuffer.push(x);
201 x->overlaps.push_back(y);
202 }
203
204
205 void BedIntersect::FlushOutputBuffer(bool final) {
206 while (_outputBuffer.empty() == false)
207 {
208 if (final == false && _outputBuffer.front()->finished == false)
209 break;
210
211 processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps);
212 // remove the finished BED entry from the heap
213 delete _outputBuffer.front();
214 _outputBuffer.pop();
215 }
216 }
217
218
219 vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) {
220
221 // iterator to test if a window for a given chrom exists.
222 map<string, vector<BED*> >::iterator it;
223
224 // grab the current window for A or B, depending on
225 // the request. if a window hasn't yet been created
226 // for the requested chrom, create one.
227
228 if (isA) {
229 it = _windowA.find(chrom);
230 if (it != _windowA.end()) {
231 return & _windowA[chrom];
232 }
233 else {
234 _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>()));
235 return & _windowA[chrom];
236 }
237 }
238 else {
239 it = _windowB.find(chrom);
240 if (it != _windowB.end()) {
241 return & _windowB[chrom];
242 }
243 else {
244 _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>()));
245 return & _windowB[chrom];
246 }
247 }
248 }
249
250
251 void BedIntersect::ChromSwitch(const string &chrom) {
252
253 vector<BED*>::iterator windowAIter = _windowA[chrom].begin();
254 vector<BED*>::iterator windowAEnd = _windowA[chrom].end();
255 for (; windowAIter != windowAEnd; ++windowAIter)
256 (*windowAIter)->finished = true;
257
258 vector<BED*>::iterator windowBIter = _windowB[chrom].begin();
259 vector<BED*>::iterator windowBEnd = _windowB[chrom].end();
260 for (; windowBIter != windowBEnd; ++windowBIter)
261 (*windowBIter)->finished = true;
262
263 FlushOutputBuffer();
264 }
265
266
267 void BedIntersect::IntersectBed() {
268
269 int aLineNum = 0;
270 int bLineNum = 0;
271
272 // current feature from each file
273 BED *a, *b, *prevA, *prevB;
274
275 // status of the current lines
276 BedLineStatus aStatus, bStatus;
277
278 // open the files; get the first line from each
279 _bedA->Open();
280 _bedB->Open();
281
282 prevA = NULL;
283 prevB = NULL;
284 a = new BED();
285 b = new BED();
286 aStatus = _bedA->GetNextBed(*a, aLineNum);
287 bStatus = _bedB->GetNextBed(*b, bLineNum);
288
289 cout << a->chrom << " " << a->start << " " << a->chrom << " " << b->start << endl;
290 while (aStatus != BED_INVALID || bStatus != BED_INVALID) {
291
292 if ((a->start <= b->start) && (a->chrom == b->chrom)) {
293 prevA = a;
294 _lastPick = 0;
295 Scan(a, GetWindow(a->chrom, true), aStatus,
296 *b, GetWindow(a->chrom, false), bStatus);
297
298 a = new BED();
299 aStatus = _bedA->GetNextBed(*a, aLineNum);
300 }
301 else if ((a->start > b->start) && (a->chrom == b->chrom)) {
302 prevB = b;
303 _lastPick = 1;
304 Scan(b, GetWindow(b->chrom, false), bStatus,
305 *a, GetWindow(b->chrom, true), aStatus);
306
307 b = new BED();
308 bStatus = _bedB->GetNextBed(*b, bLineNum);
309 }
310 else if (a->chrom != b->chrom) {
311 // A was most recently read
312 if (_lastPick == 0) {
313 prevB = b;
314 while (b->chrom == prevA->chrom){
315 _windowB[prevA->chrom].push_back(b);
316 b = new BED();
317 bStatus = _bedB->GetNextBed(*b, bLineNum);
318 }
319 Scan(prevA, GetWindow(prevA->chrom, true), aStatus,
320 *prevB, GetWindow(prevA->chrom, false), bStatus);
321 }
322 // B was most recently read
323 else {
324 prevA = a;
325 while (a->chrom == prevB->chrom) {
326 _windowA[prevB->chrom].push_back(a);
327 a = new BED();
328 aStatus = _bedA->GetNextBed(*a, aLineNum);
329 }
330 Scan(prevB, GetWindow(prevB->chrom, false), bStatus,
331 *prevA, GetWindow(prevB->chrom, true), aStatus);
332 }
333 FlushOutputBuffer(true);
334 }
335 if (prevA!=NULL&&prevB!=NULL)
336 //cout << prevA->chrom << " " << a->chrom << " " << a->start << " "
337 // << prevB->chrom << " " << b->chrom << " " << b->start << "\n";
338 if (aStatus == BED_INVALID) a->start = INT_MAX;
339 if (bStatus == BED_INVALID) b->start = INT_MAX;
340 }
341
342 // clear out the final bit of staged output
343 FlushOutputBuffer(true);
344
345 // close the files
346 _bedA->Close();
347 _bedB->Close();
348 }
349
350