Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/utils/bedFilePE/bedFilePE.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 // | |
2 // bedFilePE.cpp | |
3 // BEDTools | |
4 // | |
5 // Created by Aaron Quinlan Spring 2009. | |
6 // Copyright 2009 Aaron Quinlan. All rights reserved. | |
7 // | |
8 // Summary: Contains common functions for finding BED overlaps. | |
9 // | |
10 // Acknowledgments: Much of the code herein is taken from Jim Kent's | |
11 // BED processing code. I am grateful for his elegant | |
12 // genome binning algorithm and therefore use it extensively. | |
13 | |
14 | |
15 #include "bedFilePE.h" | |
16 | |
17 | |
18 // Constructor | |
19 BedFilePE::BedFilePE(string &bedFile) { | |
20 this->bedFile = bedFile; | |
21 } | |
22 | |
23 // Destructor | |
24 BedFilePE::~BedFilePE(void) { | |
25 } | |
26 | |
27 void BedFilePE::Open(void) { | |
28 if (bedFile == "stdin" || bedFile == "-") { | |
29 _bedStream = &cin; | |
30 } | |
31 else { | |
32 _bedStream = new ifstream(bedFile.c_str(), ios::in); | |
33 | |
34 if (isGzipFile(_bedStream) == true) { | |
35 delete _bedStream; | |
36 _bedStream = new igzstream(bedFile.c_str(), ios::in); | |
37 } | |
38 // can we open the file? | |
39 if ( !(_bedStream->good()) ) { | |
40 cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl; | |
41 exit (1); | |
42 } | |
43 } | |
44 } | |
45 | |
46 | |
47 | |
48 // Close the BEDPE file | |
49 void BedFilePE::Close(void) { | |
50 if (bedFile != "stdin" && bedFile != "-") delete _bedStream; | |
51 } | |
52 | |
53 | |
54 BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) { | |
55 | |
56 // make sure there are still lines to process. | |
57 // if so, tokenize, validate and return the BEDPE entry. | |
58 if (_bedStream->good()) { | |
59 string bedPELine; | |
60 vector<string> bedPEFields; | |
61 bedPEFields.reserve(10); | |
62 | |
63 // parse the bedStream pointer | |
64 getline(*_bedStream, bedPELine); | |
65 lineNum++; | |
66 | |
67 // split into a string vector. | |
68 Tokenize(bedPELine,bedPEFields); | |
69 | |
70 // load the BEDPE struct as long as it's a valid BEDPE entry. | |
71 return parseLine(bedpe, bedPEFields, lineNum); | |
72 } | |
73 // default if file is closed or EOF | |
74 return BED_INVALID; | |
75 } | |
76 | |
77 | |
78 /* | |
79 reportBedPETab | |
80 | |
81 Writes the _original_ BED entry for A. | |
82 Works for BEDPE only. | |
83 */ | |
84 void BedFilePE::reportBedPETab(const BEDPE &a) { | |
85 | |
86 if (this->bedType == 6) { | |
87 printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1, | |
88 a.chrom2.c_str(), a.start2, a.end2); | |
89 } | |
90 else if (this->bedType == 7) { | |
91 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1, | |
92 a.chrom2.c_str(), a.start2, a.end2, | |
93 a.name.c_str()); | |
94 } | |
95 else if (this->bedType == 8) { | |
96 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, | |
97 a.chrom2.c_str(), a.start2, a.end2, | |
98 a.name.c_str(), a.score.c_str()); | |
99 } | |
100 else if (this->bedType == 10) { | |
101 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1, | |
102 a.chrom2.c_str(), a.start2, a.end2, | |
103 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); | |
104 } | |
105 else if (this->bedType > 10) { | |
106 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, | |
107 a.chrom2.c_str(), a.start2, a.end2, | |
108 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); | |
109 | |
110 vector<string>::const_iterator othIt = a.otherFields.begin(); | |
111 vector<string>::const_iterator othEnd = a.otherFields.end(); | |
112 for ( ; othIt != othEnd; ++othIt) { | |
113 printf("\t%s", othIt->c_str()); | |
114 } | |
115 printf("\t"); | |
116 } | |
117 } | |
118 | |
119 | |
120 | |
121 /* | |
122 reportBedPENewLine | |
123 | |
124 Writes the _original_ BED entry for A. | |
125 Works for BEDPE only. | |
126 */ | |
127 void BedFilePE::reportBedPENewLine(const BEDPE &a) { | |
128 | |
129 if (this->bedType == 6) { | |
130 printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1, | |
131 a.chrom2.c_str(), a.start2, a.end2); | |
132 } | |
133 else if (this->bedType == 7) { | |
134 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1, | |
135 a.chrom2.c_str(), a.start2, a.end2, | |
136 a.name.c_str()); | |
137 } | |
138 else if (this->bedType == 8) { | |
139 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, | |
140 a.chrom2.c_str(), a.start2, a.end2, | |
141 a.name.c_str(), a.score.c_str()); | |
142 } | |
143 else if (this->bedType == 10) { | |
144 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1, | |
145 a.chrom2.c_str(), a.start2, a.end2, | |
146 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); | |
147 } | |
148 else if (this->bedType > 10) { | |
149 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1, | |
150 a.chrom2.c_str(), a.start2, a.end2, | |
151 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str()); | |
152 | |
153 vector<string>::const_iterator othIt = a.otherFields.begin(); | |
154 vector<string>::const_iterator othEnd = a.otherFields.end(); | |
155 for ( ; othIt != othEnd; ++othIt) { | |
156 printf("\t%s", othIt->c_str()); | |
157 } | |
158 printf("\n"); | |
159 } | |
160 } | |
161 | |
162 | |
163 BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) { | |
164 | |
165 // bail out if we have a blank line | |
166 if (lineVector.empty()) | |
167 return BED_BLANK; | |
168 | |
169 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) { | |
170 // we need at least 6 columns | |
171 if (lineVector.size() >= 6) { | |
172 if (parseBedPELine(bedpe, lineVector, lineNum) == true) | |
173 return BED_VALID; | |
174 else return BED_INVALID; | |
175 } | |
176 else { | |
177 cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl; | |
178 exit(1); | |
179 } | |
180 } | |
181 else { | |
182 lineNum--; | |
183 return BED_HEADER; | |
184 } | |
185 | |
186 // default | |
187 return BED_INVALID; | |
188 } | |
189 | |
190 | |
191 bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) { | |
192 | |
193 if ((lineNum == 1) && (lineVector.size() >= 6)) { | |
194 | |
195 this->bedType = lineVector.size(); | |
196 | |
197 if (this->bedType == 6) { | |
198 bed.chrom1 = lineVector[0]; | |
199 bed.start1 = atoi(lineVector[1].c_str()); | |
200 bed.end1 = atoi(lineVector[2].c_str()); | |
201 | |
202 bed.chrom2 = lineVector[3]; | |
203 bed.start2 = atoi(lineVector[4].c_str()); | |
204 bed.end2 = atoi(lineVector[5].c_str()); | |
205 | |
206 return true; | |
207 } | |
208 else if (this->bedType == 7) { | |
209 bed.chrom1 = lineVector[0]; | |
210 bed.start1 = atoi(lineVector[1].c_str()); | |
211 bed.end1 = atoi(lineVector[2].c_str()); | |
212 | |
213 bed.chrom2 = lineVector[3]; | |
214 bed.start2 = atoi(lineVector[4].c_str()); | |
215 bed.end2 = atoi(lineVector[5].c_str()); | |
216 | |
217 bed.name = lineVector[6]; | |
218 return true; | |
219 } | |
220 else if (this->bedType == 8) { | |
221 bed.chrom1 = lineVector[0]; | |
222 bed.start1 = atoi(lineVector[1].c_str()); | |
223 bed.end1 = atoi(lineVector[2].c_str()); | |
224 | |
225 bed.chrom2 = lineVector[3]; | |
226 bed.start2 = atoi(lineVector[4].c_str()); | |
227 bed.end2 = atoi(lineVector[5].c_str()); | |
228 | |
229 bed.name = lineVector[6]; | |
230 bed.score = lineVector[7].c_str(); | |
231 return true; | |
232 } | |
233 else if (this->bedType == 10) { | |
234 bed.chrom1 = lineVector[0]; | |
235 bed.start1 = atoi(lineVector[1].c_str()); | |
236 bed.end1 = atoi(lineVector[2].c_str()); | |
237 | |
238 bed.chrom2 = lineVector[3]; | |
239 bed.start2 = atoi(lineVector[4].c_str()); | |
240 bed.end2 = atoi(lineVector[5].c_str()); | |
241 | |
242 bed.name = lineVector[6]; | |
243 bed.score = lineVector[7].c_str(); | |
244 | |
245 bed.strand1 = lineVector[8]; | |
246 bed.strand2 = lineVector[9]; | |
247 | |
248 return true; | |
249 } | |
250 else if (this->bedType > 10) { | |
251 bed.chrom1 = lineVector[0]; | |
252 bed.start1 = atoi(lineVector[1].c_str()); | |
253 bed.end1 = atoi(lineVector[2].c_str()); | |
254 | |
255 bed.chrom2 = lineVector[3]; | |
256 bed.start2 = atoi(lineVector[4].c_str()); | |
257 bed.end2 = atoi(lineVector[5].c_str()); | |
258 | |
259 bed.name = lineVector[6]; | |
260 bed.score = lineVector[7].c_str(); | |
261 | |
262 bed.strand1 = lineVector[8]; | |
263 bed.strand2 = lineVector[9]; | |
264 | |
265 for (unsigned int i = 10; i < lineVector.size(); ++i) { | |
266 bed.otherFields.push_back(lineVector[i]); | |
267 } | |
268 return true; | |
269 } | |
270 else { | |
271 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; | |
272 exit(1); | |
273 } | |
274 | |
275 if (bed.start1 > bed.end1) { | |
276 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; | |
277 return false; | |
278 } | |
279 else if (bed.start2 > bed.end2) { | |
280 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; | |
281 return false; | |
282 } | |
283 else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { | |
284 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; | |
285 return false; | |
286 } | |
287 } | |
288 else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) { | |
289 | |
290 if (this->bedType == 6) { | |
291 bed.chrom1 = lineVector[0]; | |
292 bed.start1 = atoi(lineVector[1].c_str()); | |
293 bed.end1 = atoi(lineVector[2].c_str()); | |
294 | |
295 bed.chrom2 = lineVector[3]; | |
296 bed.start2 = atoi(lineVector[4].c_str()); | |
297 bed.end2 = atoi(lineVector[5].c_str()); | |
298 | |
299 return true; | |
300 } | |
301 else if (this->bedType == 7) { | |
302 bed.chrom1 = lineVector[0]; | |
303 bed.start1 = atoi(lineVector[1].c_str()); | |
304 bed.end1 = atoi(lineVector[2].c_str()); | |
305 | |
306 bed.chrom2 = lineVector[3]; | |
307 bed.start2 = atoi(lineVector[4].c_str()); | |
308 bed.end2 = atoi(lineVector[5].c_str()); | |
309 | |
310 bed.name = lineVector[6]; | |
311 return true; | |
312 } | |
313 else if (this->bedType == 8) { | |
314 bed.chrom1 = lineVector[0]; | |
315 bed.start1 = atoi(lineVector[1].c_str()); | |
316 bed.end1 = atoi(lineVector[2].c_str()); | |
317 | |
318 bed.chrom2 = lineVector[3]; | |
319 bed.start2 = atoi(lineVector[4].c_str()); | |
320 bed.end2 = atoi(lineVector[5].c_str()); | |
321 | |
322 bed.name = lineVector[6]; | |
323 bed.score = lineVector[7].c_str(); | |
324 return true; | |
325 } | |
326 else if (this->bedType == 10) { | |
327 bed.chrom1 = lineVector[0]; | |
328 bed.start1 = atoi(lineVector[1].c_str()); | |
329 bed.end1 = atoi(lineVector[2].c_str()); | |
330 | |
331 bed.chrom2 = lineVector[3]; | |
332 bed.start2 = atoi(lineVector[4].c_str()); | |
333 bed.end2 = atoi(lineVector[5].c_str()); | |
334 | |
335 bed.name = lineVector[6]; | |
336 bed.score = lineVector[7].c_str(); | |
337 | |
338 bed.strand1 = lineVector[8]; | |
339 bed.strand2 = lineVector[9]; | |
340 | |
341 return true; | |
342 } | |
343 else if (this->bedType > 10) { | |
344 bed.chrom1 = lineVector[0]; | |
345 bed.start1 = atoi(lineVector[1].c_str()); | |
346 bed.end1 = atoi(lineVector[2].c_str()); | |
347 | |
348 bed.chrom2 = lineVector[3]; | |
349 bed.start2 = atoi(lineVector[4].c_str()); | |
350 bed.end2 = atoi(lineVector[5].c_str()); | |
351 | |
352 bed.name = lineVector[6]; | |
353 bed.score = lineVector[7].c_str(); | |
354 | |
355 bed.strand1 = lineVector[8]; | |
356 bed.strand2 = lineVector[9]; | |
357 | |
358 for (unsigned int i = 10; i < lineVector.size(); ++i) { | |
359 bed.otherFields.push_back(lineVector[i]); | |
360 } | |
361 return true; | |
362 } | |
363 else { | |
364 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl; | |
365 exit(1); | |
366 } | |
367 | |
368 if (bed.start1 > bed.end1) { | |
369 cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl; | |
370 return false; | |
371 } | |
372 else if (bed.start2 > bed.end2) { | |
373 cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl; | |
374 return false; | |
375 } | |
376 else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) { | |
377 cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl; | |
378 return false; | |
379 } | |
380 } | |
381 else if (lineVector.size() == 1) { | |
382 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl; | |
383 exit(1); | |
384 } | |
385 else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) { | |
386 cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl; | |
387 exit(1); | |
388 } | |
389 else if ((lineVector.size() < 6) && (lineVector.size() != 0)) { | |
390 cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl; | |
391 exit(1); | |
392 } | |
393 return false; | |
394 } | |
395 | |
396 | |
397 /* | |
398 Adapted from kent source "binKeeperFind" | |
399 */ | |
400 void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand, | |
401 vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) { | |
402 | |
403 int startBin, endBin; | |
404 startBin = (start >> _binFirstShift); | |
405 endBin = ((end-1) >> _binFirstShift); | |
406 | |
407 // loop through each bin "level" in the binning hierarchy | |
408 for (int i = 0; i < _binLevels; ++i) { | |
409 | |
410 // loop through each bin at this level of the hierarchy | |
411 int offset = _binOffsetsExtended[i]; | |
412 for (int j = (startBin+offset); j <= (endBin+offset); ++j) { | |
413 | |
414 // loop through each feature in this chrom/bin and see if it overlaps | |
415 // with the feature that was passed in. if so, add the feature to | |
416 // the list of hits. | |
417 vector<MATE>::const_iterator bedItr; | |
418 vector<MATE>::const_iterator bedEnd; | |
419 if (bEnd == 1) { | |
420 bedItr = bedMapEnd1[chrom][j].begin(); | |
421 bedEnd = bedMapEnd1[chrom][j].end(); | |
422 } | |
423 else if (bEnd == 2) { | |
424 bedItr = bedMapEnd2[chrom][j].begin(); | |
425 bedEnd = bedMapEnd2[chrom][j].end(); | |
426 } | |
427 else { | |
428 cerr << "Unexpected end of B requested" << endl; | |
429 } | |
430 for (; bedItr != bedEnd; ++bedItr) { | |
431 float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end); | |
432 float size = end - start; | |
433 | |
434 if ( (overlap / size) >= overlapFraction ) { | |
435 | |
436 // skip the hit if not on the same strand (and we care) | |
437 if ((forceStrand == false) && (enforceDiffNames == false)) { | |
438 hits.push_back(*bedItr); // it's a hit, add it. | |
439 } | |
440 else if ((forceStrand == true) && (enforceDiffNames == false)) { | |
441 if (strand == bedItr->bed.strand) | |
442 hits.push_back(*bedItr); // it's a hit, add it. | |
443 } | |
444 else if ((forceStrand == true) && (enforceDiffNames == true)) { | |
445 if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name)) | |
446 hits.push_back(*bedItr); // it's a hit, add it. | |
447 } | |
448 else if ((forceStrand == false) && (enforceDiffNames == true)) { | |
449 if (name != bedItr->bed.name) | |
450 hits.push_back(*bedItr); // it's a hit, add it. | |
451 } | |
452 } | |
453 | |
454 } | |
455 } | |
456 startBin >>= _binNextShift; | |
457 endBin >>= _binNextShift; | |
458 } | |
459 } | |
460 | |
461 | |
462 void BedFilePE::loadBedPEFileIntoMap() { | |
463 | |
464 int lineNum = 0; | |
465 int bin1, bin2; | |
466 BedLineStatus bedStatus; | |
467 BEDPE bedpeEntry, nullBedPE; | |
468 | |
469 Open(); | |
470 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); | |
471 while (bedStatus != BED_INVALID) { | |
472 | |
473 if (bedStatus == BED_VALID) { | |
474 MATE *bedEntry1 = new MATE(); | |
475 MATE *bedEntry2 = new MATE(); | |
476 // separate the BEDPE entry into separate | |
477 // BED entries | |
478 splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2); | |
479 | |
480 // load end1 into a UCSC bin map | |
481 bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end); | |
482 this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1); | |
483 | |
484 // load end2 into a UCSC bin map | |
485 bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end); | |
486 this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2); | |
487 | |
488 bedpeEntry = nullBedPE; | |
489 } | |
490 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum); | |
491 } | |
492 Close(); | |
493 } | |
494 | |
495 | |
496 void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) { | |
497 | |
498 /* | |
499 Split the BEDPE entry into separate BED entries | |
500 | |
501 NOTE: I am using a trick here where I store | |
502 the lineNum of the BEDPE from the original file | |
503 in the "count" column. This allows me to later | |
504 resolve whether the hits found on both ends of BEDPE A | |
505 came from the same entry in BEDPE B. Tracking by "name" | |
506 alone with fail when there are multiple mappings for a given | |
507 read-pair. | |
508 */ | |
509 | |
510 bedEntry1->bed.chrom = bedpeEntry.chrom1; | |
511 bedEntry1->bed.start = bedpeEntry.start1; | |
512 bedEntry1->bed.end = bedpeEntry.end1; | |
513 bedEntry1->bed.name = bedpeEntry.name; | |
514 bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory | |
515 bedEntry1->bed.strand = bedpeEntry.strand1; | |
516 bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory | |
517 bedEntry1->lineNum = lineNum; | |
518 bedEntry1->mate = bedEntry2; // keep a pointer to end2 | |
519 | |
520 bedEntry2->bed.chrom = bedpeEntry.chrom2; | |
521 bedEntry2->bed.start = bedpeEntry.start2; | |
522 bedEntry2->bed.end = bedpeEntry.end2; | |
523 bedEntry2->bed.name = bedpeEntry.name; | |
524 bedEntry2->bed.strand = bedpeEntry.strand2; | |
525 bedEntry2->lineNum = lineNum; | |
526 bedEntry2->mate = bedEntry1; // keep a pointer to end1 | |
527 } | |
528 | |
529 | |
530 |