comparison BEDTools-Version-2.14.3/src/utils/bedFilePE/bedFilePE.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 //
2 // bedFilePE.cpp
3 // BEDTools
4 //
5 // Created by Aaron Quinlan Spring 2009.
6 // Copyright 2009 Aaron Quinlan. All rights reserved.
7 //
8 // Summary: Contains common functions for finding BED overlaps.
9 //
10 // Acknowledgments: Much of the code herein is taken from Jim Kent's
11 // BED processing code. I am grateful for his elegant
12 // genome binning algorithm and therefore use it extensively.
13
14
15 #include "bedFilePE.h"
16
17
18 // Constructor
19 BedFilePE::BedFilePE(string &bedFile) {
20 this->bedFile = bedFile;
21 }
22
23 // Destructor
24 BedFilePE::~BedFilePE(void) {
25 }
26
27 void BedFilePE::Open(void) {
28 if (bedFile == "stdin" || bedFile == "-") {
29 _bedStream = &cin;
30 }
31 else {
32 _bedStream = new ifstream(bedFile.c_str(), ios::in);
33
34 if (isGzipFile(_bedStream) == true) {
35 delete _bedStream;
36 _bedStream = new igzstream(bedFile.c_str(), ios::in);
37 }
38 // can we open the file?
39 if ( !(_bedStream->good()) ) {
40 cerr << "Error: The requested bed file (" << bedFile << ") could not be opened. Exiting!" << endl;
41 exit (1);
42 }
43 }
44 }
45
46
47
48 // Close the BEDPE file
49 void BedFilePE::Close(void) {
50 if (bedFile != "stdin" && bedFile != "-") delete _bedStream;
51 }
52
53
54 BedLineStatus BedFilePE::GetNextBedPE (BEDPE &bedpe, int &lineNum) {
55
56 // make sure there are still lines to process.
57 // if so, tokenize, validate and return the BEDPE entry.
58 if (_bedStream->good()) {
59 string bedPELine;
60 vector<string> bedPEFields;
61 bedPEFields.reserve(10);
62
63 // parse the bedStream pointer
64 getline(*_bedStream, bedPELine);
65 lineNum++;
66
67 // split into a string vector.
68 Tokenize(bedPELine,bedPEFields);
69
70 // load the BEDPE struct as long as it's a valid BEDPE entry.
71 return parseLine(bedpe, bedPEFields, lineNum);
72 }
73 // default if file is closed or EOF
74 return BED_INVALID;
75 }
76
77
78 /*
79 reportBedPETab
80
81 Writes the _original_ BED entry for A.
82 Works for BEDPE only.
83 */
84 void BedFilePE::reportBedPETab(const BEDPE &a) {
85
86 if (this->bedType == 6) {
87 printf("%s\t%d\t%d\t%s\t%d\t%d\t", a.chrom1.c_str(), a.start1, a.end1,
88 a.chrom2.c_str(), a.start2, a.end2);
89 }
90 else if (this->bedType == 7) {
91 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
92 a.chrom2.c_str(), a.start2, a.end2,
93 a.name.c_str());
94 }
95 else if (this->bedType == 8) {
96 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
97 a.chrom2.c_str(), a.start2, a.end2,
98 a.name.c_str(), a.score.c_str());
99 }
100 else if (this->bedType == 10) {
101 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\t", a.chrom1.c_str(), a.start1, a.end1,
102 a.chrom2.c_str(), a.start2, a.end2,
103 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
104 }
105 else if (this->bedType > 10) {
106 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1,
107 a.chrom2.c_str(), a.start2, a.end2,
108 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
109
110 vector<string>::const_iterator othIt = a.otherFields.begin();
111 vector<string>::const_iterator othEnd = a.otherFields.end();
112 for ( ; othIt != othEnd; ++othIt) {
113 printf("\t%s", othIt->c_str());
114 }
115 printf("\t");
116 }
117 }
118
119
120
121 /*
122 reportBedPENewLine
123
124 Writes the _original_ BED entry for A.
125 Works for BEDPE only.
126 */
127 void BedFilePE::reportBedPENewLine(const BEDPE &a) {
128
129 if (this->bedType == 6) {
130 printf("%s\t%d\t%d\t%s\t%d\t%d\n", a.chrom1.c_str(), a.start1, a.end1,
131 a.chrom2.c_str(), a.start2, a.end2);
132 }
133 else if (this->bedType == 7) {
134 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
135 a.chrom2.c_str(), a.start2, a.end2,
136 a.name.c_str());
137 }
138 else if (this->bedType == 8) {
139 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
140 a.chrom2.c_str(), a.start2, a.end2,
141 a.name.c_str(), a.score.c_str());
142 }
143 else if (this->bedType == 10) {
144 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s\n", a.chrom1.c_str(), a.start1, a.end1,
145 a.chrom2.c_str(), a.start2, a.end2,
146 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
147 }
148 else if (this->bedType > 10) {
149 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", a.chrom1.c_str(), a.start1, a.end1,
150 a.chrom2.c_str(), a.start2, a.end2,
151 a.name.c_str(), a.score.c_str(), a.strand1.c_str(), a.strand2.c_str());
152
153 vector<string>::const_iterator othIt = a.otherFields.begin();
154 vector<string>::const_iterator othEnd = a.otherFields.end();
155 for ( ; othIt != othEnd; ++othIt) {
156 printf("\t%s", othIt->c_str());
157 }
158 printf("\n");
159 }
160 }
161
162
163 BedLineStatus BedFilePE::parseLine (BEDPE &bedpe, const vector<string> &lineVector, int &lineNum) {
164
165 // bail out if we have a blank line
166 if (lineVector.empty())
167 return BED_BLANK;
168
169 if ((lineVector[0].find("track") == string::npos) && (lineVector[0].find("browser") == string::npos) && (lineVector[0].find("#") == string::npos) ) {
170 // we need at least 6 columns
171 if (lineVector.size() >= 6) {
172 if (parseBedPELine(bedpe, lineVector, lineNum) == true)
173 return BED_VALID;
174 else return BED_INVALID;
175 }
176 else {
177 cerr << "It looks as though you have less than 6 columns. Are you sure your files are tab-delimited?" << endl;
178 exit(1);
179 }
180 }
181 else {
182 lineNum--;
183 return BED_HEADER;
184 }
185
186 // default
187 return BED_INVALID;
188 }
189
190
191 bool BedFilePE::parseBedPELine (BEDPE &bed, const vector<string> &lineVector, const int &lineNum) {
192
193 if ((lineNum == 1) && (lineVector.size() >= 6)) {
194
195 this->bedType = lineVector.size();
196
197 if (this->bedType == 6) {
198 bed.chrom1 = lineVector[0];
199 bed.start1 = atoi(lineVector[1].c_str());
200 bed.end1 = atoi(lineVector[2].c_str());
201
202 bed.chrom2 = lineVector[3];
203 bed.start2 = atoi(lineVector[4].c_str());
204 bed.end2 = atoi(lineVector[5].c_str());
205
206 return true;
207 }
208 else if (this->bedType == 7) {
209 bed.chrom1 = lineVector[0];
210 bed.start1 = atoi(lineVector[1].c_str());
211 bed.end1 = atoi(lineVector[2].c_str());
212
213 bed.chrom2 = lineVector[3];
214 bed.start2 = atoi(lineVector[4].c_str());
215 bed.end2 = atoi(lineVector[5].c_str());
216
217 bed.name = lineVector[6];
218 return true;
219 }
220 else if (this->bedType == 8) {
221 bed.chrom1 = lineVector[0];
222 bed.start1 = atoi(lineVector[1].c_str());
223 bed.end1 = atoi(lineVector[2].c_str());
224
225 bed.chrom2 = lineVector[3];
226 bed.start2 = atoi(lineVector[4].c_str());
227 bed.end2 = atoi(lineVector[5].c_str());
228
229 bed.name = lineVector[6];
230 bed.score = lineVector[7].c_str();
231 return true;
232 }
233 else if (this->bedType == 10) {
234 bed.chrom1 = lineVector[0];
235 bed.start1 = atoi(lineVector[1].c_str());
236 bed.end1 = atoi(lineVector[2].c_str());
237
238 bed.chrom2 = lineVector[3];
239 bed.start2 = atoi(lineVector[4].c_str());
240 bed.end2 = atoi(lineVector[5].c_str());
241
242 bed.name = lineVector[6];
243 bed.score = lineVector[7].c_str();
244
245 bed.strand1 = lineVector[8];
246 bed.strand2 = lineVector[9];
247
248 return true;
249 }
250 else if (this->bedType > 10) {
251 bed.chrom1 = lineVector[0];
252 bed.start1 = atoi(lineVector[1].c_str());
253 bed.end1 = atoi(lineVector[2].c_str());
254
255 bed.chrom2 = lineVector[3];
256 bed.start2 = atoi(lineVector[4].c_str());
257 bed.end2 = atoi(lineVector[5].c_str());
258
259 bed.name = lineVector[6];
260 bed.score = lineVector[7].c_str();
261
262 bed.strand1 = lineVector[8];
263 bed.strand2 = lineVector[9];
264
265 for (unsigned int i = 10; i < lineVector.size(); ++i) {
266 bed.otherFields.push_back(lineVector[i]);
267 }
268 return true;
269 }
270 else {
271 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl;
272 exit(1);
273 }
274
275 if (bed.start1 > bed.end1) {
276 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
277 return false;
278 }
279 else if (bed.start2 > bed.end2) {
280 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
281 return false;
282 }
283 else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) {
284 cerr << "Error: malformed BEDPE entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl;
285 return false;
286 }
287 }
288 else if ( (lineNum > 1) && (lineVector.size() == this->bedType)) {
289
290 if (this->bedType == 6) {
291 bed.chrom1 = lineVector[0];
292 bed.start1 = atoi(lineVector[1].c_str());
293 bed.end1 = atoi(lineVector[2].c_str());
294
295 bed.chrom2 = lineVector[3];
296 bed.start2 = atoi(lineVector[4].c_str());
297 bed.end2 = atoi(lineVector[5].c_str());
298
299 return true;
300 }
301 else if (this->bedType == 7) {
302 bed.chrom1 = lineVector[0];
303 bed.start1 = atoi(lineVector[1].c_str());
304 bed.end1 = atoi(lineVector[2].c_str());
305
306 bed.chrom2 = lineVector[3];
307 bed.start2 = atoi(lineVector[4].c_str());
308 bed.end2 = atoi(lineVector[5].c_str());
309
310 bed.name = lineVector[6];
311 return true;
312 }
313 else if (this->bedType == 8) {
314 bed.chrom1 = lineVector[0];
315 bed.start1 = atoi(lineVector[1].c_str());
316 bed.end1 = atoi(lineVector[2].c_str());
317
318 bed.chrom2 = lineVector[3];
319 bed.start2 = atoi(lineVector[4].c_str());
320 bed.end2 = atoi(lineVector[5].c_str());
321
322 bed.name = lineVector[6];
323 bed.score = lineVector[7].c_str();
324 return true;
325 }
326 else if (this->bedType == 10) {
327 bed.chrom1 = lineVector[0];
328 bed.start1 = atoi(lineVector[1].c_str());
329 bed.end1 = atoi(lineVector[2].c_str());
330
331 bed.chrom2 = lineVector[3];
332 bed.start2 = atoi(lineVector[4].c_str());
333 bed.end2 = atoi(lineVector[5].c_str());
334
335 bed.name = lineVector[6];
336 bed.score = lineVector[7].c_str();
337
338 bed.strand1 = lineVector[8];
339 bed.strand2 = lineVector[9];
340
341 return true;
342 }
343 else if (this->bedType > 10) {
344 bed.chrom1 = lineVector[0];
345 bed.start1 = atoi(lineVector[1].c_str());
346 bed.end1 = atoi(lineVector[2].c_str());
347
348 bed.chrom2 = lineVector[3];
349 bed.start2 = atoi(lineVector[4].c_str());
350 bed.end2 = atoi(lineVector[5].c_str());
351
352 bed.name = lineVector[6];
353 bed.score = lineVector[7].c_str();
354
355 bed.strand1 = lineVector[8];
356 bed.strand2 = lineVector[9];
357
358 for (unsigned int i = 10; i < lineVector.size(); ++i) {
359 bed.otherFields.push_back(lineVector[i]);
360 }
361 return true;
362 }
363 else {
364 cerr << "Unexpected number of fields: " << lineNum << ". Verify that your files are TAB-delimited and that your BEDPE file has 6,7,8 or 10 fields. Exiting..." << endl;
365 exit(1);
366 }
367
368 if (bed.start1 > bed.end1) {
369 cerr << "Error: malformed BED entry at line " << lineNum << ". Start1 was greater than End1. Ignoring it and moving on." << endl;
370 return false;
371 }
372 else if (bed.start2 > bed.end2) {
373 cerr << "Error: malformed BED entry at line " << lineNum << ". Start2 was greater than End2. Ignoring it and moving on." << endl;
374 return false;
375 }
376 else if ( (bed.start1 < 0) || (bed.end1 < 0) || (bed.start2 < 0) || (bed.end2 < 0) ) {
377 cerr << "Error: malformed BED entry at line " << lineNum << ". Coordinate <= 0. Ignoring it and moving on." << endl;
378 return false;
379 }
380 }
381 else if (lineVector.size() == 1) {
382 cerr << "Only one BED field detected: " << lineNum << ". Verify that your files are TAB-delimited. Exiting..." << endl;
383 exit(1);
384 }
385 else if ((lineVector.size() != this->bedType) && (lineVector.size() != 0)) {
386 cerr << "Differing number of BEDPE fields encountered at line: " << lineNum << ". Exiting..." << endl;
387 exit(1);
388 }
389 else if ((lineVector.size() < 6) && (lineVector.size() != 0)) {
390 cerr << "TAB delimited BEDPE file with at least 6 fields (chrom1, start1, end1, chrom2, start2, end2) is required at line: "<< lineNum << ". Exiting..." << endl;
391 exit(1);
392 }
393 return false;
394 }
395
396
397 /*
398 Adapted from kent source "binKeeperFind"
399 */
400 void BedFilePE::FindOverlapsPerBin(int bEnd, string chrom, CHRPOS start, CHRPOS end, string name, string strand,
401 vector<MATE> &hits, float overlapFraction, bool forceStrand, bool enforceDiffNames) {
402
403 int startBin, endBin;
404 startBin = (start >> _binFirstShift);
405 endBin = ((end-1) >> _binFirstShift);
406
407 // loop through each bin "level" in the binning hierarchy
408 for (int i = 0; i < _binLevels; ++i) {
409
410 // loop through each bin at this level of the hierarchy
411 int offset = _binOffsetsExtended[i];
412 for (int j = (startBin+offset); j <= (endBin+offset); ++j) {
413
414 // loop through each feature in this chrom/bin and see if it overlaps
415 // with the feature that was passed in. if so, add the feature to
416 // the list of hits.
417 vector<MATE>::const_iterator bedItr;
418 vector<MATE>::const_iterator bedEnd;
419 if (bEnd == 1) {
420 bedItr = bedMapEnd1[chrom][j].begin();
421 bedEnd = bedMapEnd1[chrom][j].end();
422 }
423 else if (bEnd == 2) {
424 bedItr = bedMapEnd2[chrom][j].begin();
425 bedEnd = bedMapEnd2[chrom][j].end();
426 }
427 else {
428 cerr << "Unexpected end of B requested" << endl;
429 }
430 for (; bedItr != bedEnd; ++bedItr) {
431 float overlap = overlaps(bedItr->bed.start, bedItr->bed.end, start, end);
432 float size = end - start;
433
434 if ( (overlap / size) >= overlapFraction ) {
435
436 // skip the hit if not on the same strand (and we care)
437 if ((forceStrand == false) && (enforceDiffNames == false)) {
438 hits.push_back(*bedItr); // it's a hit, add it.
439 }
440 else if ((forceStrand == true) && (enforceDiffNames == false)) {
441 if (strand == bedItr->bed.strand)
442 hits.push_back(*bedItr); // it's a hit, add it.
443 }
444 else if ((forceStrand == true) && (enforceDiffNames == true)) {
445 if ((strand == bedItr->bed.strand) && (name != bedItr->bed.name))
446 hits.push_back(*bedItr); // it's a hit, add it.
447 }
448 else if ((forceStrand == false) && (enforceDiffNames == true)) {
449 if (name != bedItr->bed.name)
450 hits.push_back(*bedItr); // it's a hit, add it.
451 }
452 }
453
454 }
455 }
456 startBin >>= _binNextShift;
457 endBin >>= _binNextShift;
458 }
459 }
460
461
462 void BedFilePE::loadBedPEFileIntoMap() {
463
464 int lineNum = 0;
465 int bin1, bin2;
466 BedLineStatus bedStatus;
467 BEDPE bedpeEntry, nullBedPE;
468
469 Open();
470 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
471 while (bedStatus != BED_INVALID) {
472
473 if (bedStatus == BED_VALID) {
474 MATE *bedEntry1 = new MATE();
475 MATE *bedEntry2 = new MATE();
476 // separate the BEDPE entry into separate
477 // BED entries
478 splitBedPEIntoBeds(bedpeEntry, lineNum, bedEntry1, bedEntry2);
479
480 // load end1 into a UCSC bin map
481 bin1 = getBin(bedEntry1->bed.start, bedEntry1->bed.end);
482 this->bedMapEnd1[bedEntry1->bed.chrom][bin1].push_back(*bedEntry1);
483
484 // load end2 into a UCSC bin map
485 bin2 = getBin(bedEntry2->bed.start, bedEntry2->bed.end);
486 this->bedMapEnd2[bedEntry2->bed.chrom][bin2].push_back(*bedEntry2);
487
488 bedpeEntry = nullBedPE;
489 }
490 bedStatus = this->GetNextBedPE(bedpeEntry, lineNum);
491 }
492 Close();
493 }
494
495
496 void BedFilePE::splitBedPEIntoBeds(const BEDPE &bedpeEntry, const int &lineNum, MATE *bedEntry1, MATE *bedEntry2) {
497
498 /*
499 Split the BEDPE entry into separate BED entries
500
501 NOTE: I am using a trick here where I store
502 the lineNum of the BEDPE from the original file
503 in the "count" column. This allows me to later
504 resolve whether the hits found on both ends of BEDPE A
505 came from the same entry in BEDPE B. Tracking by "name"
506 alone with fail when there are multiple mappings for a given
507 read-pair.
508 */
509
510 bedEntry1->bed.chrom = bedpeEntry.chrom1;
511 bedEntry1->bed.start = bedpeEntry.start1;
512 bedEntry1->bed.end = bedpeEntry.end1;
513 bedEntry1->bed.name = bedpeEntry.name;
514 bedEntry1->bed.score = bedpeEntry.score; // only store the score in end1 to save memory
515 bedEntry1->bed.strand = bedpeEntry.strand1;
516 bedEntry1->bed.otherFields = bedpeEntry.otherFields; // only store the otherFields in end1 to save memory
517 bedEntry1->lineNum = lineNum;
518 bedEntry1->mate = bedEntry2; // keep a pointer to end2
519
520 bedEntry2->bed.chrom = bedpeEntry.chrom2;
521 bedEntry2->bed.start = bedpeEntry.start2;
522 bedEntry2->bed.end = bedpeEntry.end2;
523 bedEntry2->bed.name = bedpeEntry.name;
524 bedEntry2->bed.strand = bedpeEntry.strand2;
525 bedEntry2->lineNum = lineNum;
526 bedEntry2->mate = bedEntry1; // keep a pointer to end1
527 }
528
529
530