Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/fjoin/fjoin.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 intersectBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "fjoin.h" | |
| 14 #include <queue> | |
| 15 #include <set> | |
| 16 | |
| 17 bool leftOf(const BED &a, const BED &b); | |
| 18 | |
| 19 | |
| 20 bool BedIntersect::processHits(BED &a, vector<BED> &hits) { | |
| 21 // how many overlaps are there b/w the bed and the set of hits? | |
| 22 int s, e, overlapBases; | |
| 23 int numOverlaps = 0; | |
| 24 bool hitsFound = false; | |
| 25 int aLength = (a.end - a.start); // the length of a in b.p. | |
| 26 | |
| 27 // loop through the hits and report those that meet the user's criteria | |
| 28 vector<BED>::const_iterator h = hits.begin(); | |
| 29 vector<BED>::const_iterator hitsEnd = hits.end(); | |
| 30 for (; h != hitsEnd; ++h) { | |
| 31 s = max(a.start, h->start); | |
| 32 e = min(a.end, h->end); | |
| 33 overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
| 34 | |
| 35 // is there enough overlap relative to the user's request? (default ~ 1bp) | |
| 36 if ( ( (float) overlapBases / (float) aLength ) >= _overlapFraction ) { | |
| 37 // Report the hit if the user doesn't care about reciprocal overlap between A and B. | |
| 38 if (_reciprocal == false) { | |
| 39 hitsFound = true; | |
| 40 numOverlaps++; | |
| 41 if (_printable == true) | |
| 42 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
| 43 } | |
| 44 // we require there to be sufficient __reciprocal__ overlap | |
| 45 else { | |
| 46 int bLength = (h->end - h->start); | |
| 47 float bOverlap = ( (float) overlapBases / (float) bLength ); | |
| 48 if (bOverlap >= _overlapFraction) { | |
| 49 hitsFound = true; | |
| 50 numOverlaps++; | |
| 51 if (_printable == true) | |
| 52 ReportOverlapDetail(overlapBases, a, *h, s, e); | |
| 53 } | |
| 54 } | |
| 55 } | |
| 56 } | |
| 57 // report the summary of the overlaps if requested. | |
| 58 ReportOverlapSummary(a, numOverlaps); | |
| 59 // were hits found for this BED feature? | |
| 60 return hitsFound; | |
| 61 } | |
| 62 | |
| 63 /* | |
| 64 Constructor | |
| 65 */ | |
| 66 BedIntersect::BedIntersect(string bedAFile, string bedBFile, bool anyHit, | |
| 67 bool writeA, bool writeB, bool writeOverlap, bool writeAllOverlap, | |
| 68 float overlapFraction, bool noHit, bool writeCount, bool forceStrand, | |
| 69 bool reciprocal, bool obeySplits, bool bamInput, bool bamOutput) { | |
| 70 | |
| 71 _bedAFile = bedAFile; | |
| 72 _bedBFile = bedBFile; | |
| 73 _anyHit = anyHit; | |
| 74 _noHit = noHit; | |
| 75 _writeA = writeA; | |
| 76 _writeB = writeB; | |
| 77 _writeOverlap = writeOverlap; | |
| 78 _writeAllOverlap = writeAllOverlap; | |
| 79 _writeCount = writeCount; | |
| 80 _overlapFraction = overlapFraction; | |
| 81 _forceStrand = forceStrand; | |
| 82 _reciprocal = reciprocal; | |
| 83 _obeySplits = obeySplits; | |
| 84 _bamInput = bamInput; | |
| 85 _bamOutput = bamOutput; | |
| 86 | |
| 87 if (_anyHit || _noHit || _writeCount) | |
| 88 _printable = false; | |
| 89 else | |
| 90 _printable = true; | |
| 91 | |
| 92 // create new BED file objects for A and B | |
| 93 _bedA = new BedFile(bedAFile); | |
| 94 _bedB = new BedFile(bedBFile); | |
| 95 | |
| 96 IntersectBed(); | |
| 97 } | |
| 98 | |
| 99 | |
| 100 /* | |
| 101 Destructor | |
| 102 */ | |
| 103 BedIntersect::~BedIntersect(void) { | |
| 104 } | |
| 105 | |
| 106 | |
| 107 bool leftOf(const BED &a, const BED &b) { | |
| 108 return (a.end <= b.start); | |
| 109 } | |
| 110 | |
| 111 | |
| 112 void BedIntersect::ReportOverlapDetail(const int &overlapBases, const BED &a, const BED &b, | |
| 113 const CHRPOS &s, const CHRPOS &e) { | |
| 114 // default. simple intersection only | |
| 115 if (_writeA == false && _writeB == false && _writeOverlap == false) { | |
| 116 _bedA->reportBedRangeNewLine(a,s,e); | |
| 117 } | |
| 118 // -wa -wbwrite the original A and B | |
| 119 else if (_writeA == true && _writeB == true) { | |
| 120 _bedA->reportBedTab(a); | |
| 121 _bedB->reportBedNewLine(b); | |
| 122 } | |
| 123 // -wa write just the original A | |
| 124 else if (_writeA == true) { | |
| 125 _bedA->reportBedNewLine(a); | |
| 126 } | |
| 127 // -wb write the intersected portion of A and the original B | |
| 128 else if (_writeB == true) { | |
| 129 _bedA->reportBedRangeTab(a,s,e); | |
| 130 _bedB->reportBedNewLine(b); | |
| 131 } | |
| 132 // -wo write the original A and B plus the no. of overlapping bases. | |
| 133 else if (_writeOverlap == true) { | |
| 134 _bedA->reportBedTab(a); | |
| 135 _bedB->reportBedTab(b); | |
| 136 printf("%d\n", overlapBases); | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 | |
| 141 void BedIntersect::ReportOverlapSummary(const BED &a, const int &numOverlapsFound) { | |
| 142 // -u just report the fact that there was >= 1 overlaps | |
| 143 if (_anyHit && (numOverlapsFound >= 1)) { | |
| 144 _bedA->reportBedNewLine(a); | |
| 145 } | |
| 146 // -c report the total number of features overlapped in B | |
| 147 else if (_writeCount) { | |
| 148 _bedA->reportBedTab(a); | |
| 149 printf("%d\n", numOverlapsFound); | |
| 150 } | |
| 151 // -v report iff there were no overlaps | |
| 152 else if (_noHit && (numOverlapsFound == 0)) { | |
| 153 _bedA->reportBedNewLine(a); | |
| 154 } | |
| 155 // -wao the user wants to force the reporting of 0 overlap | |
| 156 else if (_writeAllOverlap && (numOverlapsFound == 0)) { | |
| 157 _bedA->reportBedTab(a); | |
| 158 _bedB->reportNullBedTab(); | |
| 159 printf("0\n"); | |
| 160 } | |
| 161 } | |
| 162 | |
| 163 | |
| 164 | |
| 165 void BedIntersect::Scan(BED *x, vector<BED *> *windowX, BedLineStatus xStatus, | |
| 166 const BED &y, vector<BED *> *windowY, BedLineStatus yStatus) { | |
| 167 | |
| 168 if (xStatus != BED_VALID) { | |
| 169 return; | |
| 170 } | |
| 171 | |
| 172 std::vector<BED *>::iterator wYIter = windowY->begin(); | |
| 173 while (wYIter != windowY->end()) { | |
| 174 if (leftOf(*(*wYIter), *x) == true) { | |
| 175 (*wYIter)->finished = true; | |
| 176 wYIter = windowY->erase(wYIter); // erase auto-increments to the next position | |
| 177 } | |
| 178 else if (overlaps((*wYIter)->start, (*wYIter)->end, x->start, x->end) > 0) { | |
| 179 if (_lastPick == 0) { | |
| 180 AddHits(x, *(*wYIter)); | |
| 181 } | |
| 182 else { | |
| 183 AddHits(*wYIter, *x); | |
| 184 } | |
| 185 ++wYIter; // force incrementing | |
| 186 } | |
| 187 } | |
| 188 if (leftOf(*x,y) == false) | |
| 189 windowX->push_back(x); | |
| 190 else { | |
| 191 x->finished = true; | |
| 192 } | |
| 193 // dump the buffered results (if any) | |
| 194 FlushOutputBuffer(); | |
| 195 } | |
| 196 | |
| 197 | |
| 198 void BedIntersect::AddHits(BED *x, const BED &y) { | |
| 199 if (x->overlaps.empty() == true) | |
| 200 _outputBuffer.push(x); | |
| 201 x->overlaps.push_back(y); | |
| 202 } | |
| 203 | |
| 204 | |
| 205 void BedIntersect::FlushOutputBuffer(bool final) { | |
| 206 while (_outputBuffer.empty() == false) | |
| 207 { | |
| 208 if (final == false && _outputBuffer.front()->finished == false) | |
| 209 break; | |
| 210 | |
| 211 processHits(*_outputBuffer.front(), _outputBuffer.front()->overlaps); | |
| 212 // remove the finished BED entry from the heap | |
| 213 delete _outputBuffer.front(); | |
| 214 _outputBuffer.pop(); | |
| 215 } | |
| 216 } | |
| 217 | |
| 218 | |
| 219 vector<BED*>* BedIntersect::GetWindow(const string &chrom, bool isA) { | |
| 220 | |
| 221 // iterator to test if a window for a given chrom exists. | |
| 222 map<string, vector<BED*> >::iterator it; | |
| 223 | |
| 224 // grab the current window for A or B, depending on | |
| 225 // the request. if a window hasn't yet been created | |
| 226 // for the requested chrom, create one. | |
| 227 | |
| 228 if (isA) { | |
| 229 it = _windowA.find(chrom); | |
| 230 if (it != _windowA.end()) { | |
| 231 return & _windowA[chrom]; | |
| 232 } | |
| 233 else { | |
| 234 _windowA.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); | |
| 235 return & _windowA[chrom]; | |
| 236 } | |
| 237 } | |
| 238 else { | |
| 239 it = _windowB.find(chrom); | |
| 240 if (it != _windowB.end()) { | |
| 241 return & _windowB[chrom]; | |
| 242 } | |
| 243 else { | |
| 244 _windowB.insert(pair<string, vector<BED *> >(chrom, vector<BED *>())); | |
| 245 return & _windowB[chrom]; | |
| 246 } | |
| 247 } | |
| 248 } | |
| 249 | |
| 250 | |
| 251 void BedIntersect::ChromSwitch(const string &chrom) { | |
| 252 | |
| 253 vector<BED*>::iterator windowAIter = _windowA[chrom].begin(); | |
| 254 vector<BED*>::iterator windowAEnd = _windowA[chrom].end(); | |
| 255 for (; windowAIter != windowAEnd; ++windowAIter) | |
| 256 (*windowAIter)->finished = true; | |
| 257 | |
| 258 vector<BED*>::iterator windowBIter = _windowB[chrom].begin(); | |
| 259 vector<BED*>::iterator windowBEnd = _windowB[chrom].end(); | |
| 260 for (; windowBIter != windowBEnd; ++windowBIter) | |
| 261 (*windowBIter)->finished = true; | |
| 262 | |
| 263 FlushOutputBuffer(); | |
| 264 } | |
| 265 | |
| 266 | |
| 267 void BedIntersect::IntersectBed() { | |
| 268 | |
| 269 int aLineNum = 0; | |
| 270 int bLineNum = 0; | |
| 271 | |
| 272 // current feature from each file | |
| 273 BED *a, *b, *prevA, *prevB; | |
| 274 | |
| 275 // status of the current lines | |
| 276 BedLineStatus aStatus, bStatus; | |
| 277 | |
| 278 // open the files; get the first line from each | |
| 279 _bedA->Open(); | |
| 280 _bedB->Open(); | |
| 281 | |
| 282 prevA = NULL; | |
| 283 prevB = NULL; | |
| 284 a = new BED(); | |
| 285 b = new BED(); | |
| 286 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
| 287 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
| 288 | |
| 289 cout << a->chrom << " " << a->start << " " << a->chrom << " " << b->start << endl; | |
| 290 while (aStatus != BED_INVALID || bStatus != BED_INVALID) { | |
| 291 | |
| 292 if ((a->start <= b->start) && (a->chrom == b->chrom)) { | |
| 293 prevA = a; | |
| 294 _lastPick = 0; | |
| 295 Scan(a, GetWindow(a->chrom, true), aStatus, | |
| 296 *b, GetWindow(a->chrom, false), bStatus); | |
| 297 | |
| 298 a = new BED(); | |
| 299 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
| 300 } | |
| 301 else if ((a->start > b->start) && (a->chrom == b->chrom)) { | |
| 302 prevB = b; | |
| 303 _lastPick = 1; | |
| 304 Scan(b, GetWindow(b->chrom, false), bStatus, | |
| 305 *a, GetWindow(b->chrom, true), aStatus); | |
| 306 | |
| 307 b = new BED(); | |
| 308 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
| 309 } | |
| 310 else if (a->chrom != b->chrom) { | |
| 311 // A was most recently read | |
| 312 if (_lastPick == 0) { | |
| 313 prevB = b; | |
| 314 while (b->chrom == prevA->chrom){ | |
| 315 _windowB[prevA->chrom].push_back(b); | |
| 316 b = new BED(); | |
| 317 bStatus = _bedB->GetNextBed(*b, bLineNum); | |
| 318 } | |
| 319 Scan(prevA, GetWindow(prevA->chrom, true), aStatus, | |
| 320 *prevB, GetWindow(prevA->chrom, false), bStatus); | |
| 321 } | |
| 322 // B was most recently read | |
| 323 else { | |
| 324 prevA = a; | |
| 325 while (a->chrom == prevB->chrom) { | |
| 326 _windowA[prevB->chrom].push_back(a); | |
| 327 a = new BED(); | |
| 328 aStatus = _bedA->GetNextBed(*a, aLineNum); | |
| 329 } | |
| 330 Scan(prevB, GetWindow(prevB->chrom, false), bStatus, | |
| 331 *prevA, GetWindow(prevB->chrom, true), aStatus); | |
| 332 } | |
| 333 FlushOutputBuffer(true); | |
| 334 } | |
| 335 if (prevA!=NULL&&prevB!=NULL) | |
| 336 //cout << prevA->chrom << " " << a->chrom << " " << a->start << " " | |
| 337 // << prevB->chrom << " " << b->chrom << " " << b->start << "\n"; | |
| 338 if (aStatus == BED_INVALID) a->start = INT_MAX; | |
| 339 if (bStatus == BED_INVALID) b->start = INT_MAX; | |
| 340 } | |
| 341 | |
| 342 // clear out the final bit of staged output | |
| 343 FlushOutputBuffer(true); | |
| 344 | |
| 345 // close the files | |
| 346 _bedA->Close(); | |
| 347 _bedB->Close(); | |
| 348 } | |
| 349 | |
| 350 |
