Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/subtractBed/subtractBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 subtractBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "subtractBed.h" | |
| 14 | |
| 15 | |
| 16 /* | |
| 17 Constructor | |
| 18 */ | |
| 19 BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float overlapFraction, bool sameStrand, bool diffStrand) { | |
| 20 | |
| 21 _bedAFile = bedAFile; | |
| 22 _bedBFile = bedBFile; | |
| 23 _overlapFraction = overlapFraction; | |
| 24 _sameStrand = sameStrand; | |
| 25 _diffStrand = diffStrand; | |
| 26 | |
| 27 _bedA = new BedFile(bedAFile); | |
| 28 _bedB = new BedFile(bedBFile); | |
| 29 | |
| 30 SubtractBed(); | |
| 31 } | |
| 32 | |
| 33 | |
| 34 /* | |
| 35 Destructor | |
| 36 */ | |
| 37 BedSubtract::~BedSubtract(void) { | |
| 38 } | |
| 39 | |
| 40 | |
| 41 void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { | |
| 42 | |
| 43 // find all of the overlaps between a and B. | |
| 44 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); | |
| 45 | |
| 46 // is A completely spanned by an entry in B? | |
| 47 // if so, A should not be reported. | |
| 48 int numConsumedByB = 0; | |
| 49 int numOverlaps = 0; | |
| 50 vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple. | |
| 51 | |
| 52 vector<BED>::const_iterator h = hits.begin(); | |
| 53 vector<BED>::const_iterator hitsEnd = hits.end(); | |
| 54 for (; h != hitsEnd; ++h) { | |
| 55 | |
| 56 int s = max(a.start, h->start); | |
| 57 int e = min(a.end, h->end); | |
| 58 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
| 59 int aLength = (a.end - a.start); // the length of a in b.p. | |
| 60 | |
| 61 if (s < e) { | |
| 62 | |
| 63 // is there enough overlap (default ~ 1bp) | |
| 64 float overlap = ((float) overlapBases / (float) aLength); | |
| 65 | |
| 66 if (overlap >= 1.0) { | |
| 67 numOverlaps++; | |
| 68 numConsumedByB++; | |
| 69 } | |
| 70 else if ( overlap >= _overlapFraction ) { | |
| 71 numOverlaps++; | |
| 72 bOverlaps.push_back(*h); | |
| 73 } | |
| 74 } | |
| 75 } | |
| 76 | |
| 77 if (numOverlaps == 0) { | |
| 78 // no overlap found, so just report A as-is. | |
| 79 _bedA->reportBedNewLine(a); | |
| 80 } | |
| 81 else if (numOverlaps == 1) { | |
| 82 // one overlap found. only need to look at the single | |
| 83 // entry in bOverlaps. | |
| 84 | |
| 85 // if A was not "consumed" by any entry in B | |
| 86 if (numConsumedByB == 0) { | |
| 87 | |
| 88 BED theHit = bOverlaps[0]; | |
| 89 | |
| 90 // A ++++++++++++ | |
| 91 // B ---- | |
| 92 // Res. ==== ==== | |
| 93 if ( (theHit.start > a.start) && (theHit.end < a.end) ) { | |
| 94 _bedA->reportBedRangeNewLine(a,a.start,theHit.start); | |
| 95 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
| 96 } | |
| 97 // A ++++++++++++ | |
| 98 // B ---------- | |
| 99 // Res. == | |
| 100 else if (theHit.start == a.start) { | |
| 101 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
| 102 } | |
| 103 // A ++++++++++++ | |
| 104 // B ---------- | |
| 105 // Res. ==== | |
| 106 else if (theHit.start < a.start) { | |
| 107 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
| 108 } | |
| 109 // A ++++++++++++ | |
| 110 // B ---------- | |
| 111 // Res. ======= | |
| 112 else if (theHit.start > a.start) { | |
| 113 _bedA->reportBedRangeNewLine(a,a.start,theHit.start); | |
| 114 } | |
| 115 } | |
| 116 } | |
| 117 else if (numOverlaps > 1) { | |
| 118 // multiple overlapz found. look at all the hits | |
| 119 // and figure out which bases in A survived. then | |
| 120 // report the contigous intervals that survived. | |
| 121 | |
| 122 vector<bool> aKeep(a.end - a.start, true); | |
| 123 | |
| 124 if (numConsumedByB == 0) { | |
| 125 // track the number of hit starts and ends at each position in A | |
| 126 for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) { | |
| 127 int s = max(a.start, h->start); | |
| 128 int e = min(a.end, h->end); | |
| 129 | |
| 130 for (int i = s+1; i <= e; ++i) { | |
| 131 aKeep[i-a.start-1] = false; | |
| 132 } | |
| 133 } | |
| 134 // report the remaining blocks. | |
| 135 for (unsigned int i = 0; i < aKeep.size(); ++i) { | |
| 136 if (aKeep[i] == true) { | |
| 137 CHRPOS blockStart = i + a.start; | |
| 138 while ((aKeep[i] == true) && (i < aKeep.size())) { | |
| 139 i++; | |
| 140 } | |
| 141 CHRPOS blockEnd = i + a.start; | |
| 142 blockEnd = min(a.end, blockEnd); | |
| 143 _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); | |
| 144 } | |
| 145 } | |
| 146 } | |
| 147 } | |
| 148 } | |
| 149 | |
| 150 | |
| 151 | |
| 152 void BedSubtract::SubtractBed() { | |
| 153 | |
| 154 // load the "B" bed file into a map so | |
| 155 // that we can easily compare "A" to it for overlaps | |
| 156 _bedB->loadBedFileIntoMap(); | |
| 157 | |
| 158 BED a, nullBed; | |
| 159 BedLineStatus bedStatus; | |
| 160 int lineNum = 0; // current input line number | |
| 161 vector<BED> hits; // vector of potential hits | |
| 162 // reserve some space | |
| 163 hits.reserve(100); | |
| 164 | |
| 165 _bedA->Open(); | |
| 166 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
| 167 if (bedStatus == BED_VALID) { | |
| 168 FindAndSubtractOverlaps(a, hits); | |
| 169 hits.clear(); | |
| 170 a = nullBed; | |
| 171 } | |
| 172 } | |
| 173 _bedA->Close(); | |
| 174 | |
| 175 } | |
| 176 // END Intersect | |
| 177 | |
| 178 |
