Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 closestBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "closestBed.h" | |
| 14 | |
| 15 const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. | |
| 16 // We don't want to keep looking if we | |
| 17 // can't find a nearby feature within 512 Mb. | |
| 18 const int SLOPGROWTH = 2048000; | |
| 19 | |
| 20 | |
| 21 /* | |
| 22 Constructor | |
| 23 */ | |
| 24 BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, | |
| 25 string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode, | |
| 26 bool ignoreOverlaps) | |
| 27 : _bedAFile(bedAFile) | |
| 28 , _bedBFile(bedBFile) | |
| 29 , _tieMode(tieMode) | |
| 30 , _sameStrand(sameStrand) | |
| 31 , _diffStrand(diffStrand) | |
| 32 , _reportDistance(reportDistance) | |
| 33 , _signDistance(signDistance) | |
| 34 , _strandedDistMode(_strandedDistMode) | |
| 35 , _ignoreOverlaps(ignoreOverlaps) | |
| 36 { | |
| 37 _bedA = new BedFile(_bedAFile); | |
| 38 _bedB = new BedFile(_bedBFile); | |
| 39 FindClosestBed(); | |
| 40 } | |
| 41 | |
| 42 | |
| 43 /* | |
| 44 Destructor | |
| 45 */ | |
| 46 BedClosest::~BedClosest(void) { | |
| 47 } | |
| 48 | |
| 49 | |
| 50 void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { | |
| 51 | |
| 52 int slop = 0; // start out just looking for overlaps | |
| 53 // within the current bin (~128Kb) | |
| 54 | |
| 55 // update the current feature's start and end | |
| 56 | |
| 57 CHRPOS aFudgeStart = 0; | |
| 58 CHRPOS aFudgeEnd; | |
| 59 int numOverlaps = 0; | |
| 60 vector<BED> closestB; | |
| 61 CHRPOS minDistance = INT_MAX; | |
| 62 int32_t curDistance = INT_MAX; | |
| 63 vector<int32_t> distances; | |
| 64 | |
| 65 // is there at least one feature in B on the same chrom | |
| 66 // as the current A feature? | |
| 67 if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { | |
| 68 | |
| 69 while ((numOverlaps == 0) && (slop <= MAXSLOP)) { | |
| 70 | |
| 71 // add some slop (starting at 0 bases) to a in hopes | |
| 72 // of finding a hit in B | |
| 73 if ((static_cast<int>(a.start) - slop) > 0) | |
| 74 aFudgeStart = a.start - slop; | |
| 75 else | |
| 76 aFudgeStart = 0; | |
| 77 | |
| 78 if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) | |
| 79 aFudgeEnd = a.end + slop; | |
| 80 else | |
| 81 aFudgeEnd = 2 * MAXSLOP; | |
| 82 | |
| 83 // THE HEAVY LIFTING | |
| 84 // search for hits with the current slop added | |
| 85 _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand); | |
| 86 | |
| 87 vector<BED>::const_iterator h = hits.begin(); | |
| 88 vector<BED>::const_iterator hitsEnd = hits.end(); | |
| 89 for (; h != hitsEnd; ++h) { | |
| 90 | |
| 91 // do the actual features overlap? | |
| 92 int s = max(a.start, h->start); | |
| 93 int e = min(a.end, h->end); | |
| 94 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
| 95 | |
| 96 // make sure we allow overlapping features. | |
| 97 if ((overlapBases > 0) && (_ignoreOverlaps == true)) | |
| 98 continue; | |
| 99 else | |
| 100 numOverlaps++; | |
| 101 | |
| 102 // there is overlap. make sure we allow overlapping features () | |
| 103 if (overlapBases > 0) { | |
| 104 closestB.push_back(*h); | |
| 105 distances.push_back(0); | |
| 106 } | |
| 107 // the hit is to the "left" of A | |
| 108 else if (h->end <= a.start) { | |
| 109 curDistance = a.start - h->end; | |
| 110 if (_signDistance) { | |
| 111 if ((_strandedDistMode == "ref") | |
| 112 || (_strandedDistMode == "a" && a.strand != "-") | |
| 113 || (_strandedDistMode == "b" && h->strand == "-")) { | |
| 114 curDistance = -curDistance; | |
| 115 } | |
| 116 } | |
| 117 | |
| 118 if (abs(curDistance) < minDistance) { | |
| 119 minDistance = abs(curDistance); | |
| 120 | |
| 121 closestB.clear(); | |
| 122 closestB.push_back(*h); | |
| 123 distances.clear(); | |
| 124 distances.push_back(curDistance); | |
| 125 } | |
| 126 else if (abs(curDistance) == minDistance) { | |
| 127 minDistance = abs(curDistance); | |
| 128 closestB.push_back(*h); | |
| 129 distances.push_back(curDistance); | |
| 130 } | |
| 131 } | |
| 132 // the hit is to the "right" of A | |
| 133 else if (h->start >= a.end) { | |
| 134 curDistance = h->start - a.end; | |
| 135 if (_signDistance) { | |
| 136 if ((_strandedDistMode == "a" && a.strand == "-") | |
| 137 || (_strandedDistMode == "b" && h->strand != "-")) { | |
| 138 curDistance = -curDistance; | |
| 139 } | |
| 140 } | |
| 141 if (abs(curDistance) < minDistance) { | |
| 142 minDistance = abs(curDistance); | |
| 143 closestB.clear(); | |
| 144 closestB.push_back(*h); | |
| 145 distances.clear(); | |
| 146 distances.push_back(curDistance); | |
| 147 } | |
| 148 else if (abs(curDistance) == minDistance) { | |
| 149 minDistance = abs(curDistance); | |
| 150 closestB.push_back(*h); | |
| 151 distances.push_back(curDistance); | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 // if no overlaps were found, we'll widen the range | |
| 156 // by SLOPGROWTH in each direction and search again. | |
| 157 slop += SLOPGROWTH; | |
| 158 } | |
| 159 } | |
| 160 // there is no feature in B on the same chromosome as A | |
| 161 else { | |
| 162 _bedA->reportBedTab(a); | |
| 163 if (_reportDistance == true) { | |
| 164 _bedB->reportNullBedTab(); | |
| 165 cout << -1 << endl; | |
| 166 } | |
| 167 else | |
| 168 _bedB->reportNullBedNewLine(); | |
| 169 } | |
| 170 | |
| 171 // report the closest feature(s) in B to the current A feature. | |
| 172 // obey the user's reporting request (_tieMode) | |
| 173 if (numOverlaps > 0) { | |
| 174 if (closestB.size() == 1 || _tieMode == "first") { | |
| 175 _bedA->reportBedTab(a); | |
| 176 if (_reportDistance == true) { | |
| 177 _bedB->reportBedTab(closestB[0]); | |
| 178 cout << distances[0] << endl; | |
| 179 } | |
| 180 else | |
| 181 _bedB->reportBedNewLine(closestB[0]); | |
| 182 } | |
| 183 else { | |
| 184 if (_tieMode == "all") { | |
| 185 size_t i = 0; | |
| 186 for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { | |
| 187 _bedA->reportBedTab(a); | |
| 188 if (_reportDistance == true) { | |
| 189 _bedB->reportBedTab(*b); | |
| 190 cout << distances[i++] <<endl; | |
| 191 } | |
| 192 else | |
| 193 _bedB->reportBedNewLine(*b); | |
| 194 } | |
| 195 } | |
| 196 else if (_tieMode == "last") { | |
| 197 _bedA->reportBedTab(a); | |
| 198 if (_reportDistance == true) { | |
| 199 _bedB->reportBedTab(closestB[closestB.size()-1]); | |
| 200 cout << distances[distances.size() - 1]<<endl; | |
| 201 } | |
| 202 else | |
| 203 _bedB->reportBedNewLine(closestB[closestB.size()-1]); | |
| 204 } | |
| 205 } | |
| 206 } | |
| 207 } | |
| 208 | |
| 209 | |
| 210 void BedClosest::FindClosestBed() { | |
| 211 | |
| 212 // load the "B" bed file into a map so | |
| 213 // that we can easily compare "A" to it for overlaps | |
| 214 _bedB->loadBedFileIntoMap(); | |
| 215 | |
| 216 BED a, nullBed; | |
| 217 int lineNum = 0; // current input line number | |
| 218 vector<BED> hits; // vector of potential hits | |
| 219 hits.reserve(100); | |
| 220 BedLineStatus bedStatus; | |
| 221 | |
| 222 _bedA->Open(); | |
| 223 // process each entry in A in search of the closest feature in B | |
| 224 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
| 225 if (bedStatus == BED_VALID) { | |
| 226 FindWindowOverlaps(a, hits); | |
| 227 hits.clear(); | |
| 228 a = nullBed; | |
| 229 } | |
| 230 } | |
| 231 _bedA->Close(); | |
| 232 } | |
| 233 // END ClosestBed | |
| 234 |
