| 0 | 1 /***************************************************************************** | 
|  | 2   closestBed.cpp | 
|  | 3 | 
|  | 4   (c) 2009 - Aaron Quinlan | 
|  | 5   Hall Laboratory | 
|  | 6   Department of Biochemistry and Molecular Genetics | 
|  | 7   University of Virginia | 
|  | 8   aaronquinlan@gmail.com | 
|  | 9 | 
|  | 10   Licenced under the GNU General Public License 2.0 license. | 
|  | 11 ******************************************************************************/ | 
|  | 12 #include "lineFileUtilities.h" | 
|  | 13 #include "closestBed.h" | 
|  | 14 | 
|  | 15 const int MAXSLOP = 256000000;  // 2*MAXSLOP = 512 megabases. | 
|  | 16                                 // We don't want to keep looking if we | 
|  | 17                                 // can't find a nearby feature within 512 Mb. | 
|  | 18 const int SLOPGROWTH = 2048000; | 
|  | 19 | 
|  | 20 | 
|  | 21 /* | 
|  | 22     Constructor | 
|  | 23 */ | 
|  | 24 BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, | 
|  | 25                        string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode, | 
|  | 26                        bool ignoreOverlaps) | 
|  | 27     : _bedAFile(bedAFile) | 
|  | 28     , _bedBFile(bedBFile) | 
|  | 29     , _tieMode(tieMode) | 
|  | 30     , _sameStrand(sameStrand) | 
|  | 31     , _diffStrand(diffStrand) | 
|  | 32     , _reportDistance(reportDistance) | 
|  | 33     , _signDistance(signDistance) | 
|  | 34     , _strandedDistMode(_strandedDistMode) | 
|  | 35     , _ignoreOverlaps(ignoreOverlaps) | 
|  | 36 { | 
|  | 37     _bedA           = new BedFile(_bedAFile); | 
|  | 38     _bedB           = new BedFile(_bedBFile); | 
|  | 39     FindClosestBed(); | 
|  | 40 } | 
|  | 41 | 
|  | 42 | 
|  | 43 /* | 
|  | 44     Destructor | 
|  | 45 */ | 
|  | 46 BedClosest::~BedClosest(void) { | 
|  | 47 } | 
|  | 48 | 
|  | 49 | 
|  | 50 void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { | 
|  | 51 | 
|  | 52     int slop = 0;  // start out just looking for overlaps | 
|  | 53                    // within the current bin (~128Kb) | 
|  | 54 | 
|  | 55     // update the current feature's start and end | 
|  | 56 | 
|  | 57     CHRPOS aFudgeStart = 0; | 
|  | 58     CHRPOS aFudgeEnd; | 
|  | 59     int numOverlaps = 0; | 
|  | 60     vector<BED> closestB; | 
|  | 61     CHRPOS minDistance = INT_MAX; | 
|  | 62     int32_t curDistance = INT_MAX; | 
|  | 63     vector<int32_t> distances; | 
|  | 64 | 
|  | 65     // is there at least one feature in B on the same chrom | 
|  | 66     // as the current A feature? | 
|  | 67     if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { | 
|  | 68 | 
|  | 69         while ((numOverlaps == 0) && (slop <= MAXSLOP)) { | 
|  | 70 | 
|  | 71             // add some slop (starting at 0 bases) to a in hopes | 
|  | 72             // of finding a hit in B | 
|  | 73             if ((static_cast<int>(a.start) - slop) > 0) | 
|  | 74                 aFudgeStart = a.start - slop; | 
|  | 75             else | 
|  | 76                 aFudgeStart = 0; | 
|  | 77 | 
|  | 78             if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) | 
|  | 79                 aFudgeEnd = a.end + slop; | 
|  | 80             else | 
|  | 81                 aFudgeEnd = 2 * MAXSLOP; | 
|  | 82 | 
|  | 83             // THE HEAVY LIFTING | 
|  | 84             // search for hits with the current slop added | 
|  | 85             _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand); | 
|  | 86 | 
|  | 87             vector<BED>::const_iterator h = hits.begin(); | 
|  | 88             vector<BED>::const_iterator hitsEnd = hits.end(); | 
|  | 89             for (; h != hitsEnd; ++h) { | 
|  | 90 | 
|  | 91                 // do the actual features overlap? | 
|  | 92                 int s = max(a.start, h->start); | 
|  | 93                 int e = min(a.end, h->end); | 
|  | 94                 int overlapBases = (e - s);             // the number of overlapping bases b/w a and b | 
|  | 95 | 
|  | 96                 // make sure we allow overlapping features. | 
|  | 97                 if ((overlapBases > 0) && (_ignoreOverlaps == true)) | 
|  | 98                     continue; | 
|  | 99                 else | 
|  | 100                     numOverlaps++; | 
|  | 101 | 
|  | 102                 // there is overlap. make sure we allow overlapping features () | 
|  | 103                 if (overlapBases > 0) { | 
|  | 104                     closestB.push_back(*h); | 
|  | 105                     distances.push_back(0); | 
|  | 106                 } | 
|  | 107                 // the hit is to the "left" of A | 
|  | 108                 else if (h->end <= a.start) { | 
|  | 109                     curDistance = a.start - h->end; | 
|  | 110                     if (_signDistance) { | 
|  | 111                         if ((_strandedDistMode == "ref") | 
|  | 112                                 || (_strandedDistMode == "a" && a.strand != "-") | 
|  | 113                                 || (_strandedDistMode == "b" && h->strand == "-")) { | 
|  | 114                             curDistance = -curDistance; | 
|  | 115                         } | 
|  | 116                     } | 
|  | 117 | 
|  | 118                     if (abs(curDistance) < minDistance) { | 
|  | 119                         minDistance = abs(curDistance); | 
|  | 120 | 
|  | 121                         closestB.clear(); | 
|  | 122                         closestB.push_back(*h); | 
|  | 123                         distances.clear(); | 
|  | 124                         distances.push_back(curDistance); | 
|  | 125                     } | 
|  | 126                     else if (abs(curDistance) == minDistance) { | 
|  | 127                         minDistance = abs(curDistance); | 
|  | 128                         closestB.push_back(*h); | 
|  | 129                         distances.push_back(curDistance); | 
|  | 130                     } | 
|  | 131                 } | 
|  | 132                 // the hit is to the "right" of A | 
|  | 133                 else if (h->start >= a.end) { | 
|  | 134                     curDistance = h->start - a.end; | 
|  | 135                     if (_signDistance) { | 
|  | 136                         if ((_strandedDistMode == "a" && a.strand == "-") | 
|  | 137                                 || (_strandedDistMode == "b" && h->strand != "-")) { | 
|  | 138                             curDistance = -curDistance; | 
|  | 139                         } | 
|  | 140                     } | 
|  | 141                     if (abs(curDistance) < minDistance) { | 
|  | 142                         minDistance = abs(curDistance); | 
|  | 143                         closestB.clear(); | 
|  | 144                         closestB.push_back(*h); | 
|  | 145                         distances.clear(); | 
|  | 146                         distances.push_back(curDistance); | 
|  | 147                     } | 
|  | 148                     else if (abs(curDistance) == minDistance) { | 
|  | 149                         minDistance = abs(curDistance); | 
|  | 150                         closestB.push_back(*h); | 
|  | 151                         distances.push_back(curDistance); | 
|  | 152                     } | 
|  | 153                 } | 
|  | 154             } | 
|  | 155             // if no overlaps were found, we'll widen the range | 
|  | 156             // by SLOPGROWTH in each direction and search again. | 
|  | 157             slop += SLOPGROWTH; | 
|  | 158         } | 
|  | 159     } | 
|  | 160     // there is no feature in B on the same chromosome as A | 
|  | 161     else { | 
|  | 162         _bedA->reportBedTab(a); | 
|  | 163         if (_reportDistance == true) { | 
|  | 164             _bedB->reportNullBedTab(); | 
|  | 165             cout << -1 << endl; | 
|  | 166         } | 
|  | 167         else | 
|  | 168             _bedB->reportNullBedNewLine(); | 
|  | 169     } | 
|  | 170 | 
|  | 171     // report the closest feature(s) in B to the current A feature. | 
|  | 172     // obey the user's reporting request (_tieMode) | 
|  | 173     if (numOverlaps > 0) { | 
|  | 174         if (closestB.size() == 1 || _tieMode == "first") { | 
|  | 175             _bedA->reportBedTab(a); | 
|  | 176             if (_reportDistance == true) { | 
|  | 177                 _bedB->reportBedTab(closestB[0]); | 
|  | 178                 cout << distances[0] << endl; | 
|  | 179             } | 
|  | 180             else | 
|  | 181                 _bedB->reportBedNewLine(closestB[0]); | 
|  | 182         } | 
|  | 183         else { | 
|  | 184             if (_tieMode == "all") { | 
|  | 185                 size_t i = 0; | 
|  | 186                 for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { | 
|  | 187                     _bedA->reportBedTab(a); | 
|  | 188                     if (_reportDistance == true) { | 
|  | 189                         _bedB->reportBedTab(*b); | 
|  | 190                         cout << distances[i++] <<endl; | 
|  | 191                     } | 
|  | 192                     else | 
|  | 193                         _bedB->reportBedNewLine(*b); | 
|  | 194                 } | 
|  | 195             } | 
|  | 196             else if (_tieMode == "last") { | 
|  | 197                 _bedA->reportBedTab(a); | 
|  | 198                 if (_reportDistance == true) { | 
|  | 199                     _bedB->reportBedTab(closestB[closestB.size()-1]); | 
|  | 200                     cout << distances[distances.size() - 1]<<endl; | 
|  | 201                 } | 
|  | 202                 else | 
|  | 203                     _bedB->reportBedNewLine(closestB[closestB.size()-1]); | 
|  | 204             } | 
|  | 205         } | 
|  | 206     } | 
|  | 207 } | 
|  | 208 | 
|  | 209 | 
|  | 210 void BedClosest::FindClosestBed() { | 
|  | 211 | 
|  | 212     // load the "B" bed file into a map so | 
|  | 213     // that we can easily compare "A" to it for overlaps | 
|  | 214     _bedB->loadBedFileIntoMap(); | 
|  | 215 | 
|  | 216     BED a, nullBed; | 
|  | 217     int lineNum = 0;                    // current input line number | 
|  | 218     vector<BED> hits;                   // vector of potential hits | 
|  | 219     hits.reserve(100); | 
|  | 220     BedLineStatus bedStatus; | 
|  | 221 | 
|  | 222     _bedA->Open(); | 
|  | 223     // process each entry in A in search of the closest feature in B | 
|  | 224     while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | 
|  | 225         if (bedStatus == BED_VALID) { | 
|  | 226             FindWindowOverlaps(a, hits); | 
|  | 227             hits.clear(); | 
|  | 228             a = nullBed; | 
|  | 229         } | 
|  | 230     } | 
|  | 231     _bedA->Close(); | 
|  | 232 } | 
|  | 233 // END ClosestBed | 
|  | 234 |