0
|
1 /*****************************************************************************
|
|
2 closestBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "closestBed.h"
|
|
14
|
|
15 const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases.
|
|
16 // We don't want to keep looking if we
|
|
17 // can't find a nearby feature within 512 Mb.
|
|
18 const int SLOPGROWTH = 2048000;
|
|
19
|
|
20
|
|
21 /*
|
|
22 Constructor
|
|
23 */
|
|
24 BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
|
|
25 string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode,
|
|
26 bool ignoreOverlaps)
|
|
27 : _bedAFile(bedAFile)
|
|
28 , _bedBFile(bedBFile)
|
|
29 , _tieMode(tieMode)
|
|
30 , _sameStrand(sameStrand)
|
|
31 , _diffStrand(diffStrand)
|
|
32 , _reportDistance(reportDistance)
|
|
33 , _signDistance(signDistance)
|
|
34 , _strandedDistMode(_strandedDistMode)
|
|
35 , _ignoreOverlaps(ignoreOverlaps)
|
|
36 {
|
|
37 _bedA = new BedFile(_bedAFile);
|
|
38 _bedB = new BedFile(_bedBFile);
|
|
39 FindClosestBed();
|
|
40 }
|
|
41
|
|
42
|
|
43 /*
|
|
44 Destructor
|
|
45 */
|
|
46 BedClosest::~BedClosest(void) {
|
|
47 }
|
|
48
|
|
49
|
|
50 void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) {
|
|
51
|
|
52 int slop = 0; // start out just looking for overlaps
|
|
53 // within the current bin (~128Kb)
|
|
54
|
|
55 // update the current feature's start and end
|
|
56
|
|
57 CHRPOS aFudgeStart = 0;
|
|
58 CHRPOS aFudgeEnd;
|
|
59 int numOverlaps = 0;
|
|
60 vector<BED> closestB;
|
|
61 CHRPOS minDistance = INT_MAX;
|
|
62 int32_t curDistance = INT_MAX;
|
|
63 vector<int32_t> distances;
|
|
64
|
|
65 // is there at least one feature in B on the same chrom
|
|
66 // as the current A feature?
|
|
67 if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) {
|
|
68
|
|
69 while ((numOverlaps == 0) && (slop <= MAXSLOP)) {
|
|
70
|
|
71 // add some slop (starting at 0 bases) to a in hopes
|
|
72 // of finding a hit in B
|
|
73 if ((static_cast<int>(a.start) - slop) > 0)
|
|
74 aFudgeStart = a.start - slop;
|
|
75 else
|
|
76 aFudgeStart = 0;
|
|
77
|
|
78 if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP))
|
|
79 aFudgeEnd = a.end + slop;
|
|
80 else
|
|
81 aFudgeEnd = 2 * MAXSLOP;
|
|
82
|
|
83 // THE HEAVY LIFTING
|
|
84 // search for hits with the current slop added
|
|
85 _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand);
|
|
86
|
|
87 vector<BED>::const_iterator h = hits.begin();
|
|
88 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
89 for (; h != hitsEnd; ++h) {
|
|
90
|
|
91 // do the actual features overlap?
|
|
92 int s = max(a.start, h->start);
|
|
93 int e = min(a.end, h->end);
|
|
94 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
95
|
|
96 // make sure we allow overlapping features.
|
|
97 if ((overlapBases > 0) && (_ignoreOverlaps == true))
|
|
98 continue;
|
|
99 else
|
|
100 numOverlaps++;
|
|
101
|
|
102 // there is overlap. make sure we allow overlapping features ()
|
|
103 if (overlapBases > 0) {
|
|
104 closestB.push_back(*h);
|
|
105 distances.push_back(0);
|
|
106 }
|
|
107 // the hit is to the "left" of A
|
|
108 else if (h->end <= a.start) {
|
|
109 curDistance = a.start - h->end;
|
|
110 if (_signDistance) {
|
|
111 if ((_strandedDistMode == "ref")
|
|
112 || (_strandedDistMode == "a" && a.strand != "-")
|
|
113 || (_strandedDistMode == "b" && h->strand == "-")) {
|
|
114 curDistance = -curDistance;
|
|
115 }
|
|
116 }
|
|
117
|
|
118 if (abs(curDistance) < minDistance) {
|
|
119 minDistance = abs(curDistance);
|
|
120
|
|
121 closestB.clear();
|
|
122 closestB.push_back(*h);
|
|
123 distances.clear();
|
|
124 distances.push_back(curDistance);
|
|
125 }
|
|
126 else if (abs(curDistance) == minDistance) {
|
|
127 minDistance = abs(curDistance);
|
|
128 closestB.push_back(*h);
|
|
129 distances.push_back(curDistance);
|
|
130 }
|
|
131 }
|
|
132 // the hit is to the "right" of A
|
|
133 else if (h->start >= a.end) {
|
|
134 curDistance = h->start - a.end;
|
|
135 if (_signDistance) {
|
|
136 if ((_strandedDistMode == "a" && a.strand == "-")
|
|
137 || (_strandedDistMode == "b" && h->strand != "-")) {
|
|
138 curDistance = -curDistance;
|
|
139 }
|
|
140 }
|
|
141 if (abs(curDistance) < minDistance) {
|
|
142 minDistance = abs(curDistance);
|
|
143 closestB.clear();
|
|
144 closestB.push_back(*h);
|
|
145 distances.clear();
|
|
146 distances.push_back(curDistance);
|
|
147 }
|
|
148 else if (abs(curDistance) == minDistance) {
|
|
149 minDistance = abs(curDistance);
|
|
150 closestB.push_back(*h);
|
|
151 distances.push_back(curDistance);
|
|
152 }
|
|
153 }
|
|
154 }
|
|
155 // if no overlaps were found, we'll widen the range
|
|
156 // by SLOPGROWTH in each direction and search again.
|
|
157 slop += SLOPGROWTH;
|
|
158 }
|
|
159 }
|
|
160 // there is no feature in B on the same chromosome as A
|
|
161 else {
|
|
162 _bedA->reportBedTab(a);
|
|
163 if (_reportDistance == true) {
|
|
164 _bedB->reportNullBedTab();
|
|
165 cout << -1 << endl;
|
|
166 }
|
|
167 else
|
|
168 _bedB->reportNullBedNewLine();
|
|
169 }
|
|
170
|
|
171 // report the closest feature(s) in B to the current A feature.
|
|
172 // obey the user's reporting request (_tieMode)
|
|
173 if (numOverlaps > 0) {
|
|
174 if (closestB.size() == 1 || _tieMode == "first") {
|
|
175 _bedA->reportBedTab(a);
|
|
176 if (_reportDistance == true) {
|
|
177 _bedB->reportBedTab(closestB[0]);
|
|
178 cout << distances[0] << endl;
|
|
179 }
|
|
180 else
|
|
181 _bedB->reportBedNewLine(closestB[0]);
|
|
182 }
|
|
183 else {
|
|
184 if (_tieMode == "all") {
|
|
185 size_t i = 0;
|
|
186 for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) {
|
|
187 _bedA->reportBedTab(a);
|
|
188 if (_reportDistance == true) {
|
|
189 _bedB->reportBedTab(*b);
|
|
190 cout << distances[i++] <<endl;
|
|
191 }
|
|
192 else
|
|
193 _bedB->reportBedNewLine(*b);
|
|
194 }
|
|
195 }
|
|
196 else if (_tieMode == "last") {
|
|
197 _bedA->reportBedTab(a);
|
|
198 if (_reportDistance == true) {
|
|
199 _bedB->reportBedTab(closestB[closestB.size()-1]);
|
|
200 cout << distances[distances.size() - 1]<<endl;
|
|
201 }
|
|
202 else
|
|
203 _bedB->reportBedNewLine(closestB[closestB.size()-1]);
|
|
204 }
|
|
205 }
|
|
206 }
|
|
207 }
|
|
208
|
|
209
|
|
210 void BedClosest::FindClosestBed() {
|
|
211
|
|
212 // load the "B" bed file into a map so
|
|
213 // that we can easily compare "A" to it for overlaps
|
|
214 _bedB->loadBedFileIntoMap();
|
|
215
|
|
216 BED a, nullBed;
|
|
217 int lineNum = 0; // current input line number
|
|
218 vector<BED> hits; // vector of potential hits
|
|
219 hits.reserve(100);
|
|
220 BedLineStatus bedStatus;
|
|
221
|
|
222 _bedA->Open();
|
|
223 // process each entry in A in search of the closest feature in B
|
|
224 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
225 if (bedStatus == BED_VALID) {
|
|
226 FindWindowOverlaps(a, hits);
|
|
227 hits.clear();
|
|
228 a = nullBed;
|
|
229 }
|
|
230 }
|
|
231 _bedA->Close();
|
|
232 }
|
|
233 // END ClosestBed
|
|
234
|