Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 closestBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "closestBed.h" | |
14 | |
15 const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases. | |
16 // We don't want to keep looking if we | |
17 // can't find a nearby feature within 512 Mb. | |
18 const int SLOPGROWTH = 2048000; | |
19 | |
20 | |
21 /* | |
22 Constructor | |
23 */ | |
24 BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand, | |
25 string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode, | |
26 bool ignoreOverlaps) | |
27 : _bedAFile(bedAFile) | |
28 , _bedBFile(bedBFile) | |
29 , _tieMode(tieMode) | |
30 , _sameStrand(sameStrand) | |
31 , _diffStrand(diffStrand) | |
32 , _reportDistance(reportDistance) | |
33 , _signDistance(signDistance) | |
34 , _strandedDistMode(_strandedDistMode) | |
35 , _ignoreOverlaps(ignoreOverlaps) | |
36 { | |
37 _bedA = new BedFile(_bedAFile); | |
38 _bedB = new BedFile(_bedBFile); | |
39 FindClosestBed(); | |
40 } | |
41 | |
42 | |
43 /* | |
44 Destructor | |
45 */ | |
46 BedClosest::~BedClosest(void) { | |
47 } | |
48 | |
49 | |
50 void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) { | |
51 | |
52 int slop = 0; // start out just looking for overlaps | |
53 // within the current bin (~128Kb) | |
54 | |
55 // update the current feature's start and end | |
56 | |
57 CHRPOS aFudgeStart = 0; | |
58 CHRPOS aFudgeEnd; | |
59 int numOverlaps = 0; | |
60 vector<BED> closestB; | |
61 CHRPOS minDistance = INT_MAX; | |
62 int32_t curDistance = INT_MAX; | |
63 vector<int32_t> distances; | |
64 | |
65 // is there at least one feature in B on the same chrom | |
66 // as the current A feature? | |
67 if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) { | |
68 | |
69 while ((numOverlaps == 0) && (slop <= MAXSLOP)) { | |
70 | |
71 // add some slop (starting at 0 bases) to a in hopes | |
72 // of finding a hit in B | |
73 if ((static_cast<int>(a.start) - slop) > 0) | |
74 aFudgeStart = a.start - slop; | |
75 else | |
76 aFudgeStart = 0; | |
77 | |
78 if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP)) | |
79 aFudgeEnd = a.end + slop; | |
80 else | |
81 aFudgeEnd = 2 * MAXSLOP; | |
82 | |
83 // THE HEAVY LIFTING | |
84 // search for hits with the current slop added | |
85 _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand); | |
86 | |
87 vector<BED>::const_iterator h = hits.begin(); | |
88 vector<BED>::const_iterator hitsEnd = hits.end(); | |
89 for (; h != hitsEnd; ++h) { | |
90 | |
91 // do the actual features overlap? | |
92 int s = max(a.start, h->start); | |
93 int e = min(a.end, h->end); | |
94 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
95 | |
96 // make sure we allow overlapping features. | |
97 if ((overlapBases > 0) && (_ignoreOverlaps == true)) | |
98 continue; | |
99 else | |
100 numOverlaps++; | |
101 | |
102 // there is overlap. make sure we allow overlapping features () | |
103 if (overlapBases > 0) { | |
104 closestB.push_back(*h); | |
105 distances.push_back(0); | |
106 } | |
107 // the hit is to the "left" of A | |
108 else if (h->end <= a.start) { | |
109 curDistance = a.start - h->end; | |
110 if (_signDistance) { | |
111 if ((_strandedDistMode == "ref") | |
112 || (_strandedDistMode == "a" && a.strand != "-") | |
113 || (_strandedDistMode == "b" && h->strand == "-")) { | |
114 curDistance = -curDistance; | |
115 } | |
116 } | |
117 | |
118 if (abs(curDistance) < minDistance) { | |
119 minDistance = abs(curDistance); | |
120 | |
121 closestB.clear(); | |
122 closestB.push_back(*h); | |
123 distances.clear(); | |
124 distances.push_back(curDistance); | |
125 } | |
126 else if (abs(curDistance) == minDistance) { | |
127 minDistance = abs(curDistance); | |
128 closestB.push_back(*h); | |
129 distances.push_back(curDistance); | |
130 } | |
131 } | |
132 // the hit is to the "right" of A | |
133 else if (h->start >= a.end) { | |
134 curDistance = h->start - a.end; | |
135 if (_signDistance) { | |
136 if ((_strandedDistMode == "a" && a.strand == "-") | |
137 || (_strandedDistMode == "b" && h->strand != "-")) { | |
138 curDistance = -curDistance; | |
139 } | |
140 } | |
141 if (abs(curDistance) < minDistance) { | |
142 minDistance = abs(curDistance); | |
143 closestB.clear(); | |
144 closestB.push_back(*h); | |
145 distances.clear(); | |
146 distances.push_back(curDistance); | |
147 } | |
148 else if (abs(curDistance) == minDistance) { | |
149 minDistance = abs(curDistance); | |
150 closestB.push_back(*h); | |
151 distances.push_back(curDistance); | |
152 } | |
153 } | |
154 } | |
155 // if no overlaps were found, we'll widen the range | |
156 // by SLOPGROWTH in each direction and search again. | |
157 slop += SLOPGROWTH; | |
158 } | |
159 } | |
160 // there is no feature in B on the same chromosome as A | |
161 else { | |
162 _bedA->reportBedTab(a); | |
163 if (_reportDistance == true) { | |
164 _bedB->reportNullBedTab(); | |
165 cout << -1 << endl; | |
166 } | |
167 else | |
168 _bedB->reportNullBedNewLine(); | |
169 } | |
170 | |
171 // report the closest feature(s) in B to the current A feature. | |
172 // obey the user's reporting request (_tieMode) | |
173 if (numOverlaps > 0) { | |
174 if (closestB.size() == 1 || _tieMode == "first") { | |
175 _bedA->reportBedTab(a); | |
176 if (_reportDistance == true) { | |
177 _bedB->reportBedTab(closestB[0]); | |
178 cout << distances[0] << endl; | |
179 } | |
180 else | |
181 _bedB->reportBedNewLine(closestB[0]); | |
182 } | |
183 else { | |
184 if (_tieMode == "all") { | |
185 size_t i = 0; | |
186 for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) { | |
187 _bedA->reportBedTab(a); | |
188 if (_reportDistance == true) { | |
189 _bedB->reportBedTab(*b); | |
190 cout << distances[i++] <<endl; | |
191 } | |
192 else | |
193 _bedB->reportBedNewLine(*b); | |
194 } | |
195 } | |
196 else if (_tieMode == "last") { | |
197 _bedA->reportBedTab(a); | |
198 if (_reportDistance == true) { | |
199 _bedB->reportBedTab(closestB[closestB.size()-1]); | |
200 cout << distances[distances.size() - 1]<<endl; | |
201 } | |
202 else | |
203 _bedB->reportBedNewLine(closestB[closestB.size()-1]); | |
204 } | |
205 } | |
206 } | |
207 } | |
208 | |
209 | |
210 void BedClosest::FindClosestBed() { | |
211 | |
212 // load the "B" bed file into a map so | |
213 // that we can easily compare "A" to it for overlaps | |
214 _bedB->loadBedFileIntoMap(); | |
215 | |
216 BED a, nullBed; | |
217 int lineNum = 0; // current input line number | |
218 vector<BED> hits; // vector of potential hits | |
219 hits.reserve(100); | |
220 BedLineStatus bedStatus; | |
221 | |
222 _bedA->Open(); | |
223 // process each entry in A in search of the closest feature in B | |
224 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
225 if (bedStatus == BED_VALID) { | |
226 FindWindowOverlaps(a, hits); | |
227 hits.clear(); | |
228 a = nullBed; | |
229 } | |
230 } | |
231 _bedA->Close(); | |
232 } | |
233 // END ClosestBed | |
234 |