comparison BEDTools-Version-2.14.3/src/closestBed/closestBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 closestBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "closestBed.h"
14
15 const int MAXSLOP = 256000000; // 2*MAXSLOP = 512 megabases.
16 // We don't want to keep looking if we
17 // can't find a nearby feature within 512 Mb.
18 const int SLOPGROWTH = 2048000;
19
20
21 /*
22 Constructor
23 */
24 BedClosest::BedClosest(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
25 string &tieMode, bool reportDistance, bool signDistance, string &_strandedDistMode,
26 bool ignoreOverlaps)
27 : _bedAFile(bedAFile)
28 , _bedBFile(bedBFile)
29 , _tieMode(tieMode)
30 , _sameStrand(sameStrand)
31 , _diffStrand(diffStrand)
32 , _reportDistance(reportDistance)
33 , _signDistance(signDistance)
34 , _strandedDistMode(_strandedDistMode)
35 , _ignoreOverlaps(ignoreOverlaps)
36 {
37 _bedA = new BedFile(_bedAFile);
38 _bedB = new BedFile(_bedBFile);
39 FindClosestBed();
40 }
41
42
43 /*
44 Destructor
45 */
46 BedClosest::~BedClosest(void) {
47 }
48
49
50 void BedClosest::FindWindowOverlaps(BED &a, vector<BED> &hits) {
51
52 int slop = 0; // start out just looking for overlaps
53 // within the current bin (~128Kb)
54
55 // update the current feature's start and end
56
57 CHRPOS aFudgeStart = 0;
58 CHRPOS aFudgeEnd;
59 int numOverlaps = 0;
60 vector<BED> closestB;
61 CHRPOS minDistance = INT_MAX;
62 int32_t curDistance = INT_MAX;
63 vector<int32_t> distances;
64
65 // is there at least one feature in B on the same chrom
66 // as the current A feature?
67 if(_bedB->bedMap.find(a.chrom) != _bedB->bedMap.end()) {
68
69 while ((numOverlaps == 0) && (slop <= MAXSLOP)) {
70
71 // add some slop (starting at 0 bases) to a in hopes
72 // of finding a hit in B
73 if ((static_cast<int>(a.start) - slop) > 0)
74 aFudgeStart = a.start - slop;
75 else
76 aFudgeStart = 0;
77
78 if ((static_cast<int>(a.start) + slop) < (2 * MAXSLOP))
79 aFudgeEnd = a.end + slop;
80 else
81 aFudgeEnd = 2 * MAXSLOP;
82
83 // THE HEAVY LIFTING
84 // search for hits with the current slop added
85 _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _sameStrand, _diffStrand);
86
87 vector<BED>::const_iterator h = hits.begin();
88 vector<BED>::const_iterator hitsEnd = hits.end();
89 for (; h != hitsEnd; ++h) {
90
91 // do the actual features overlap?
92 int s = max(a.start, h->start);
93 int e = min(a.end, h->end);
94 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
95
96 // make sure we allow overlapping features.
97 if ((overlapBases > 0) && (_ignoreOverlaps == true))
98 continue;
99 else
100 numOverlaps++;
101
102 // there is overlap. make sure we allow overlapping features ()
103 if (overlapBases > 0) {
104 closestB.push_back(*h);
105 distances.push_back(0);
106 }
107 // the hit is to the "left" of A
108 else if (h->end <= a.start) {
109 curDistance = a.start - h->end;
110 if (_signDistance) {
111 if ((_strandedDistMode == "ref")
112 || (_strandedDistMode == "a" && a.strand != "-")
113 || (_strandedDistMode == "b" && h->strand == "-")) {
114 curDistance = -curDistance;
115 }
116 }
117
118 if (abs(curDistance) < minDistance) {
119 minDistance = abs(curDistance);
120
121 closestB.clear();
122 closestB.push_back(*h);
123 distances.clear();
124 distances.push_back(curDistance);
125 }
126 else if (abs(curDistance) == minDistance) {
127 minDistance = abs(curDistance);
128 closestB.push_back(*h);
129 distances.push_back(curDistance);
130 }
131 }
132 // the hit is to the "right" of A
133 else if (h->start >= a.end) {
134 curDistance = h->start - a.end;
135 if (_signDistance) {
136 if ((_strandedDistMode == "a" && a.strand == "-")
137 || (_strandedDistMode == "b" && h->strand != "-")) {
138 curDistance = -curDistance;
139 }
140 }
141 if (abs(curDistance) < minDistance) {
142 minDistance = abs(curDistance);
143 closestB.clear();
144 closestB.push_back(*h);
145 distances.clear();
146 distances.push_back(curDistance);
147 }
148 else if (abs(curDistance) == minDistance) {
149 minDistance = abs(curDistance);
150 closestB.push_back(*h);
151 distances.push_back(curDistance);
152 }
153 }
154 }
155 // if no overlaps were found, we'll widen the range
156 // by SLOPGROWTH in each direction and search again.
157 slop += SLOPGROWTH;
158 }
159 }
160 // there is no feature in B on the same chromosome as A
161 else {
162 _bedA->reportBedTab(a);
163 if (_reportDistance == true) {
164 _bedB->reportNullBedTab();
165 cout << -1 << endl;
166 }
167 else
168 _bedB->reportNullBedNewLine();
169 }
170
171 // report the closest feature(s) in B to the current A feature.
172 // obey the user's reporting request (_tieMode)
173 if (numOverlaps > 0) {
174 if (closestB.size() == 1 || _tieMode == "first") {
175 _bedA->reportBedTab(a);
176 if (_reportDistance == true) {
177 _bedB->reportBedTab(closestB[0]);
178 cout << distances[0] << endl;
179 }
180 else
181 _bedB->reportBedNewLine(closestB[0]);
182 }
183 else {
184 if (_tieMode == "all") {
185 size_t i = 0;
186 for (vector<BED>::iterator b = closestB.begin(); b != closestB.end(); ++b) {
187 _bedA->reportBedTab(a);
188 if (_reportDistance == true) {
189 _bedB->reportBedTab(*b);
190 cout << distances[i++] <<endl;
191 }
192 else
193 _bedB->reportBedNewLine(*b);
194 }
195 }
196 else if (_tieMode == "last") {
197 _bedA->reportBedTab(a);
198 if (_reportDistance == true) {
199 _bedB->reportBedTab(closestB[closestB.size()-1]);
200 cout << distances[distances.size() - 1]<<endl;
201 }
202 else
203 _bedB->reportBedNewLine(closestB[closestB.size()-1]);
204 }
205 }
206 }
207 }
208
209
210 void BedClosest::FindClosestBed() {
211
212 // load the "B" bed file into a map so
213 // that we can easily compare "A" to it for overlaps
214 _bedB->loadBedFileIntoMap();
215
216 BED a, nullBed;
217 int lineNum = 0; // current input line number
218 vector<BED> hits; // vector of potential hits
219 hits.reserve(100);
220 BedLineStatus bedStatus;
221
222 _bedA->Open();
223 // process each entry in A in search of the closest feature in B
224 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
225 if (bedStatus == BED_VALID) {
226 FindWindowOverlaps(a, hits);
227 hits.clear();
228 a = nullBed;
229 }
230 }
231 _bedA->Close();
232 }
233 // END ClosestBed
234