Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 mergeBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "mergeBed.h" | |
14 | |
15 | |
16 | |
17 void BedMerge::ReportMergedNames(const vector<string> &names) { | |
18 if (names.size() > 0) { | |
19 printf("\t"); | |
20 vector<string>::const_iterator nameItr = names.begin(); | |
21 vector<string>::const_iterator nameEnd = names.end(); | |
22 for (; nameItr != nameEnd; ++nameItr) { | |
23 if (nameItr < (nameEnd - 1)) | |
24 cout << *nameItr << ";"; | |
25 else | |
26 cout << *nameItr; | |
27 } | |
28 } | |
29 else { | |
30 cerr << endl | |
31 << "*****" << endl | |
32 << "*****ERROR: No names found to report for the -names option. Exiting." << endl | |
33 << "*****" << endl; | |
34 exit(1); | |
35 } | |
36 } | |
37 | |
38 | |
39 void BedMerge::ReportMergedScores(const vector<string> &scores) { | |
40 if (scores.size() > 0) { | |
41 printf("\t"); | |
42 | |
43 // convert the scores to floats | |
44 vector<float> data; | |
45 for (size_t i = 0 ; i < scores.size() ; i++) { | |
46 data.push_back(atof(scores[i].c_str())); | |
47 } | |
48 | |
49 if (_scoreOp == "sum") { | |
50 printf("%.3f", accumulate(data.begin(), data.end(), 0.0)); | |
51 } | |
52 else if (_scoreOp == "min") { | |
53 printf("%.3f", *min_element( data.begin(), data.end() )); | |
54 } | |
55 else if (_scoreOp == "max") { | |
56 printf("%.3f", *max_element( data.begin(), data.end() )); | |
57 } | |
58 else if (_scoreOp == "mean") { | |
59 double total = accumulate(data.begin(), data.end(), 0.0); | |
60 double mean = total / data.size(); | |
61 printf("%.3f", mean); | |
62 } | |
63 else if (_scoreOp == "median") { | |
64 double median = 0.0; | |
65 sort(data.begin(), data.end()); | |
66 int totalLines = data.size(); | |
67 if ((totalLines % 2) > 0) { | |
68 long mid; | |
69 mid = totalLines / 2; | |
70 median = data[mid]; | |
71 } | |
72 else { | |
73 long midLow, midHigh; | |
74 midLow = (totalLines / 2) - 1; | |
75 midHigh = (totalLines / 2); | |
76 median = (data[midLow] + data[midHigh]) / 2.0; | |
77 } | |
78 printf("%.3f", median); | |
79 } | |
80 else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) { | |
81 // compute the frequency of each unique value | |
82 map<string, int> freqs; | |
83 vector<string>::const_iterator dIt = scores.begin(); | |
84 vector<string>::const_iterator dEnd = scores.end(); | |
85 for (; dIt != dEnd; ++dIt) { | |
86 freqs[*dIt]++; | |
87 } | |
88 | |
89 // grab the mode and the anti mode | |
90 string mode, antiMode; | |
91 int count = 0; | |
92 int minCount = INT_MAX; | |
93 for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { | |
94 if (iter->second > count) { | |
95 mode = iter->first; | |
96 count = iter->second; | |
97 } | |
98 if (iter->second < minCount) { | |
99 antiMode = iter->first; | |
100 minCount = iter->second; | |
101 } | |
102 } | |
103 // report | |
104 if (_scoreOp == "mode") { | |
105 printf("%s", mode.c_str()); | |
106 } | |
107 else if (_scoreOp == "antimode") { | |
108 printf("%s", antiMode.c_str()); | |
109 } | |
110 } | |
111 else if (_scoreOp == "collapse") { | |
112 vector<string>::const_iterator scoreItr = scores.begin(); | |
113 vector<string>::const_iterator scoreEnd = scores.end(); | |
114 for (; scoreItr != scoreEnd; ++scoreItr) { | |
115 if (scoreItr < (scoreEnd - 1)) | |
116 cout << *scoreItr << ";"; | |
117 else | |
118 cout << *scoreItr; | |
119 } | |
120 } | |
121 } | |
122 else { | |
123 cerr << endl | |
124 << "*****" << endl | |
125 << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl | |
126 << "*****" << endl; | |
127 exit(1); | |
128 } | |
129 } | |
130 | |
131 // =============== | |
132 // = Constructor = | |
133 // =============== | |
134 BedMerge::BedMerge(string &bedFile, | |
135 bool numEntries, | |
136 int maxDistance, | |
137 bool forceStrand, | |
138 bool reportNames, | |
139 bool reportScores, | |
140 const string &scoreOp) : | |
141 _bedFile(bedFile), | |
142 _numEntries(numEntries), | |
143 _forceStrand(forceStrand), | |
144 _reportNames(reportNames), | |
145 _reportScores(reportScores), | |
146 _scoreOp(scoreOp), | |
147 _maxDistance(maxDistance) | |
148 { | |
149 _bed = new BedFile(bedFile); | |
150 | |
151 if (_forceStrand == false) | |
152 MergeBed(); | |
153 else | |
154 MergeBedStranded(); | |
155 } | |
156 | |
157 | |
158 // ================= | |
159 // = Destructor = | |
160 // ================= | |
161 BedMerge::~BedMerge(void) { | |
162 } | |
163 | |
164 | |
165 // =============================================== | |
166 // Convenience method for reporting merged blocks | |
167 // ================================================ | |
168 void BedMerge::Report(string chrom, int start, int end, | |
169 const vector<string> &names, const vector<string> &scores, int mergeCount) | |
170 { | |
171 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type | |
172 //if (_bed->isZeroBased == false) {start++;} | |
173 | |
174 printf("%s\t%d\t%d", chrom.c_str(), start, end); | |
175 // just the merged intervals | |
176 if (_numEntries == false && _reportNames == false && _reportScores == false) { | |
177 printf("\n"); | |
178 } | |
179 // merged intervals and counts | |
180 else if (_numEntries == true && _reportNames == false && _reportScores == false) { | |
181 printf("\t%d\n", mergeCount); | |
182 } | |
183 // merged intervals and names | |
184 else if (_numEntries == false && _reportNames == true && _reportScores == false) { | |
185 ReportMergedNames(names); | |
186 printf("\n"); | |
187 } | |
188 // merged intervals and scores | |
189 else if (_numEntries == false && _reportNames == false && _reportScores == true) { | |
190 ReportMergedScores(scores); | |
191 printf("\n"); | |
192 } | |
193 // merged intervals, names, and scores | |
194 else if (_numEntries == false && _reportNames == true && _reportScores == true) { | |
195 ReportMergedNames(names); | |
196 ReportMergedScores(scores); | |
197 printf("\n"); | |
198 } | |
199 } | |
200 | |
201 | |
202 // ========================================================= | |
203 // Convenience method for reporting merged blocks by strand | |
204 // ========================================================= | |
205 void BedMerge::ReportStranded(string chrom, int start, int end, | |
206 const vector<string> &names, const vector<string> &scores, | |
207 int mergeCount, string strand) | |
208 { | |
209 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type | |
210 //if (_bed->isZeroBased == false) {start++;} | |
211 | |
212 printf("%s\t%d\t%d", chrom.c_str(), start, end); | |
213 // just the merged intervals | |
214 if (_numEntries == false && _reportNames == false && _reportScores == false) { | |
215 printf("\t%s\n", strand.c_str()); | |
216 } | |
217 // merged intervals and counts | |
218 else if (_numEntries == true && _reportNames == false && _reportScores == false) { | |
219 printf("\t%d\t%s\n", mergeCount, strand.c_str()); | |
220 } | |
221 // merged intervals and names | |
222 else if (_numEntries == false && _reportNames == true && _reportScores == false) { | |
223 ReportMergedNames(names); | |
224 printf("\t%s\n", strand.c_str()); | |
225 } | |
226 // merged intervals and scores | |
227 else if (_numEntries == false && _reportNames == false && _reportScores == true) { | |
228 ReportMergedScores(scores); | |
229 printf("\t%s\n", strand.c_str()); | |
230 } | |
231 // merged intervals, names, and scores | |
232 else if (_numEntries == false && _reportNames == true && _reportScores == true) { | |
233 ReportMergedNames(names); | |
234 ReportMergedScores(scores); | |
235 printf("\t%s\n", strand.c_str()); | |
236 } | |
237 } | |
238 | |
239 | |
240 // ===================================================== | |
241 // = Merge overlapping BED entries into a single entry = | |
242 // ===================================================== | |
243 void BedMerge::MergeBed() { | |
244 | |
245 // load the "B" bed file into a map so | |
246 // that we can easily compare "A" to it for overlaps | |
247 _bed->loadBedFileIntoMapNoBin(); | |
248 | |
249 // loop through each chromosome and merge their BED entries | |
250 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); | |
251 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); | |
252 for (; m != mEnd; ++m) { | |
253 | |
254 // bedList is already sorted by start position. | |
255 string chrom = m->first; | |
256 vector<BED> bedList = m->second; | |
257 int mergeCount = 1; | |
258 vector<string> names; | |
259 vector<string> scores; | |
260 | |
261 // merge overlapping features for this chromosome. | |
262 int start = -1; | |
263 int end = -1; | |
264 vector<BED>::const_iterator bedItr = bedList.begin(); | |
265 vector<BED>::const_iterator bedEnd = bedList.end(); | |
266 for (; bedItr != bedEnd; ++bedItr) { | |
267 // new block, no overlap | |
268 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { | |
269 if (start >= 0) { | |
270 Report(chrom, start, end, names, scores, mergeCount); | |
271 // reset | |
272 mergeCount = 1; | |
273 names.clear(); | |
274 scores.clear(); | |
275 } | |
276 start = bedItr->start; | |
277 end = bedItr->end; | |
278 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
279 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
280 } | |
281 // same block, overlaps | |
282 else { | |
283 if ((int) bedItr-> end > end) end = bedItr->end; | |
284 mergeCount++; | |
285 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
286 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
287 } | |
288 } | |
289 if (start >= 0) { | |
290 Report(chrom, start, end, names, scores, mergeCount); | |
291 } | |
292 } | |
293 } | |
294 | |
295 | |
296 // ================================================================================== | |
297 // = Merge overlapping BED entries into a single entry, accounting for strandedness = | |
298 // ================================================================================== | |
299 void BedMerge::MergeBedStranded() { | |
300 | |
301 // load the "B" bed file into a map so | |
302 // that we can easily compare "A" to it for overlaps | |
303 _bed->loadBedFileIntoMapNoBin(); | |
304 | |
305 // loop through each chromosome and merge their BED entries | |
306 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); | |
307 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); | |
308 for (; m != mEnd; ++m) { | |
309 | |
310 // bedList is already sorted by start position. | |
311 string chrom = m->first; | |
312 vector<BED> bedList = m->second; | |
313 | |
314 // make a list of the two strands to merge separately. | |
315 vector<string> strands(2); | |
316 strands[0] = "+"; | |
317 strands[1] = "-"; | |
318 | |
319 // do two passes, one for each strand. | |
320 for (unsigned int s = 0; s < strands.size(); s++) { | |
321 | |
322 int mergeCount = 1; | |
323 int numOnStrand = 0; | |
324 vector<string> names; | |
325 vector<string> scores; | |
326 | |
327 // merge overlapping features for this chromosome. | |
328 int start = -1; | |
329 int end = -1; | |
330 vector<BED>::const_iterator bedItr = bedList.begin(); | |
331 vector<BED>::const_iterator bedEnd = bedList.end(); | |
332 for (; bedItr != bedEnd; ++bedItr) { | |
333 | |
334 // if forcing strandedness, move on if the hit | |
335 // is not on the current strand. | |
336 if (bedItr->strand != strands[s]) { continue; } | |
337 else { numOnStrand++; } | |
338 | |
339 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { | |
340 if (start >= 0) { | |
341 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); | |
342 // reset | |
343 mergeCount = 1; | |
344 names.clear(); | |
345 scores.clear(); | |
346 } | |
347 start = bedItr->start; | |
348 end = bedItr->end; | |
349 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
350 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
351 } | |
352 else { | |
353 if ((int) bedItr-> end > end) end = bedItr->end; | |
354 mergeCount++; | |
355 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
356 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
357 } | |
358 } | |
359 if (start >= 0) { | |
360 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); | |
361 } | |
362 } | |
363 } | |
364 } |