comparison BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 mergeBed.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "mergeBed.h"
14
15
16
17 void BedMerge::ReportMergedNames(const vector<string> &names) {
18 if (names.size() > 0) {
19 printf("\t");
20 vector<string>::const_iterator nameItr = names.begin();
21 vector<string>::const_iterator nameEnd = names.end();
22 for (; nameItr != nameEnd; ++nameItr) {
23 if (nameItr < (nameEnd - 1))
24 cout << *nameItr << ";";
25 else
26 cout << *nameItr;
27 }
28 }
29 else {
30 cerr << endl
31 << "*****" << endl
32 << "*****ERROR: No names found to report for the -names option. Exiting." << endl
33 << "*****" << endl;
34 exit(1);
35 }
36 }
37
38
39 void BedMerge::ReportMergedScores(const vector<string> &scores) {
40 if (scores.size() > 0) {
41 printf("\t");
42
43 // convert the scores to floats
44 vector<float> data;
45 for (size_t i = 0 ; i < scores.size() ; i++) {
46 data.push_back(atof(scores[i].c_str()));
47 }
48
49 if (_scoreOp == "sum") {
50 printf("%.3f", accumulate(data.begin(), data.end(), 0.0));
51 }
52 else if (_scoreOp == "min") {
53 printf("%.3f", *min_element( data.begin(), data.end() ));
54 }
55 else if (_scoreOp == "max") {
56 printf("%.3f", *max_element( data.begin(), data.end() ));
57 }
58 else if (_scoreOp == "mean") {
59 double total = accumulate(data.begin(), data.end(), 0.0);
60 double mean = total / data.size();
61 printf("%.3f", mean);
62 }
63 else if (_scoreOp == "median") {
64 double median = 0.0;
65 sort(data.begin(), data.end());
66 int totalLines = data.size();
67 if ((totalLines % 2) > 0) {
68 long mid;
69 mid = totalLines / 2;
70 median = data[mid];
71 }
72 else {
73 long midLow, midHigh;
74 midLow = (totalLines / 2) - 1;
75 midHigh = (totalLines / 2);
76 median = (data[midLow] + data[midHigh]) / 2.0;
77 }
78 printf("%.3f", median);
79 }
80 else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) {
81 // compute the frequency of each unique value
82 map<string, int> freqs;
83 vector<string>::const_iterator dIt = scores.begin();
84 vector<string>::const_iterator dEnd = scores.end();
85 for (; dIt != dEnd; ++dIt) {
86 freqs[*dIt]++;
87 }
88
89 // grab the mode and the anti mode
90 string mode, antiMode;
91 int count = 0;
92 int minCount = INT_MAX;
93 for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
94 if (iter->second > count) {
95 mode = iter->first;
96 count = iter->second;
97 }
98 if (iter->second < minCount) {
99 antiMode = iter->first;
100 minCount = iter->second;
101 }
102 }
103 // report
104 if (_scoreOp == "mode") {
105 printf("%s", mode.c_str());
106 }
107 else if (_scoreOp == "antimode") {
108 printf("%s", antiMode.c_str());
109 }
110 }
111 else if (_scoreOp == "collapse") {
112 vector<string>::const_iterator scoreItr = scores.begin();
113 vector<string>::const_iterator scoreEnd = scores.end();
114 for (; scoreItr != scoreEnd; ++scoreItr) {
115 if (scoreItr < (scoreEnd - 1))
116 cout << *scoreItr << ";";
117 else
118 cout << *scoreItr;
119 }
120 }
121 }
122 else {
123 cerr << endl
124 << "*****" << endl
125 << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl
126 << "*****" << endl;
127 exit(1);
128 }
129 }
130
131 // ===============
132 // = Constructor =
133 // ===============
134 BedMerge::BedMerge(string &bedFile,
135 bool numEntries,
136 int maxDistance,
137 bool forceStrand,
138 bool reportNames,
139 bool reportScores,
140 const string &scoreOp) :
141 _bedFile(bedFile),
142 _numEntries(numEntries),
143 _forceStrand(forceStrand),
144 _reportNames(reportNames),
145 _reportScores(reportScores),
146 _scoreOp(scoreOp),
147 _maxDistance(maxDistance)
148 {
149 _bed = new BedFile(bedFile);
150
151 if (_forceStrand == false)
152 MergeBed();
153 else
154 MergeBedStranded();
155 }
156
157
158 // =================
159 // = Destructor =
160 // =================
161 BedMerge::~BedMerge(void) {
162 }
163
164
165 // ===============================================
166 // Convenience method for reporting merged blocks
167 // ================================================
168 void BedMerge::Report(string chrom, int start, int end,
169 const vector<string> &names, const vector<string> &scores, int mergeCount)
170 {
171 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
172 //if (_bed->isZeroBased == false) {start++;}
173
174 printf("%s\t%d\t%d", chrom.c_str(), start, end);
175 // just the merged intervals
176 if (_numEntries == false && _reportNames == false && _reportScores == false) {
177 printf("\n");
178 }
179 // merged intervals and counts
180 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
181 printf("\t%d\n", mergeCount);
182 }
183 // merged intervals and names
184 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
185 ReportMergedNames(names);
186 printf("\n");
187 }
188 // merged intervals and scores
189 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
190 ReportMergedScores(scores);
191 printf("\n");
192 }
193 // merged intervals, names, and scores
194 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
195 ReportMergedNames(names);
196 ReportMergedScores(scores);
197 printf("\n");
198 }
199 }
200
201
202 // =========================================================
203 // Convenience method for reporting merged blocks by strand
204 // =========================================================
205 void BedMerge::ReportStranded(string chrom, int start, int end,
206 const vector<string> &names, const vector<string> &scores,
207 int mergeCount, string strand)
208 {
209 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
210 //if (_bed->isZeroBased == false) {start++;}
211
212 printf("%s\t%d\t%d", chrom.c_str(), start, end);
213 // just the merged intervals
214 if (_numEntries == false && _reportNames == false && _reportScores == false) {
215 printf("\t%s\n", strand.c_str());
216 }
217 // merged intervals and counts
218 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
219 printf("\t%d\t%s\n", mergeCount, strand.c_str());
220 }
221 // merged intervals and names
222 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
223 ReportMergedNames(names);
224 printf("\t%s\n", strand.c_str());
225 }
226 // merged intervals and scores
227 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
228 ReportMergedScores(scores);
229 printf("\t%s\n", strand.c_str());
230 }
231 // merged intervals, names, and scores
232 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
233 ReportMergedNames(names);
234 ReportMergedScores(scores);
235 printf("\t%s\n", strand.c_str());
236 }
237 }
238
239
240 // =====================================================
241 // = Merge overlapping BED entries into a single entry =
242 // =====================================================
243 void BedMerge::MergeBed() {
244
245 // load the "B" bed file into a map so
246 // that we can easily compare "A" to it for overlaps
247 _bed->loadBedFileIntoMapNoBin();
248
249 // loop through each chromosome and merge their BED entries
250 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
251 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
252 for (; m != mEnd; ++m) {
253
254 // bedList is already sorted by start position.
255 string chrom = m->first;
256 vector<BED> bedList = m->second;
257 int mergeCount = 1;
258 vector<string> names;
259 vector<string> scores;
260
261 // merge overlapping features for this chromosome.
262 int start = -1;
263 int end = -1;
264 vector<BED>::const_iterator bedItr = bedList.begin();
265 vector<BED>::const_iterator bedEnd = bedList.end();
266 for (; bedItr != bedEnd; ++bedItr) {
267 // new block, no overlap
268 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
269 if (start >= 0) {
270 Report(chrom, start, end, names, scores, mergeCount);
271 // reset
272 mergeCount = 1;
273 names.clear();
274 scores.clear();
275 }
276 start = bedItr->start;
277 end = bedItr->end;
278 if (!bedItr->name.empty()) names.push_back(bedItr->name);
279 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
280 }
281 // same block, overlaps
282 else {
283 if ((int) bedItr-> end > end) end = bedItr->end;
284 mergeCount++;
285 if (!bedItr->name.empty()) names.push_back(bedItr->name);
286 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
287 }
288 }
289 if (start >= 0) {
290 Report(chrom, start, end, names, scores, mergeCount);
291 }
292 }
293 }
294
295
296 // ==================================================================================
297 // = Merge overlapping BED entries into a single entry, accounting for strandedness =
298 // ==================================================================================
299 void BedMerge::MergeBedStranded() {
300
301 // load the "B" bed file into a map so
302 // that we can easily compare "A" to it for overlaps
303 _bed->loadBedFileIntoMapNoBin();
304
305 // loop through each chromosome and merge their BED entries
306 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
307 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
308 for (; m != mEnd; ++m) {
309
310 // bedList is already sorted by start position.
311 string chrom = m->first;
312 vector<BED> bedList = m->second;
313
314 // make a list of the two strands to merge separately.
315 vector<string> strands(2);
316 strands[0] = "+";
317 strands[1] = "-";
318
319 // do two passes, one for each strand.
320 for (unsigned int s = 0; s < strands.size(); s++) {
321
322 int mergeCount = 1;
323 int numOnStrand = 0;
324 vector<string> names;
325 vector<string> scores;
326
327 // merge overlapping features for this chromosome.
328 int start = -1;
329 int end = -1;
330 vector<BED>::const_iterator bedItr = bedList.begin();
331 vector<BED>::const_iterator bedEnd = bedList.end();
332 for (; bedItr != bedEnd; ++bedItr) {
333
334 // if forcing strandedness, move on if the hit
335 // is not on the current strand.
336 if (bedItr->strand != strands[s]) { continue; }
337 else { numOnStrand++; }
338
339 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
340 if (start >= 0) {
341 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
342 // reset
343 mergeCount = 1;
344 names.clear();
345 scores.clear();
346 }
347 start = bedItr->start;
348 end = bedItr->end;
349 if (!bedItr->name.empty()) names.push_back(bedItr->name);
350 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
351 }
352 else {
353 if ((int) bedItr-> end > end) end = bedItr->end;
354 mergeCount++;
355 if (!bedItr->name.empty()) names.push_back(bedItr->name);
356 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
357 }
358 }
359 if (start >= 0) {
360 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
361 }
362 }
363 }
364 }