0
|
1 /*****************************************************************************
|
|
2 mergeBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "mergeBed.h"
|
|
14
|
|
15
|
|
16
|
|
17 void BedMerge::ReportMergedNames(const vector<string> &names) {
|
|
18 if (names.size() > 0) {
|
|
19 printf("\t");
|
|
20 vector<string>::const_iterator nameItr = names.begin();
|
|
21 vector<string>::const_iterator nameEnd = names.end();
|
|
22 for (; nameItr != nameEnd; ++nameItr) {
|
|
23 if (nameItr < (nameEnd - 1))
|
|
24 cout << *nameItr << ";";
|
|
25 else
|
|
26 cout << *nameItr;
|
|
27 }
|
|
28 }
|
|
29 else {
|
|
30 cerr << endl
|
|
31 << "*****" << endl
|
|
32 << "*****ERROR: No names found to report for the -names option. Exiting." << endl
|
|
33 << "*****" << endl;
|
|
34 exit(1);
|
|
35 }
|
|
36 }
|
|
37
|
|
38
|
|
39 void BedMerge::ReportMergedScores(const vector<string> &scores) {
|
|
40 if (scores.size() > 0) {
|
|
41 printf("\t");
|
|
42
|
|
43 // convert the scores to floats
|
|
44 vector<float> data;
|
|
45 for (size_t i = 0 ; i < scores.size() ; i++) {
|
|
46 data.push_back(atof(scores[i].c_str()));
|
|
47 }
|
|
48
|
|
49 if (_scoreOp == "sum") {
|
|
50 printf("%.3f", accumulate(data.begin(), data.end(), 0.0));
|
|
51 }
|
|
52 else if (_scoreOp == "min") {
|
|
53 printf("%.3f", *min_element( data.begin(), data.end() ));
|
|
54 }
|
|
55 else if (_scoreOp == "max") {
|
|
56 printf("%.3f", *max_element( data.begin(), data.end() ));
|
|
57 }
|
|
58 else if (_scoreOp == "mean") {
|
|
59 double total = accumulate(data.begin(), data.end(), 0.0);
|
|
60 double mean = total / data.size();
|
|
61 printf("%.3f", mean);
|
|
62 }
|
|
63 else if (_scoreOp == "median") {
|
|
64 double median = 0.0;
|
|
65 sort(data.begin(), data.end());
|
|
66 int totalLines = data.size();
|
|
67 if ((totalLines % 2) > 0) {
|
|
68 long mid;
|
|
69 mid = totalLines / 2;
|
|
70 median = data[mid];
|
|
71 }
|
|
72 else {
|
|
73 long midLow, midHigh;
|
|
74 midLow = (totalLines / 2) - 1;
|
|
75 midHigh = (totalLines / 2);
|
|
76 median = (data[midLow] + data[midHigh]) / 2.0;
|
|
77 }
|
|
78 printf("%.3f", median);
|
|
79 }
|
|
80 else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) {
|
|
81 // compute the frequency of each unique value
|
|
82 map<string, int> freqs;
|
|
83 vector<string>::const_iterator dIt = scores.begin();
|
|
84 vector<string>::const_iterator dEnd = scores.end();
|
|
85 for (; dIt != dEnd; ++dIt) {
|
|
86 freqs[*dIt]++;
|
|
87 }
|
|
88
|
|
89 // grab the mode and the anti mode
|
|
90 string mode, antiMode;
|
|
91 int count = 0;
|
|
92 int minCount = INT_MAX;
|
|
93 for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
|
|
94 if (iter->second > count) {
|
|
95 mode = iter->first;
|
|
96 count = iter->second;
|
|
97 }
|
|
98 if (iter->second < minCount) {
|
|
99 antiMode = iter->first;
|
|
100 minCount = iter->second;
|
|
101 }
|
|
102 }
|
|
103 // report
|
|
104 if (_scoreOp == "mode") {
|
|
105 printf("%s", mode.c_str());
|
|
106 }
|
|
107 else if (_scoreOp == "antimode") {
|
|
108 printf("%s", antiMode.c_str());
|
|
109 }
|
|
110 }
|
|
111 else if (_scoreOp == "collapse") {
|
|
112 vector<string>::const_iterator scoreItr = scores.begin();
|
|
113 vector<string>::const_iterator scoreEnd = scores.end();
|
|
114 for (; scoreItr != scoreEnd; ++scoreItr) {
|
|
115 if (scoreItr < (scoreEnd - 1))
|
|
116 cout << *scoreItr << ";";
|
|
117 else
|
|
118 cout << *scoreItr;
|
|
119 }
|
|
120 }
|
|
121 }
|
|
122 else {
|
|
123 cerr << endl
|
|
124 << "*****" << endl
|
|
125 << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl
|
|
126 << "*****" << endl;
|
|
127 exit(1);
|
|
128 }
|
|
129 }
|
|
130
|
|
131 // ===============
|
|
132 // = Constructor =
|
|
133 // ===============
|
|
134 BedMerge::BedMerge(string &bedFile,
|
|
135 bool numEntries,
|
|
136 int maxDistance,
|
|
137 bool forceStrand,
|
|
138 bool reportNames,
|
|
139 bool reportScores,
|
|
140 const string &scoreOp) :
|
|
141 _bedFile(bedFile),
|
|
142 _numEntries(numEntries),
|
|
143 _forceStrand(forceStrand),
|
|
144 _reportNames(reportNames),
|
|
145 _reportScores(reportScores),
|
|
146 _scoreOp(scoreOp),
|
|
147 _maxDistance(maxDistance)
|
|
148 {
|
|
149 _bed = new BedFile(bedFile);
|
|
150
|
|
151 if (_forceStrand == false)
|
|
152 MergeBed();
|
|
153 else
|
|
154 MergeBedStranded();
|
|
155 }
|
|
156
|
|
157
|
|
158 // =================
|
|
159 // = Destructor =
|
|
160 // =================
|
|
161 BedMerge::~BedMerge(void) {
|
|
162 }
|
|
163
|
|
164
|
|
165 // ===============================================
|
|
166 // Convenience method for reporting merged blocks
|
|
167 // ================================================
|
|
168 void BedMerge::Report(string chrom, int start, int end,
|
|
169 const vector<string> &names, const vector<string> &scores, int mergeCount)
|
|
170 {
|
|
171 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
|
|
172 //if (_bed->isZeroBased == false) {start++;}
|
|
173
|
|
174 printf("%s\t%d\t%d", chrom.c_str(), start, end);
|
|
175 // just the merged intervals
|
|
176 if (_numEntries == false && _reportNames == false && _reportScores == false) {
|
|
177 printf("\n");
|
|
178 }
|
|
179 // merged intervals and counts
|
|
180 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
|
|
181 printf("\t%d\n", mergeCount);
|
|
182 }
|
|
183 // merged intervals and names
|
|
184 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
|
|
185 ReportMergedNames(names);
|
|
186 printf("\n");
|
|
187 }
|
|
188 // merged intervals and scores
|
|
189 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
|
|
190 ReportMergedScores(scores);
|
|
191 printf("\n");
|
|
192 }
|
|
193 // merged intervals, names, and scores
|
|
194 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
|
|
195 ReportMergedNames(names);
|
|
196 ReportMergedScores(scores);
|
|
197 printf("\n");
|
|
198 }
|
|
199 }
|
|
200
|
|
201
|
|
202 // =========================================================
|
|
203 // Convenience method for reporting merged blocks by strand
|
|
204 // =========================================================
|
|
205 void BedMerge::ReportStranded(string chrom, int start, int end,
|
|
206 const vector<string> &names, const vector<string> &scores,
|
|
207 int mergeCount, string strand)
|
|
208 {
|
|
209 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
|
|
210 //if (_bed->isZeroBased == false) {start++;}
|
|
211
|
|
212 printf("%s\t%d\t%d", chrom.c_str(), start, end);
|
|
213 // just the merged intervals
|
|
214 if (_numEntries == false && _reportNames == false && _reportScores == false) {
|
|
215 printf("\t%s\n", strand.c_str());
|
|
216 }
|
|
217 // merged intervals and counts
|
|
218 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
|
|
219 printf("\t%d\t%s\n", mergeCount, strand.c_str());
|
|
220 }
|
|
221 // merged intervals and names
|
|
222 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
|
|
223 ReportMergedNames(names);
|
|
224 printf("\t%s\n", strand.c_str());
|
|
225 }
|
|
226 // merged intervals and scores
|
|
227 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
|
|
228 ReportMergedScores(scores);
|
|
229 printf("\t%s\n", strand.c_str());
|
|
230 }
|
|
231 // merged intervals, names, and scores
|
|
232 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
|
|
233 ReportMergedNames(names);
|
|
234 ReportMergedScores(scores);
|
|
235 printf("\t%s\n", strand.c_str());
|
|
236 }
|
|
237 }
|
|
238
|
|
239
|
|
240 // =====================================================
|
|
241 // = Merge overlapping BED entries into a single entry =
|
|
242 // =====================================================
|
|
243 void BedMerge::MergeBed() {
|
|
244
|
|
245 // load the "B" bed file into a map so
|
|
246 // that we can easily compare "A" to it for overlaps
|
|
247 _bed->loadBedFileIntoMapNoBin();
|
|
248
|
|
249 // loop through each chromosome and merge their BED entries
|
|
250 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
|
|
251 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
|
|
252 for (; m != mEnd; ++m) {
|
|
253
|
|
254 // bedList is already sorted by start position.
|
|
255 string chrom = m->first;
|
|
256 vector<BED> bedList = m->second;
|
|
257 int mergeCount = 1;
|
|
258 vector<string> names;
|
|
259 vector<string> scores;
|
|
260
|
|
261 // merge overlapping features for this chromosome.
|
|
262 int start = -1;
|
|
263 int end = -1;
|
|
264 vector<BED>::const_iterator bedItr = bedList.begin();
|
|
265 vector<BED>::const_iterator bedEnd = bedList.end();
|
|
266 for (; bedItr != bedEnd; ++bedItr) {
|
|
267 // new block, no overlap
|
|
268 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
|
|
269 if (start >= 0) {
|
|
270 Report(chrom, start, end, names, scores, mergeCount);
|
|
271 // reset
|
|
272 mergeCount = 1;
|
|
273 names.clear();
|
|
274 scores.clear();
|
|
275 }
|
|
276 start = bedItr->start;
|
|
277 end = bedItr->end;
|
|
278 if (!bedItr->name.empty()) names.push_back(bedItr->name);
|
|
279 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
|
|
280 }
|
|
281 // same block, overlaps
|
|
282 else {
|
|
283 if ((int) bedItr-> end > end) end = bedItr->end;
|
|
284 mergeCount++;
|
|
285 if (!bedItr->name.empty()) names.push_back(bedItr->name);
|
|
286 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
|
|
287 }
|
|
288 }
|
|
289 if (start >= 0) {
|
|
290 Report(chrom, start, end, names, scores, mergeCount);
|
|
291 }
|
|
292 }
|
|
293 }
|
|
294
|
|
295
|
|
296 // ==================================================================================
|
|
297 // = Merge overlapping BED entries into a single entry, accounting for strandedness =
|
|
298 // ==================================================================================
|
|
299 void BedMerge::MergeBedStranded() {
|
|
300
|
|
301 // load the "B" bed file into a map so
|
|
302 // that we can easily compare "A" to it for overlaps
|
|
303 _bed->loadBedFileIntoMapNoBin();
|
|
304
|
|
305 // loop through each chromosome and merge their BED entries
|
|
306 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
|
|
307 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
|
|
308 for (; m != mEnd; ++m) {
|
|
309
|
|
310 // bedList is already sorted by start position.
|
|
311 string chrom = m->first;
|
|
312 vector<BED> bedList = m->second;
|
|
313
|
|
314 // make a list of the two strands to merge separately.
|
|
315 vector<string> strands(2);
|
|
316 strands[0] = "+";
|
|
317 strands[1] = "-";
|
|
318
|
|
319 // do two passes, one for each strand.
|
|
320 for (unsigned int s = 0; s < strands.size(); s++) {
|
|
321
|
|
322 int mergeCount = 1;
|
|
323 int numOnStrand = 0;
|
|
324 vector<string> names;
|
|
325 vector<string> scores;
|
|
326
|
|
327 // merge overlapping features for this chromosome.
|
|
328 int start = -1;
|
|
329 int end = -1;
|
|
330 vector<BED>::const_iterator bedItr = bedList.begin();
|
|
331 vector<BED>::const_iterator bedEnd = bedList.end();
|
|
332 for (; bedItr != bedEnd; ++bedItr) {
|
|
333
|
|
334 // if forcing strandedness, move on if the hit
|
|
335 // is not on the current strand.
|
|
336 if (bedItr->strand != strands[s]) { continue; }
|
|
337 else { numOnStrand++; }
|
|
338
|
|
339 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
|
|
340 if (start >= 0) {
|
|
341 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
|
|
342 // reset
|
|
343 mergeCount = 1;
|
|
344 names.clear();
|
|
345 scores.clear();
|
|
346 }
|
|
347 start = bedItr->start;
|
|
348 end = bedItr->end;
|
|
349 if (!bedItr->name.empty()) names.push_back(bedItr->name);
|
|
350 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
|
|
351 }
|
|
352 else {
|
|
353 if ((int) bedItr-> end > end) end = bedItr->end;
|
|
354 mergeCount++;
|
|
355 if (!bedItr->name.empty()) names.push_back(bedItr->name);
|
|
356 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
|
|
357 }
|
|
358 }
|
|
359 if (start >= 0) {
|
|
360 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
|
|
361 }
|
|
362 }
|
|
363 }
|
|
364 }
|