Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 mergeBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "mergeBed.h" | |
| 14 | |
| 15 | |
| 16 | |
| 17 void BedMerge::ReportMergedNames(const vector<string> &names) { | |
| 18 if (names.size() > 0) { | |
| 19 printf("\t"); | |
| 20 vector<string>::const_iterator nameItr = names.begin(); | |
| 21 vector<string>::const_iterator nameEnd = names.end(); | |
| 22 for (; nameItr != nameEnd; ++nameItr) { | |
| 23 if (nameItr < (nameEnd - 1)) | |
| 24 cout << *nameItr << ";"; | |
| 25 else | |
| 26 cout << *nameItr; | |
| 27 } | |
| 28 } | |
| 29 else { | |
| 30 cerr << endl | |
| 31 << "*****" << endl | |
| 32 << "*****ERROR: No names found to report for the -names option. Exiting." << endl | |
| 33 << "*****" << endl; | |
| 34 exit(1); | |
| 35 } | |
| 36 } | |
| 37 | |
| 38 | |
| 39 void BedMerge::ReportMergedScores(const vector<string> &scores) { | |
| 40 if (scores.size() > 0) { | |
| 41 printf("\t"); | |
| 42 | |
| 43 // convert the scores to floats | |
| 44 vector<float> data; | |
| 45 for (size_t i = 0 ; i < scores.size() ; i++) { | |
| 46 data.push_back(atof(scores[i].c_str())); | |
| 47 } | |
| 48 | |
| 49 if (_scoreOp == "sum") { | |
| 50 printf("%.3f", accumulate(data.begin(), data.end(), 0.0)); | |
| 51 } | |
| 52 else if (_scoreOp == "min") { | |
| 53 printf("%.3f", *min_element( data.begin(), data.end() )); | |
| 54 } | |
| 55 else if (_scoreOp == "max") { | |
| 56 printf("%.3f", *max_element( data.begin(), data.end() )); | |
| 57 } | |
| 58 else if (_scoreOp == "mean") { | |
| 59 double total = accumulate(data.begin(), data.end(), 0.0); | |
| 60 double mean = total / data.size(); | |
| 61 printf("%.3f", mean); | |
| 62 } | |
| 63 else if (_scoreOp == "median") { | |
| 64 double median = 0.0; | |
| 65 sort(data.begin(), data.end()); | |
| 66 int totalLines = data.size(); | |
| 67 if ((totalLines % 2) > 0) { | |
| 68 long mid; | |
| 69 mid = totalLines / 2; | |
| 70 median = data[mid]; | |
| 71 } | |
| 72 else { | |
| 73 long midLow, midHigh; | |
| 74 midLow = (totalLines / 2) - 1; | |
| 75 midHigh = (totalLines / 2); | |
| 76 median = (data[midLow] + data[midHigh]) / 2.0; | |
| 77 } | |
| 78 printf("%.3f", median); | |
| 79 } | |
| 80 else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) { | |
| 81 // compute the frequency of each unique value | |
| 82 map<string, int> freqs; | |
| 83 vector<string>::const_iterator dIt = scores.begin(); | |
| 84 vector<string>::const_iterator dEnd = scores.end(); | |
| 85 for (; dIt != dEnd; ++dIt) { | |
| 86 freqs[*dIt]++; | |
| 87 } | |
| 88 | |
| 89 // grab the mode and the anti mode | |
| 90 string mode, antiMode; | |
| 91 int count = 0; | |
| 92 int minCount = INT_MAX; | |
| 93 for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { | |
| 94 if (iter->second > count) { | |
| 95 mode = iter->first; | |
| 96 count = iter->second; | |
| 97 } | |
| 98 if (iter->second < minCount) { | |
| 99 antiMode = iter->first; | |
| 100 minCount = iter->second; | |
| 101 } | |
| 102 } | |
| 103 // report | |
| 104 if (_scoreOp == "mode") { | |
| 105 printf("%s", mode.c_str()); | |
| 106 } | |
| 107 else if (_scoreOp == "antimode") { | |
| 108 printf("%s", antiMode.c_str()); | |
| 109 } | |
| 110 } | |
| 111 else if (_scoreOp == "collapse") { | |
| 112 vector<string>::const_iterator scoreItr = scores.begin(); | |
| 113 vector<string>::const_iterator scoreEnd = scores.end(); | |
| 114 for (; scoreItr != scoreEnd; ++scoreItr) { | |
| 115 if (scoreItr < (scoreEnd - 1)) | |
| 116 cout << *scoreItr << ";"; | |
| 117 else | |
| 118 cout << *scoreItr; | |
| 119 } | |
| 120 } | |
| 121 } | |
| 122 else { | |
| 123 cerr << endl | |
| 124 << "*****" << endl | |
| 125 << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl | |
| 126 << "*****" << endl; | |
| 127 exit(1); | |
| 128 } | |
| 129 } | |
| 130 | |
| 131 // =============== | |
| 132 // = Constructor = | |
| 133 // =============== | |
| 134 BedMerge::BedMerge(string &bedFile, | |
| 135 bool numEntries, | |
| 136 int maxDistance, | |
| 137 bool forceStrand, | |
| 138 bool reportNames, | |
| 139 bool reportScores, | |
| 140 const string &scoreOp) : | |
| 141 _bedFile(bedFile), | |
| 142 _numEntries(numEntries), | |
| 143 _forceStrand(forceStrand), | |
| 144 _reportNames(reportNames), | |
| 145 _reportScores(reportScores), | |
| 146 _scoreOp(scoreOp), | |
| 147 _maxDistance(maxDistance) | |
| 148 { | |
| 149 _bed = new BedFile(bedFile); | |
| 150 | |
| 151 if (_forceStrand == false) | |
| 152 MergeBed(); | |
| 153 else | |
| 154 MergeBedStranded(); | |
| 155 } | |
| 156 | |
| 157 | |
| 158 // ================= | |
| 159 // = Destructor = | |
| 160 // ================= | |
| 161 BedMerge::~BedMerge(void) { | |
| 162 } | |
| 163 | |
| 164 | |
| 165 // =============================================== | |
| 166 // Convenience method for reporting merged blocks | |
| 167 // ================================================ | |
| 168 void BedMerge::Report(string chrom, int start, int end, | |
| 169 const vector<string> &names, const vector<string> &scores, int mergeCount) | |
| 170 { | |
| 171 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type | |
| 172 //if (_bed->isZeroBased == false) {start++;} | |
| 173 | |
| 174 printf("%s\t%d\t%d", chrom.c_str(), start, end); | |
| 175 // just the merged intervals | |
| 176 if (_numEntries == false && _reportNames == false && _reportScores == false) { | |
| 177 printf("\n"); | |
| 178 } | |
| 179 // merged intervals and counts | |
| 180 else if (_numEntries == true && _reportNames == false && _reportScores == false) { | |
| 181 printf("\t%d\n", mergeCount); | |
| 182 } | |
| 183 // merged intervals and names | |
| 184 else if (_numEntries == false && _reportNames == true && _reportScores == false) { | |
| 185 ReportMergedNames(names); | |
| 186 printf("\n"); | |
| 187 } | |
| 188 // merged intervals and scores | |
| 189 else if (_numEntries == false && _reportNames == false && _reportScores == true) { | |
| 190 ReportMergedScores(scores); | |
| 191 printf("\n"); | |
| 192 } | |
| 193 // merged intervals, names, and scores | |
| 194 else if (_numEntries == false && _reportNames == true && _reportScores == true) { | |
| 195 ReportMergedNames(names); | |
| 196 ReportMergedScores(scores); | |
| 197 printf("\n"); | |
| 198 } | |
| 199 } | |
| 200 | |
| 201 | |
| 202 // ========================================================= | |
| 203 // Convenience method for reporting merged blocks by strand | |
| 204 // ========================================================= | |
| 205 void BedMerge::ReportStranded(string chrom, int start, int end, | |
| 206 const vector<string> &names, const vector<string> &scores, | |
| 207 int mergeCount, string strand) | |
| 208 { | |
| 209 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type | |
| 210 //if (_bed->isZeroBased == false) {start++;} | |
| 211 | |
| 212 printf("%s\t%d\t%d", chrom.c_str(), start, end); | |
| 213 // just the merged intervals | |
| 214 if (_numEntries == false && _reportNames == false && _reportScores == false) { | |
| 215 printf("\t%s\n", strand.c_str()); | |
| 216 } | |
| 217 // merged intervals and counts | |
| 218 else if (_numEntries == true && _reportNames == false && _reportScores == false) { | |
| 219 printf("\t%d\t%s\n", mergeCount, strand.c_str()); | |
| 220 } | |
| 221 // merged intervals and names | |
| 222 else if (_numEntries == false && _reportNames == true && _reportScores == false) { | |
| 223 ReportMergedNames(names); | |
| 224 printf("\t%s\n", strand.c_str()); | |
| 225 } | |
| 226 // merged intervals and scores | |
| 227 else if (_numEntries == false && _reportNames == false && _reportScores == true) { | |
| 228 ReportMergedScores(scores); | |
| 229 printf("\t%s\n", strand.c_str()); | |
| 230 } | |
| 231 // merged intervals, names, and scores | |
| 232 else if (_numEntries == false && _reportNames == true && _reportScores == true) { | |
| 233 ReportMergedNames(names); | |
| 234 ReportMergedScores(scores); | |
| 235 printf("\t%s\n", strand.c_str()); | |
| 236 } | |
| 237 } | |
| 238 | |
| 239 | |
| 240 // ===================================================== | |
| 241 // = Merge overlapping BED entries into a single entry = | |
| 242 // ===================================================== | |
| 243 void BedMerge::MergeBed() { | |
| 244 | |
| 245 // load the "B" bed file into a map so | |
| 246 // that we can easily compare "A" to it for overlaps | |
| 247 _bed->loadBedFileIntoMapNoBin(); | |
| 248 | |
| 249 // loop through each chromosome and merge their BED entries | |
| 250 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); | |
| 251 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); | |
| 252 for (; m != mEnd; ++m) { | |
| 253 | |
| 254 // bedList is already sorted by start position. | |
| 255 string chrom = m->first; | |
| 256 vector<BED> bedList = m->second; | |
| 257 int mergeCount = 1; | |
| 258 vector<string> names; | |
| 259 vector<string> scores; | |
| 260 | |
| 261 // merge overlapping features for this chromosome. | |
| 262 int start = -1; | |
| 263 int end = -1; | |
| 264 vector<BED>::const_iterator bedItr = bedList.begin(); | |
| 265 vector<BED>::const_iterator bedEnd = bedList.end(); | |
| 266 for (; bedItr != bedEnd; ++bedItr) { | |
| 267 // new block, no overlap | |
| 268 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { | |
| 269 if (start >= 0) { | |
| 270 Report(chrom, start, end, names, scores, mergeCount); | |
| 271 // reset | |
| 272 mergeCount = 1; | |
| 273 names.clear(); | |
| 274 scores.clear(); | |
| 275 } | |
| 276 start = bedItr->start; | |
| 277 end = bedItr->end; | |
| 278 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
| 279 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
| 280 } | |
| 281 // same block, overlaps | |
| 282 else { | |
| 283 if ((int) bedItr-> end > end) end = bedItr->end; | |
| 284 mergeCount++; | |
| 285 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
| 286 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
| 287 } | |
| 288 } | |
| 289 if (start >= 0) { | |
| 290 Report(chrom, start, end, names, scores, mergeCount); | |
| 291 } | |
| 292 } | |
| 293 } | |
| 294 | |
| 295 | |
| 296 // ================================================================================== | |
| 297 // = Merge overlapping BED entries into a single entry, accounting for strandedness = | |
| 298 // ================================================================================== | |
| 299 void BedMerge::MergeBedStranded() { | |
| 300 | |
| 301 // load the "B" bed file into a map so | |
| 302 // that we can easily compare "A" to it for overlaps | |
| 303 _bed->loadBedFileIntoMapNoBin(); | |
| 304 | |
| 305 // loop through each chromosome and merge their BED entries | |
| 306 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); | |
| 307 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); | |
| 308 for (; m != mEnd; ++m) { | |
| 309 | |
| 310 // bedList is already sorted by start position. | |
| 311 string chrom = m->first; | |
| 312 vector<BED> bedList = m->second; | |
| 313 | |
| 314 // make a list of the two strands to merge separately. | |
| 315 vector<string> strands(2); | |
| 316 strands[0] = "+"; | |
| 317 strands[1] = "-"; | |
| 318 | |
| 319 // do two passes, one for each strand. | |
| 320 for (unsigned int s = 0; s < strands.size(); s++) { | |
| 321 | |
| 322 int mergeCount = 1; | |
| 323 int numOnStrand = 0; | |
| 324 vector<string> names; | |
| 325 vector<string> scores; | |
| 326 | |
| 327 // merge overlapping features for this chromosome. | |
| 328 int start = -1; | |
| 329 int end = -1; | |
| 330 vector<BED>::const_iterator bedItr = bedList.begin(); | |
| 331 vector<BED>::const_iterator bedEnd = bedList.end(); | |
| 332 for (; bedItr != bedEnd; ++bedItr) { | |
| 333 | |
| 334 // if forcing strandedness, move on if the hit | |
| 335 // is not on the current strand. | |
| 336 if (bedItr->strand != strands[s]) { continue; } | |
| 337 else { numOnStrand++; } | |
| 338 | |
| 339 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { | |
| 340 if (start >= 0) { | |
| 341 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); | |
| 342 // reset | |
| 343 mergeCount = 1; | |
| 344 names.clear(); | |
| 345 scores.clear(); | |
| 346 } | |
| 347 start = bedItr->start; | |
| 348 end = bedItr->end; | |
| 349 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
| 350 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
| 351 } | |
| 352 else { | |
| 353 if ((int) bedItr-> end > end) end = bedItr->end; | |
| 354 mergeCount++; | |
| 355 if (!bedItr->name.empty()) names.push_back(bedItr->name); | |
| 356 if (!bedItr->score.empty()) scores.push_back(bedItr->score); | |
| 357 } | |
| 358 } | |
| 359 if (start >= 0) { | |
| 360 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 } |
