Mercurial > repos > aaronquinlan > multi_intersect
view BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line source
/***************************************************************************** mergeBed.cpp (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include "lineFileUtilities.h" #include "mergeBed.h" void BedMerge::ReportMergedNames(const vector<string> &names) { if (names.size() > 0) { printf("\t"); vector<string>::const_iterator nameItr = names.begin(); vector<string>::const_iterator nameEnd = names.end(); for (; nameItr != nameEnd; ++nameItr) { if (nameItr < (nameEnd - 1)) cout << *nameItr << ";"; else cout << *nameItr; } } else { cerr << endl << "*****" << endl << "*****ERROR: No names found to report for the -names option. Exiting." << endl << "*****" << endl; exit(1); } } void BedMerge::ReportMergedScores(const vector<string> &scores) { if (scores.size() > 0) { printf("\t"); // convert the scores to floats vector<float> data; for (size_t i = 0 ; i < scores.size() ; i++) { data.push_back(atof(scores[i].c_str())); } if (_scoreOp == "sum") { printf("%.3f", accumulate(data.begin(), data.end(), 0.0)); } else if (_scoreOp == "min") { printf("%.3f", *min_element( data.begin(), data.end() )); } else if (_scoreOp == "max") { printf("%.3f", *max_element( data.begin(), data.end() )); } else if (_scoreOp == "mean") { double total = accumulate(data.begin(), data.end(), 0.0); double mean = total / data.size(); printf("%.3f", mean); } else if (_scoreOp == "median") { double median = 0.0; sort(data.begin(), data.end()); int totalLines = data.size(); if ((totalLines % 2) > 0) { long mid; mid = totalLines / 2; median = data[mid]; } else { long midLow, midHigh; midLow = (totalLines / 2) - 1; midHigh = (totalLines / 2); median = (data[midLow] + data[midHigh]) / 2.0; } printf("%.3f", median); } else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) { // compute the frequency of each unique value map<string, int> freqs; vector<string>::const_iterator dIt = scores.begin(); vector<string>::const_iterator dEnd = scores.end(); for (; dIt != dEnd; ++dIt) { freqs[*dIt]++; } // grab the mode and the anti mode string mode, antiMode; int count = 0; int minCount = INT_MAX; for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) { if (iter->second > count) { mode = iter->first; count = iter->second; } if (iter->second < minCount) { antiMode = iter->first; minCount = iter->second; } } // report if (_scoreOp == "mode") { printf("%s", mode.c_str()); } else if (_scoreOp == "antimode") { printf("%s", antiMode.c_str()); } } else if (_scoreOp == "collapse") { vector<string>::const_iterator scoreItr = scores.begin(); vector<string>::const_iterator scoreEnd = scores.end(); for (; scoreItr != scoreEnd; ++scoreItr) { if (scoreItr < (scoreEnd - 1)) cout << *scoreItr << ";"; else cout << *scoreItr; } } } else { cerr << endl << "*****" << endl << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl << "*****" << endl; exit(1); } } // =============== // = Constructor = // =============== BedMerge::BedMerge(string &bedFile, bool numEntries, int maxDistance, bool forceStrand, bool reportNames, bool reportScores, const string &scoreOp) : _bedFile(bedFile), _numEntries(numEntries), _forceStrand(forceStrand), _reportNames(reportNames), _reportScores(reportScores), _scoreOp(scoreOp), _maxDistance(maxDistance) { _bed = new BedFile(bedFile); if (_forceStrand == false) MergeBed(); else MergeBedStranded(); } // ================= // = Destructor = // ================= BedMerge::~BedMerge(void) { } // =============================================== // Convenience method for reporting merged blocks // ================================================ void BedMerge::Report(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount) { // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type //if (_bed->isZeroBased == false) {start++;} printf("%s\t%d\t%d", chrom.c_str(), start, end); // just the merged intervals if (_numEntries == false && _reportNames == false && _reportScores == false) { printf("\n"); } // merged intervals and counts else if (_numEntries == true && _reportNames == false && _reportScores == false) { printf("\t%d\n", mergeCount); } // merged intervals and names else if (_numEntries == false && _reportNames == true && _reportScores == false) { ReportMergedNames(names); printf("\n"); } // merged intervals and scores else if (_numEntries == false && _reportNames == false && _reportScores == true) { ReportMergedScores(scores); printf("\n"); } // merged intervals, names, and scores else if (_numEntries == false && _reportNames == true && _reportScores == true) { ReportMergedNames(names); ReportMergedScores(scores); printf("\n"); } } // ========================================================= // Convenience method for reporting merged blocks by strand // ========================================================= void BedMerge::ReportStranded(string chrom, int start, int end, const vector<string> &names, const vector<string> &scores, int mergeCount, string strand) { // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type //if (_bed->isZeroBased == false) {start++;} printf("%s\t%d\t%d", chrom.c_str(), start, end); // just the merged intervals if (_numEntries == false && _reportNames == false && _reportScores == false) { printf("\t%s\n", strand.c_str()); } // merged intervals and counts else if (_numEntries == true && _reportNames == false && _reportScores == false) { printf("\t%d\t%s\n", mergeCount, strand.c_str()); } // merged intervals and names else if (_numEntries == false && _reportNames == true && _reportScores == false) { ReportMergedNames(names); printf("\t%s\n", strand.c_str()); } // merged intervals and scores else if (_numEntries == false && _reportNames == false && _reportScores == true) { ReportMergedScores(scores); printf("\t%s\n", strand.c_str()); } // merged intervals, names, and scores else if (_numEntries == false && _reportNames == true && _reportScores == true) { ReportMergedNames(names); ReportMergedScores(scores); printf("\t%s\n", strand.c_str()); } } // ===================================================== // = Merge overlapping BED entries into a single entry = // ===================================================== void BedMerge::MergeBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { // bedList is already sorted by start position. string chrom = m->first; vector<BED> bedList = m->second; int mergeCount = 1; vector<string> names; vector<string> scores; // merge overlapping features for this chromosome. int start = -1; int end = -1; vector<BED>::const_iterator bedItr = bedList.begin(); vector<BED>::const_iterator bedEnd = bedList.end(); for (; bedItr != bedEnd; ++bedItr) { // new block, no overlap if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { if (start >= 0) { Report(chrom, start, end, names, scores, mergeCount); // reset mergeCount = 1; names.clear(); scores.clear(); } start = bedItr->start; end = bedItr->end; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } // same block, overlaps else { if ((int) bedItr-> end > end) end = bedItr->end; mergeCount++; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } } if (start >= 0) { Report(chrom, start, end, names, scores, mergeCount); } } } // ================================================================================== // = Merge overlapping BED entries into a single entry, accounting for strandedness = // ================================================================================== void BedMerge::MergeBedStranded() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bed->loadBedFileIntoMapNoBin(); // loop through each chromosome and merge their BED entries masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin(); masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end(); for (; m != mEnd; ++m) { // bedList is already sorted by start position. string chrom = m->first; vector<BED> bedList = m->second; // make a list of the two strands to merge separately. vector<string> strands(2); strands[0] = "+"; strands[1] = "-"; // do two passes, one for each strand. for (unsigned int s = 0; s < strands.size(); s++) { int mergeCount = 1; int numOnStrand = 0; vector<string> names; vector<string> scores; // merge overlapping features for this chromosome. int start = -1; int end = -1; vector<BED>::const_iterator bedItr = bedList.begin(); vector<BED>::const_iterator bedEnd = bedList.end(); for (; bedItr != bedEnd; ++bedItr) { // if forcing strandedness, move on if the hit // is not on the current strand. if (bedItr->strand != strands[s]) { continue; } else { numOnStrand++; } if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) { if (start >= 0) { ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); // reset mergeCount = 1; names.clear(); scores.clear(); } start = bedItr->start; end = bedItr->end; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } else { if ((int) bedItr-> end > end) end = bedItr->end; mergeCount++; if (!bedItr->name.empty()) names.push_back(bedItr->name); if (!bedItr->score.empty()) scores.push_back(bedItr->score); } } if (start >= 0) { ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]); } } } }