diff BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,364 @@
+/*****************************************************************************
+  mergeBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "mergeBed.h"
+
+
+
+void BedMerge::ReportMergedNames(const vector<string> &names) {
+    if (names.size() > 0) {
+        printf("\t");
+        vector<string>::const_iterator nameItr = names.begin();
+        vector<string>::const_iterator nameEnd = names.end();
+        for (; nameItr != nameEnd; ++nameItr) {
+            if (nameItr < (nameEnd - 1))
+                cout << *nameItr << ";";
+            else
+                cout << *nameItr;
+        }
+    }
+    else {
+        cerr << endl 
+             << "*****" << endl 
+             << "*****ERROR: No names found to report for the -names option. Exiting." << endl 
+             << "*****" << endl;
+        exit(1);
+    }
+}
+
+
+void BedMerge::ReportMergedScores(const vector<string> &scores) {
+    if (scores.size() > 0) {
+        printf("\t");
+
+        // convert the scores to floats
+        vector<float> data;
+        for (size_t i = 0 ; i < scores.size() ; i++) {
+            data.push_back(atof(scores[i].c_str()));
+        }    
+
+        if (_scoreOp == "sum") {
+            printf("%.3f", accumulate(data.begin(), data.end(), 0.0));
+        }
+        else if (_scoreOp == "min") {
+            printf("%.3f", *min_element( data.begin(), data.end() ));
+        }
+        else if (_scoreOp == "max") {
+            printf("%.3f", *max_element( data.begin(), data.end() ));
+        }
+        else if (_scoreOp == "mean") {
+            double total = accumulate(data.begin(), data.end(), 0.0);
+            double mean = total / data.size();
+            printf("%.3f", mean);
+        }
+        else if (_scoreOp == "median") {
+            double median = 0.0;
+            sort(data.begin(), data.end());
+            int totalLines = data.size();
+            if ((totalLines % 2) > 0) {
+                long mid;
+                mid = totalLines / 2;
+                median = data[mid];
+            }
+            else {
+                long midLow, midHigh;
+                midLow = (totalLines / 2) - 1;
+                midHigh = (totalLines / 2);
+                median = (data[midLow] + data[midHigh]) / 2.0;
+            }
+            printf("%.3f", median);
+        }
+        else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) {
+             // compute the frequency of each unique value
+             map<string, int> freqs;
+             vector<string>::const_iterator dIt  = scores.begin();
+             vector<string>::const_iterator dEnd = scores.end();
+             for (; dIt != dEnd; ++dIt) {
+                 freqs[*dIt]++;
+             }
+
+             // grab the mode and the anti mode
+             string mode, antiMode;
+             int    count = 0;
+             int minCount = INT_MAX;
+             for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
+                 if (iter->second > count) {
+                     mode = iter->first;
+                     count = iter->second;
+                 }
+                 if (iter->second < minCount) {
+                     antiMode = iter->first;
+                     minCount = iter->second;
+                 }
+             }
+             // report
+             if (_scoreOp == "mode") {
+                 printf("%s", mode.c_str());
+             }
+             else if (_scoreOp == "antimode") {
+                 printf("%s", antiMode.c_str());
+             }
+         }
+         else if (_scoreOp == "collapse") {    
+            vector<string>::const_iterator scoreItr = scores.begin();
+            vector<string>::const_iterator scoreEnd = scores.end();
+            for (; scoreItr != scoreEnd; ++scoreItr) {
+                if (scoreItr < (scoreEnd - 1))
+                    cout << *scoreItr << ";";
+                else
+                    cout << *scoreItr;
+            }
+        }
+    }
+    else {        
+        cerr << endl 
+             << "*****" << endl 
+             << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl 
+             << "*****" << endl;
+        exit(1);
+    }
+}
+
+// ===============
+// = Constructor =
+// ===============
+BedMerge::BedMerge(string &bedFile, 
+                   bool numEntries, 
+                   int  maxDistance, 
+                   bool forceStrand, 
+                   bool reportNames, 
+                   bool reportScores,
+                   const string &scoreOp) :
+    _bedFile(bedFile),
+    _numEntries(numEntries),
+    _forceStrand(forceStrand),
+    _reportNames(reportNames),
+    _reportScores(reportScores),
+    _scoreOp(scoreOp),
+    _maxDistance(maxDistance)
+{
+    _bed = new BedFile(bedFile);
+
+    if (_forceStrand == false)
+        MergeBed();
+    else
+        MergeBedStranded();
+}
+
+
+// =================
+// =   Destructor  =
+// =================
+BedMerge::~BedMerge(void) {
+}
+
+
+// ===============================================
+// Convenience method for reporting merged blocks
+// ================================================
+void BedMerge::Report(string chrom, int start, int end, 
+                      const vector<string> &names, const vector<string> &scores, int mergeCount) 
+{
+    // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
+    //if (_bed->isZeroBased == false) {start++;}
+    
+    printf("%s\t%d\t%d", chrom.c_str(), start, end);
+    // just the merged intervals
+    if (_numEntries == false && _reportNames == false && _reportScores == false) {
+        printf("\n");
+    }
+    // merged intervals and counts    
+    else if (_numEntries == true && _reportNames == false && _reportScores == false) {
+        printf("\t%d\n", mergeCount);
+    }
+    // merged intervals and names        
+    else if (_numEntries == false && _reportNames == true && _reportScores == false) {
+        ReportMergedNames(names);
+        printf("\n");
+    }
+    // merged intervals and scores        
+    else if (_numEntries == false && _reportNames == false && _reportScores == true) {
+        ReportMergedScores(scores);
+        printf("\n");
+    }
+    // merged intervals, names, and scores        
+    else if (_numEntries == false && _reportNames == true && _reportScores == true) {
+        ReportMergedNames(names);
+        ReportMergedScores(scores);
+        printf("\n");
+    }
+}
+
+
+// =========================================================
+// Convenience method for reporting merged blocks by strand
+// =========================================================
+void BedMerge::ReportStranded(string chrom, int start, int end, 
+                              const vector<string> &names, const vector<string> &scores,
+                              int mergeCount, string strand) 
+{
+    // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
+    //if (_bed->isZeroBased == false) {start++;}
+    
+    printf("%s\t%d\t%d", chrom.c_str(), start, end);
+    // just the merged intervals
+    if (_numEntries == false && _reportNames == false && _reportScores == false) {
+        printf("\t%s\n", strand.c_str());
+    }
+    // merged intervals and counts    
+    else if (_numEntries == true && _reportNames == false && _reportScores == false) {
+        printf("\t%d\t%s\n", mergeCount, strand.c_str());
+    }
+    // merged intervals and names        
+    else if (_numEntries == false && _reportNames == true && _reportScores == false) {
+        ReportMergedNames(names);
+        printf("\t%s\n", strand.c_str());
+    }
+    // merged intervals and scores        
+    else if (_numEntries == false && _reportNames == false && _reportScores == true) {
+        ReportMergedScores(scores);
+        printf("\t%s\n", strand.c_str());
+    }
+    // merged intervals, names, and scores        
+    else if (_numEntries == false && _reportNames == true && _reportScores == true) {
+        ReportMergedNames(names);
+        ReportMergedScores(scores);
+        printf("\t%s\n", strand.c_str());
+    }
+}
+
+
+// =====================================================
+// = Merge overlapping BED entries into a single entry =
+// =====================================================
+void BedMerge::MergeBed() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bed->loadBedFileIntoMapNoBin();
+
+    // loop through each chromosome and merge their BED entries
+    masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin();
+    masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
+    for (; m != mEnd; ++m) {
+
+        // bedList is already sorted by start position.
+        string chrom        = m->first;
+        vector<BED> bedList = m->second;
+        int mergeCount = 1;
+        vector<string> names;
+        vector<string> scores;
+        
+        // merge overlapping features for this chromosome.
+        int start = -1;
+        int end   = -1;
+        vector<BED>::const_iterator bedItr = bedList.begin();
+        vector<BED>::const_iterator bedEnd = bedList.end();
+        for (; bedItr != bedEnd; ++bedItr) {
+            // new block, no overlap
+            if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
+                if (start >= 0) {
+                    Report(chrom, start, end, names, scores, mergeCount);
+                    // reset
+                    mergeCount = 1;
+                    names.clear();
+                    scores.clear();
+                }
+                start = bedItr->start;
+                end   = bedItr->end;
+                if (!bedItr->name.empty())  names.push_back(bedItr->name);
+                if (!bedItr->score.empty()) scores.push_back(bedItr->score);
+            }
+            // same block, overlaps
+            else {
+                if ((int) bedItr-> end > end) end = bedItr->end;
+                mergeCount++;
+                if (!bedItr->name.empty())  names.push_back(bedItr->name);
+                if (!bedItr->score.empty()) scores.push_back(bedItr->score);
+            }
+        }
+        if (start >= 0) {
+            Report(chrom, start, end, names, scores, mergeCount);
+        }
+    }
+}
+
+
+// ==================================================================================
+// = Merge overlapping BED entries into a single entry, accounting for strandedness =
+// ==================================================================================
+void BedMerge::MergeBedStranded() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bed->loadBedFileIntoMapNoBin();
+
+    // loop through each chromosome and merge their BED entries
+    masterBedMapNoBin::const_iterator m    = _bed->bedMapNoBin.begin();
+    masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
+    for (; m != mEnd; ++m) {
+        
+        // bedList is already sorted by start position.
+        string chrom        = m->first;
+        vector<BED> bedList = m->second;
+
+        // make a list of the two strands to merge separately.
+        vector<string> strands(2);
+        strands[0] = "+";
+        strands[1] = "-";
+
+        // do two passes, one for each strand.
+        for (unsigned int s = 0; s < strands.size(); s++) {
+
+            int mergeCount = 1;
+            int numOnStrand = 0;
+            vector<string> names;
+            vector<string> scores;
+
+            // merge overlapping features for this chromosome.
+            int start = -1;
+            int end   = -1;
+            vector<BED>::const_iterator bedItr = bedList.begin();
+            vector<BED>::const_iterator bedEnd = bedList.end();
+            for (; bedItr != bedEnd; ++bedItr) {
+
+                // if forcing strandedness, move on if the hit
+                // is not on the current strand.
+                if (bedItr->strand != strands[s]) { continue; }
+                else { numOnStrand++; }
+                
+                if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
+                    if (start >= 0) {
+                        ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
+                        // reset
+                        mergeCount = 1;
+                        names.clear();
+                        scores.clear();
+                    }
+                    start = bedItr->start;
+                    end   = bedItr->end;
+                    if (!bedItr->name.empty())  names.push_back(bedItr->name);
+                    if (!bedItr->score.empty()) scores.push_back(bedItr->score);
+                }
+                else {
+                    if ((int) bedItr-> end > end) end = bedItr->end;
+                    mergeCount++;
+                    if (!bedItr->name.empty())  names.push_back(bedItr->name);
+                    if (!bedItr->score.empty()) scores.push_back(bedItr->score);
+                }
+            }
+            if (start >= 0) {
+                ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
+            }
+        }
+    }
+}