diff BEDTools-Version-2.14.3/src/unionBedGraphs/unionBedGraphs.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/unionBedGraphs/unionBedGraphs.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,255 @@
+/*****************************************************************************
+  unionBedGraphs.cpp
+
+  (c) 2010 - Assaf Gordon, CSHL
+           - Aaron Quinlan, UVA
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include <cassert>
+#include <cstring>
+#include <cstdlib>
+#include <iostream>
+#include <algorithm>
+
+#include "bedGraphFile.h"
+#include "unionBedGraphs.h"
+
+using namespace std;
+
+
+UnionBedGraphs::UnionBedGraphs(std::ostream& _output,
+                            const vector<string>& _filenames,
+                            const vector<string>& _titles,
+                            bool _print_empty_regions,
+                            const std::string& _genome_size_filename,
+                            const std::string& _no_coverage_value   ) :
+    filenames(_filenames),
+    titles(_titles),
+    output(_output),
+    current_non_zero_inputs(0),
+    print_empty_regions(_print_empty_regions),
+    genome_sizes(NULL),
+    no_coverage_value(_no_coverage_value)
+{
+    if (print_empty_regions) {
+        assert(!_genome_size_filename.empty());
+
+        genome_sizes = new GenomeFile(_genome_size_filename);
+    }
+}
+
+
+UnionBedGraphs::~UnionBedGraphs() {
+    CloseBedgraphFiles();
+    if (genome_sizes) {
+        delete genome_sizes;
+        genome_sizes = NULL ;
+    }
+}
+
+
+void UnionBedGraphs::Union() {
+    OpenBedgraphFiles();
+
+    // Add the first interval from each file
+    for(size_t i=0;i<bedgraph_files.size();++i)
+        LoadNextBedgraphItem(i);
+
+    // Chromosome loop - once per chromosome
+    do {
+        // Find the first chromosome to use
+        current_chrom = DetermineNextChrom();
+
+        // Populate the queue with initial values from all files
+        // (if they belong to the correct chromosome)
+        for(size_t i=0;i<bedgraph_files.size();++i)
+            AddInterval(i);
+
+        CHRPOS current_start = ConsumeNextCoordinate();
+
+        // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
+        if (print_empty_regions && current_start > 0)
+            PrintEmptyCoverage(0,current_start);
+
+        // Intervals loop - until all intervals (of current chromosome) from all files are used.
+        do {
+            CHRPOS current_end = queue.top().coord;
+            PrintCoverage(current_start, current_end);
+            current_start = ConsumeNextCoordinate();
+        } while (!queue.empty());
+
+        // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
+            // print a dummy empty coverage
+        if (print_empty_regions) {
+            CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
+            if (current_start < chrom_size)
+                PrintEmptyCoverage(current_start, chrom_size);
+        }
+
+    } while (!AllFilesDone());
+}
+
+
+CHRPOS UnionBedGraphs::ConsumeNextCoordinate() {
+    assert(!queue.empty());
+
+    CHRPOS new_position = queue.top().coord;
+    do {
+        IntervalItem item = queue.top();
+        UpdateInformation(item);
+        queue.pop();
+    } while (!queue.empty() && queue.top().coord == new_position);
+
+    return new_position;
+}
+
+
+void UnionBedGraphs::UpdateInformation(const IntervalItem &item) {
+    // Update the depth coverage for this file
+
+    // Which coordinate is it - start or end?
+    switch (item.coord_type)
+    {
+    case START:
+        current_depth[item.source_index] = item.depth;
+        current_non_zero_inputs++;
+        break;
+    case END:
+        //Read the next interval from this file
+        AddInterval(item.source_index);
+        current_depth[item.source_index] = no_coverage_value;
+        current_non_zero_inputs--;
+        break;
+    default:
+        assert(0);
+    }
+}
+
+
+void UnionBedGraphs::PrintHeader() {
+    output << "chrom\tstart\tend" ;
+    for (size_t i=0;i<titles.size();++i)
+        output << "\t" <<titles[i];
+    output << endl;
+}
+
+
+void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) {
+    if ( current_non_zero_inputs == 0 && ! print_empty_regions )
+        return ;
+
+    output << current_chrom << "\t"
+        << start << "\t"
+        << end;
+
+    for (size_t i=0;i<current_depth.size();++i)
+        output << "\t" << current_depth[i] ;
+
+    output << endl;
+}
+
+
+void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
+    output << current_chrom << "\t"
+        << start << "\t"
+        << end;
+
+    for (size_t i=0;i<current_depth.size();++i)
+        output << "\t" << no_coverage_value ;
+
+    output << endl;
+}
+
+
+void UnionBedGraphs::LoadNextBedgraphItem(int index) {
+    assert(static_cast<unsigned int>(index) < bedgraph_files.size());
+
+    current_bedgraph_item[index].chrom="";
+
+    BedGraphFile *file = bedgraph_files[index];
+    BEDGRAPH_STR bg;
+    int lineNum = 0;
+    BedGraphLineStatus status;
+
+    while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID )  {
+        if (status != BEDGRAPH_VALID)
+            continue;
+
+        current_bedgraph_item[index] = bg ;
+        break;
+    }
+}
+
+
+bool UnionBedGraphs::AllFilesDone() {
+    for (size_t i=0;i<current_bedgraph_item.size();++i)
+        if (!current_bedgraph_item[i].chrom.empty())
+            return false;
+    return true;
+}
+
+
+string UnionBedGraphs::DetermineNextChrom() {
+    string next_chrom;
+    for (size_t i=0;i<current_bedgraph_item.size();++i) {
+        if (current_bedgraph_item[i].chrom.empty())
+            continue;
+
+        if (next_chrom.empty())
+            next_chrom = current_bedgraph_item[i].chrom;
+        else
+            if (current_bedgraph_item[i].chrom < next_chrom)
+                next_chrom = current_bedgraph_item[i].chrom ;
+    }
+    return next_chrom;
+}
+
+
+void UnionBedGraphs::AddInterval(int index) {
+    assert(static_cast<unsigned int>(index) < bedgraph_files.size());
+
+    //This file has no more intervals
+    if (current_bedgraph_item[index].chrom.empty())
+        return ;
+
+    //If the next interval belongs to a different chrom, don't add it
+    if (current_bedgraph_item[index].chrom!=current_chrom)
+        return ;
+
+    const BEDGRAPH_STR &bg(current_bedgraph_item[index]);
+
+    IntervalItem start_item(index, START, bg.start, bg.depth);
+    IntervalItem end_item(index, END, bg.end, bg.depth);
+
+    queue.push(start_item);
+    queue.push(end_item);
+
+    LoadNextBedgraphItem(index);
+}
+
+
+void UnionBedGraphs::OpenBedgraphFiles() {
+    for (size_t i=0;i<filenames.size();++i) {
+        BedGraphFile *file = new BedGraphFile(filenames[i]);
+        file->Open();
+        bedgraph_files.push_back(file);
+
+        current_depth.push_back(no_coverage_value);
+    }
+    current_bedgraph_item.resize(filenames.size());
+}
+
+
+void UnionBedGraphs::CloseBedgraphFiles() {
+    for (size_t i=0;i<bedgraph_files.size();++i) {
+        BedGraphFile *file = bedgraph_files[i];
+        delete file;
+        bedgraph_files[i] = NULL ;
+    }
+    bedgraph_files.clear();
+}