Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/unionBedGraphs/unionBedGraphs.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/unionBedGraphs/unionBedGraphs.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,255 @@ +/***************************************************************************** + unionBedGraphs.cpp + + (c) 2010 - Assaf Gordon, CSHL + - Aaron Quinlan, UVA + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include <cassert> +#include <cstring> +#include <cstdlib> +#include <iostream> +#include <algorithm> + +#include "bedGraphFile.h" +#include "unionBedGraphs.h" + +using namespace std; + + +UnionBedGraphs::UnionBedGraphs(std::ostream& _output, + const vector<string>& _filenames, + const vector<string>& _titles, + bool _print_empty_regions, + const std::string& _genome_size_filename, + const std::string& _no_coverage_value ) : + filenames(_filenames), + titles(_titles), + output(_output), + current_non_zero_inputs(0), + print_empty_regions(_print_empty_regions), + genome_sizes(NULL), + no_coverage_value(_no_coverage_value) +{ + if (print_empty_regions) { + assert(!_genome_size_filename.empty()); + + genome_sizes = new GenomeFile(_genome_size_filename); + } +} + + +UnionBedGraphs::~UnionBedGraphs() { + CloseBedgraphFiles(); + if (genome_sizes) { + delete genome_sizes; + genome_sizes = NULL ; + } +} + + +void UnionBedGraphs::Union() { + OpenBedgraphFiles(); + + // Add the first interval from each file + for(size_t i=0;i<bedgraph_files.size();++i) + LoadNextBedgraphItem(i); + + // Chromosome loop - once per chromosome + do { + // Find the first chromosome to use + current_chrom = DetermineNextChrom(); + + // Populate the queue with initial values from all files + // (if they belong to the correct chromosome) + for(size_t i=0;i<bedgraph_files.size();++i) + AddInterval(i); + + CHRPOS current_start = ConsumeNextCoordinate(); + + // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage + if (print_empty_regions && current_start > 0) + PrintEmptyCoverage(0,current_start); + + // Intervals loop - until all intervals (of current chromosome) from all files are used. + do { + CHRPOS current_end = queue.top().coord; + PrintCoverage(current_start, current_end); + current_start = ConsumeNextCoordinate(); + } while (!queue.empty()); + + // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome + // print a dummy empty coverage + if (print_empty_regions) { + CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); + if (current_start < chrom_size) + PrintEmptyCoverage(current_start, chrom_size); + } + + } while (!AllFilesDone()); +} + + +CHRPOS UnionBedGraphs::ConsumeNextCoordinate() { + assert(!queue.empty()); + + CHRPOS new_position = queue.top().coord; + do { + IntervalItem item = queue.top(); + UpdateInformation(item); + queue.pop(); + } while (!queue.empty() && queue.top().coord == new_position); + + return new_position; +} + + +void UnionBedGraphs::UpdateInformation(const IntervalItem &item) { + // Update the depth coverage for this file + + // Which coordinate is it - start or end? + switch (item.coord_type) + { + case START: + current_depth[item.source_index] = item.depth; + current_non_zero_inputs++; + break; + case END: + //Read the next interval from this file + AddInterval(item.source_index); + current_depth[item.source_index] = no_coverage_value; + current_non_zero_inputs--; + break; + default: + assert(0); + } +} + + +void UnionBedGraphs::PrintHeader() { + output << "chrom\tstart\tend" ; + for (size_t i=0;i<titles.size();++i) + output << "\t" <<titles[i]; + output << endl; +} + + +void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) { + if ( current_non_zero_inputs == 0 && ! print_empty_regions ) + return ; + + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << current_depth[i] ; + + output << endl; +} + + +void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { + output << current_chrom << "\t" + << start << "\t" + << end; + + for (size_t i=0;i<current_depth.size();++i) + output << "\t" << no_coverage_value ; + + output << endl; +} + + +void UnionBedGraphs::LoadNextBedgraphItem(int index) { + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + + current_bedgraph_item[index].chrom=""; + + BedGraphFile *file = bedgraph_files[index]; + BEDGRAPH_STR bg; + int lineNum = 0; + BedGraphLineStatus status; + + while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) { + if (status != BEDGRAPH_VALID) + continue; + + current_bedgraph_item[index] = bg ; + break; + } +} + + +bool UnionBedGraphs::AllFilesDone() { + for (size_t i=0;i<current_bedgraph_item.size();++i) + if (!current_bedgraph_item[i].chrom.empty()) + return false; + return true; +} + + +string UnionBedGraphs::DetermineNextChrom() { + string next_chrom; + for (size_t i=0;i<current_bedgraph_item.size();++i) { + if (current_bedgraph_item[i].chrom.empty()) + continue; + + if (next_chrom.empty()) + next_chrom = current_bedgraph_item[i].chrom; + else + if (current_bedgraph_item[i].chrom < next_chrom) + next_chrom = current_bedgraph_item[i].chrom ; + } + return next_chrom; +} + + +void UnionBedGraphs::AddInterval(int index) { + assert(static_cast<unsigned int>(index) < bedgraph_files.size()); + + //This file has no more intervals + if (current_bedgraph_item[index].chrom.empty()) + return ; + + //If the next interval belongs to a different chrom, don't add it + if (current_bedgraph_item[index].chrom!=current_chrom) + return ; + + const BEDGRAPH_STR &bg(current_bedgraph_item[index]); + + IntervalItem start_item(index, START, bg.start, bg.depth); + IntervalItem end_item(index, END, bg.end, bg.depth); + + queue.push(start_item); + queue.push(end_item); + + LoadNextBedgraphItem(index); +} + + +void UnionBedGraphs::OpenBedgraphFiles() { + for (size_t i=0;i<filenames.size();++i) { + BedGraphFile *file = new BedGraphFile(filenames[i]); + file->Open(); + bedgraph_files.push_back(file); + + current_depth.push_back(no_coverage_value); + } + current_bedgraph_item.resize(filenames.size()); +} + + +void UnionBedGraphs::CloseBedgraphFiles() { + for (size_t i=0;i<bedgraph_files.size();++i) { + BedGraphFile *file = bedgraph_files[i]; + delete file; + bedgraph_files[i] = NULL ; + } + bedgraph_files.clear(); +}