Mercurial > repos > aaronquinlan > multi_intersect
view BEDTools-Version-2.14.3/src/multiIntersectBed/multiIntersectBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line source
/***************************************************************************** unionBedGraphs.cpp (c) 2010 - Assaf Gordon, CSHL - Aaron Quinlan, UVA Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include <cassert> #include <cstring> #include <cstdlib> #include <iostream> #include <algorithm> #include "bedFile.h" #include "multiIntersectBed.h" using namespace std; MultiIntersectBed::MultiIntersectBed(std::ostream& _output, const vector<string>& _filenames, const vector<string>& _titles, bool _print_empty_regions, const std::string& _genome_size_filename, const std::string& _no_coverage_value ) : filenames(_filenames), titles(_titles), output(_output), current_non_zero_inputs(0), print_empty_regions(_print_empty_regions), haveTitles(false), genome_sizes(NULL), no_coverage_value(_no_coverage_value) { if (print_empty_regions) { assert(!_genome_size_filename.empty()); genome_sizes = new GenomeFile(_genome_size_filename); } if (titles.size() > 0) { haveTitles = true; } } MultiIntersectBed::~MultiIntersectBed() { CloseFiles(); if (genome_sizes) { delete genome_sizes; genome_sizes = NULL ; } } void MultiIntersectBed::MultiIntersect() { OpenFiles(); // Add the first interval from each file for(size_t i = 0;i < input_files.size(); ++i) LoadNextItem(i); // Chromosome loop - once per chromosome do { // Find the first chromosome to use current_chrom = DetermineNextChrom(); // Populate the queue with initial values from all files // (if they belong to the correct chromosome) for(size_t i = 0; i < input_files.size(); ++i) AddInterval(i); CHRPOS current_start = ConsumeNextCoordinate(); // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage if (print_empty_regions && current_start > 0) PrintEmptyCoverage(0,current_start); // Intervals loop - until all intervals (of current chromosome) from all files are used. do { CHRPOS current_end = queue.top().coord; PrintCoverage(current_start, current_end); current_start = ConsumeNextCoordinate(); } while (!queue.empty()); // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome // print a dummy empty coverage if (print_empty_regions) { CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); if (current_start < chrom_size) PrintEmptyCoverage(current_start, chrom_size); } } while (!AllFilesDone()); } CHRPOS MultiIntersectBed::ConsumeNextCoordinate() { assert(!queue.empty()); CHRPOS new_position = queue.top().coord; do { IntervalItem item = queue.top(); UpdateInformation(item); queue.pop(); } while (!queue.empty() && queue.top().coord == new_position); return new_position; } void MultiIntersectBed::UpdateInformation(const IntervalItem &item) { // Update the depth coverage for this file // Which coordinate is it - start or end? switch (item.coord_type) { case START: current_depth[item.source_index] = 1; current_non_zero_inputs++; files_with_coverage[item.source_index] = true; break; case END: //Read the next interval from this file AddInterval(item.source_index); current_depth[item.source_index] = 0; current_non_zero_inputs--; files_with_coverage.erase(item.source_index); break; default: assert(0); } } void MultiIntersectBed::AddInterval(int index) { assert(static_cast<unsigned int>(index) < input_files.size()); //This file has no more intervals if (current_item[index].chrom.empty()) return; //If the next interval belongs to a different chrom, don't add it if (current_item[index].chrom!=current_chrom) return; const BED &bed(current_item[index]); IntervalItem start_item(index, START, bed.start); IntervalItem end_item(index, END, bed.end); queue.push(start_item); queue.push(end_item); LoadNextItem(index); } void MultiIntersectBed::PrintHeader() { output << "chrom\tstart\tend\tnum\tlist" ; for (size_t i=0;i<titles.size();++i) output << "\t" <<titles[i]; output << endl; } void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) { if ( current_non_zero_inputs == 0 && ! print_empty_regions ) return ; output << current_chrom << "\t" << start << "\t" << end << "\t" << current_non_zero_inputs << "\t"; ostringstream file_list_string; ostringstream file_bool_string; int depth_count = 0; for (size_t i = 0; i < current_depth.size(); ++i) { if (current_depth[i] > 0) { if (depth_count < current_non_zero_inputs - 1) { if (!haveTitles) file_list_string << i+1 << ","; else file_list_string << titles[i] << ","; } else { if (!haveTitles) file_list_string << i+1; else file_list_string << titles[i]; } depth_count++; } file_bool_string << "\t" << current_depth[i]; } if (current_non_zero_inputs > 0) { cout << file_list_string.str() << file_bool_string.str() << endl; } else { cout << "none" << file_bool_string.str() << endl; } } void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { output << current_chrom << "\t" << start << "\t" << end << "\t" << "0" << "\t" << "none"; for (size_t i=0;i<current_depth.size();++i) output << "\t0"; output << endl; } void MultiIntersectBed::LoadNextItem(int index) { assert(static_cast<unsigned int>(index) < input_files.size()); current_item[index].chrom=""; BedFile *file = input_files[index]; BED merged_bed; int lineNum = 0; // // TO DO: Do the mergeing on the fly. How best to do this? // // IDEA: Implement a Merge class with GetNextMerge element. // while (file->GetNextMergedBed(merged_bed, lineNum)) { current_item[index] = merged_bed; break; } } bool MultiIntersectBed::AllFilesDone() { for (size_t i=0;i<current_item.size();++i) if (!current_item[i].chrom.empty()) return false; return true; } string MultiIntersectBed::DetermineNextChrom() { string next_chrom; for (size_t i=0;i<current_item.size();++i) { if (current_item[i].chrom.empty()) continue; if (next_chrom.empty()) next_chrom = current_item[i].chrom; else if (current_item[i].chrom < next_chrom) next_chrom = current_item[i].chrom ; } return next_chrom; } void MultiIntersectBed::OpenFiles() { for (size_t i = 0; i < filenames.size(); ++i) { BedFile *file = new BedFile(filenames[i]); file->Open(); input_files.push_back(file); current_depth.push_back(0); } current_item.resize(filenames.size()); } void MultiIntersectBed::CloseFiles() { for (size_t i=0; i < input_files.size(); ++i) { BedFile *file = input_files[i]; delete file; input_files[i] = NULL ; } input_files.clear(); }