0
|
1 /*****************************************************************************
|
|
2 unionBedGraphs.cpp
|
|
3
|
|
4 (c) 2010 - Assaf Gordon, CSHL
|
|
5 - Aaron Quinlan, UVA
|
|
6 Hall Laboratory
|
|
7 Department of Biochemistry and Molecular Genetics
|
|
8 University of Virginia
|
|
9 aaronquinlan@gmail.com
|
|
10
|
|
11 Licenced under the GNU General Public License 2.0 license.
|
|
12 ******************************************************************************/
|
|
13 #include <cassert>
|
|
14 #include <cstring>
|
|
15 #include <cstdlib>
|
|
16 #include <iostream>
|
|
17 #include <algorithm>
|
|
18
|
|
19 #include "bedGraphFile.h"
|
|
20 #include "unionBedGraphs.h"
|
|
21
|
|
22 using namespace std;
|
|
23
|
|
24
|
|
25 UnionBedGraphs::UnionBedGraphs(std::ostream& _output,
|
|
26 const vector<string>& _filenames,
|
|
27 const vector<string>& _titles,
|
|
28 bool _print_empty_regions,
|
|
29 const std::string& _genome_size_filename,
|
|
30 const std::string& _no_coverage_value ) :
|
|
31 filenames(_filenames),
|
|
32 titles(_titles),
|
|
33 output(_output),
|
|
34 current_non_zero_inputs(0),
|
|
35 print_empty_regions(_print_empty_regions),
|
|
36 genome_sizes(NULL),
|
|
37 no_coverage_value(_no_coverage_value)
|
|
38 {
|
|
39 if (print_empty_regions) {
|
|
40 assert(!_genome_size_filename.empty());
|
|
41
|
|
42 genome_sizes = new GenomeFile(_genome_size_filename);
|
|
43 }
|
|
44 }
|
|
45
|
|
46
|
|
47 UnionBedGraphs::~UnionBedGraphs() {
|
|
48 CloseBedgraphFiles();
|
|
49 if (genome_sizes) {
|
|
50 delete genome_sizes;
|
|
51 genome_sizes = NULL ;
|
|
52 }
|
|
53 }
|
|
54
|
|
55
|
|
56 void UnionBedGraphs::Union() {
|
|
57 OpenBedgraphFiles();
|
|
58
|
|
59 // Add the first interval from each file
|
|
60 for(size_t i=0;i<bedgraph_files.size();++i)
|
|
61 LoadNextBedgraphItem(i);
|
|
62
|
|
63 // Chromosome loop - once per chromosome
|
|
64 do {
|
|
65 // Find the first chromosome to use
|
|
66 current_chrom = DetermineNextChrom();
|
|
67
|
|
68 // Populate the queue with initial values from all files
|
|
69 // (if they belong to the correct chromosome)
|
|
70 for(size_t i=0;i<bedgraph_files.size();++i)
|
|
71 AddInterval(i);
|
|
72
|
|
73 CHRPOS current_start = ConsumeNextCoordinate();
|
|
74
|
|
75 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
|
|
76 if (print_empty_regions && current_start > 0)
|
|
77 PrintEmptyCoverage(0,current_start);
|
|
78
|
|
79 // Intervals loop - until all intervals (of current chromosome) from all files are used.
|
|
80 do {
|
|
81 CHRPOS current_end = queue.top().coord;
|
|
82 PrintCoverage(current_start, current_end);
|
|
83 current_start = ConsumeNextCoordinate();
|
|
84 } while (!queue.empty());
|
|
85
|
|
86 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
|
|
87 // print a dummy empty coverage
|
|
88 if (print_empty_regions) {
|
|
89 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
|
|
90 if (current_start < chrom_size)
|
|
91 PrintEmptyCoverage(current_start, chrom_size);
|
|
92 }
|
|
93
|
|
94 } while (!AllFilesDone());
|
|
95 }
|
|
96
|
|
97
|
|
98 CHRPOS UnionBedGraphs::ConsumeNextCoordinate() {
|
|
99 assert(!queue.empty());
|
|
100
|
|
101 CHRPOS new_position = queue.top().coord;
|
|
102 do {
|
|
103 IntervalItem item = queue.top();
|
|
104 UpdateInformation(item);
|
|
105 queue.pop();
|
|
106 } while (!queue.empty() && queue.top().coord == new_position);
|
|
107
|
|
108 return new_position;
|
|
109 }
|
|
110
|
|
111
|
|
112 void UnionBedGraphs::UpdateInformation(const IntervalItem &item) {
|
|
113 // Update the depth coverage for this file
|
|
114
|
|
115 // Which coordinate is it - start or end?
|
|
116 switch (item.coord_type)
|
|
117 {
|
|
118 case START:
|
|
119 current_depth[item.source_index] = item.depth;
|
|
120 current_non_zero_inputs++;
|
|
121 break;
|
|
122 case END:
|
|
123 //Read the next interval from this file
|
|
124 AddInterval(item.source_index);
|
|
125 current_depth[item.source_index] = no_coverage_value;
|
|
126 current_non_zero_inputs--;
|
|
127 break;
|
|
128 default:
|
|
129 assert(0);
|
|
130 }
|
|
131 }
|
|
132
|
|
133
|
|
134 void UnionBedGraphs::PrintHeader() {
|
|
135 output << "chrom\tstart\tend" ;
|
|
136 for (size_t i=0;i<titles.size();++i)
|
|
137 output << "\t" <<titles[i];
|
|
138 output << endl;
|
|
139 }
|
|
140
|
|
141
|
|
142 void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) {
|
|
143 if ( current_non_zero_inputs == 0 && ! print_empty_regions )
|
|
144 return ;
|
|
145
|
|
146 output << current_chrom << "\t"
|
|
147 << start << "\t"
|
|
148 << end;
|
|
149
|
|
150 for (size_t i=0;i<current_depth.size();++i)
|
|
151 output << "\t" << current_depth[i] ;
|
|
152
|
|
153 output << endl;
|
|
154 }
|
|
155
|
|
156
|
|
157 void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
|
|
158 output << current_chrom << "\t"
|
|
159 << start << "\t"
|
|
160 << end;
|
|
161
|
|
162 for (size_t i=0;i<current_depth.size();++i)
|
|
163 output << "\t" << no_coverage_value ;
|
|
164
|
|
165 output << endl;
|
|
166 }
|
|
167
|
|
168
|
|
169 void UnionBedGraphs::LoadNextBedgraphItem(int index) {
|
|
170 assert(static_cast<unsigned int>(index) < bedgraph_files.size());
|
|
171
|
|
172 current_bedgraph_item[index].chrom="";
|
|
173
|
|
174 BedGraphFile *file = bedgraph_files[index];
|
|
175 BEDGRAPH_STR bg;
|
|
176 int lineNum = 0;
|
|
177 BedGraphLineStatus status;
|
|
178
|
|
179 while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) {
|
|
180 if (status != BEDGRAPH_VALID)
|
|
181 continue;
|
|
182
|
|
183 current_bedgraph_item[index] = bg ;
|
|
184 break;
|
|
185 }
|
|
186 }
|
|
187
|
|
188
|
|
189 bool UnionBedGraphs::AllFilesDone() {
|
|
190 for (size_t i=0;i<current_bedgraph_item.size();++i)
|
|
191 if (!current_bedgraph_item[i].chrom.empty())
|
|
192 return false;
|
|
193 return true;
|
|
194 }
|
|
195
|
|
196
|
|
197 string UnionBedGraphs::DetermineNextChrom() {
|
|
198 string next_chrom;
|
|
199 for (size_t i=0;i<current_bedgraph_item.size();++i) {
|
|
200 if (current_bedgraph_item[i].chrom.empty())
|
|
201 continue;
|
|
202
|
|
203 if (next_chrom.empty())
|
|
204 next_chrom = current_bedgraph_item[i].chrom;
|
|
205 else
|
|
206 if (current_bedgraph_item[i].chrom < next_chrom)
|
|
207 next_chrom = current_bedgraph_item[i].chrom ;
|
|
208 }
|
|
209 return next_chrom;
|
|
210 }
|
|
211
|
|
212
|
|
213 void UnionBedGraphs::AddInterval(int index) {
|
|
214 assert(static_cast<unsigned int>(index) < bedgraph_files.size());
|
|
215
|
|
216 //This file has no more intervals
|
|
217 if (current_bedgraph_item[index].chrom.empty())
|
|
218 return ;
|
|
219
|
|
220 //If the next interval belongs to a different chrom, don't add it
|
|
221 if (current_bedgraph_item[index].chrom!=current_chrom)
|
|
222 return ;
|
|
223
|
|
224 const BEDGRAPH_STR &bg(current_bedgraph_item[index]);
|
|
225
|
|
226 IntervalItem start_item(index, START, bg.start, bg.depth);
|
|
227 IntervalItem end_item(index, END, bg.end, bg.depth);
|
|
228
|
|
229 queue.push(start_item);
|
|
230 queue.push(end_item);
|
|
231
|
|
232 LoadNextBedgraphItem(index);
|
|
233 }
|
|
234
|
|
235
|
|
236 void UnionBedGraphs::OpenBedgraphFiles() {
|
|
237 for (size_t i=0;i<filenames.size();++i) {
|
|
238 BedGraphFile *file = new BedGraphFile(filenames[i]);
|
|
239 file->Open();
|
|
240 bedgraph_files.push_back(file);
|
|
241
|
|
242 current_depth.push_back(no_coverage_value);
|
|
243 }
|
|
244 current_bedgraph_item.resize(filenames.size());
|
|
245 }
|
|
246
|
|
247
|
|
248 void UnionBedGraphs::CloseBedgraphFiles() {
|
|
249 for (size_t i=0;i<bedgraph_files.size();++i) {
|
|
250 BedGraphFile *file = bedgraph_files[i];
|
|
251 delete file;
|
|
252 bedgraph_files[i] = NULL ;
|
|
253 }
|
|
254 bedgraph_files.clear();
|
|
255 }
|