Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/unionBedGraphs/unionBedGraphs.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 unionBedGraphs.cpp | |
3 | |
4 (c) 2010 - Assaf Gordon, CSHL | |
5 - Aaron Quinlan, UVA | |
6 Hall Laboratory | |
7 Department of Biochemistry and Molecular Genetics | |
8 University of Virginia | |
9 aaronquinlan@gmail.com | |
10 | |
11 Licenced under the GNU General Public License 2.0 license. | |
12 ******************************************************************************/ | |
13 #include <cassert> | |
14 #include <cstring> | |
15 #include <cstdlib> | |
16 #include <iostream> | |
17 #include <algorithm> | |
18 | |
19 #include "bedGraphFile.h" | |
20 #include "unionBedGraphs.h" | |
21 | |
22 using namespace std; | |
23 | |
24 | |
25 UnionBedGraphs::UnionBedGraphs(std::ostream& _output, | |
26 const vector<string>& _filenames, | |
27 const vector<string>& _titles, | |
28 bool _print_empty_regions, | |
29 const std::string& _genome_size_filename, | |
30 const std::string& _no_coverage_value ) : | |
31 filenames(_filenames), | |
32 titles(_titles), | |
33 output(_output), | |
34 current_non_zero_inputs(0), | |
35 print_empty_regions(_print_empty_regions), | |
36 genome_sizes(NULL), | |
37 no_coverage_value(_no_coverage_value) | |
38 { | |
39 if (print_empty_regions) { | |
40 assert(!_genome_size_filename.empty()); | |
41 | |
42 genome_sizes = new GenomeFile(_genome_size_filename); | |
43 } | |
44 } | |
45 | |
46 | |
47 UnionBedGraphs::~UnionBedGraphs() { | |
48 CloseBedgraphFiles(); | |
49 if (genome_sizes) { | |
50 delete genome_sizes; | |
51 genome_sizes = NULL ; | |
52 } | |
53 } | |
54 | |
55 | |
56 void UnionBedGraphs::Union() { | |
57 OpenBedgraphFiles(); | |
58 | |
59 // Add the first interval from each file | |
60 for(size_t i=0;i<bedgraph_files.size();++i) | |
61 LoadNextBedgraphItem(i); | |
62 | |
63 // Chromosome loop - once per chromosome | |
64 do { | |
65 // Find the first chromosome to use | |
66 current_chrom = DetermineNextChrom(); | |
67 | |
68 // Populate the queue with initial values from all files | |
69 // (if they belong to the correct chromosome) | |
70 for(size_t i=0;i<bedgraph_files.size();++i) | |
71 AddInterval(i); | |
72 | |
73 CHRPOS current_start = ConsumeNextCoordinate(); | |
74 | |
75 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage | |
76 if (print_empty_regions && current_start > 0) | |
77 PrintEmptyCoverage(0,current_start); | |
78 | |
79 // Intervals loop - until all intervals (of current chromosome) from all files are used. | |
80 do { | |
81 CHRPOS current_end = queue.top().coord; | |
82 PrintCoverage(current_start, current_end); | |
83 current_start = ConsumeNextCoordinate(); | |
84 } while (!queue.empty()); | |
85 | |
86 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome | |
87 // print a dummy empty coverage | |
88 if (print_empty_regions) { | |
89 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); | |
90 if (current_start < chrom_size) | |
91 PrintEmptyCoverage(current_start, chrom_size); | |
92 } | |
93 | |
94 } while (!AllFilesDone()); | |
95 } | |
96 | |
97 | |
98 CHRPOS UnionBedGraphs::ConsumeNextCoordinate() { | |
99 assert(!queue.empty()); | |
100 | |
101 CHRPOS new_position = queue.top().coord; | |
102 do { | |
103 IntervalItem item = queue.top(); | |
104 UpdateInformation(item); | |
105 queue.pop(); | |
106 } while (!queue.empty() && queue.top().coord == new_position); | |
107 | |
108 return new_position; | |
109 } | |
110 | |
111 | |
112 void UnionBedGraphs::UpdateInformation(const IntervalItem &item) { | |
113 // Update the depth coverage for this file | |
114 | |
115 // Which coordinate is it - start or end? | |
116 switch (item.coord_type) | |
117 { | |
118 case START: | |
119 current_depth[item.source_index] = item.depth; | |
120 current_non_zero_inputs++; | |
121 break; | |
122 case END: | |
123 //Read the next interval from this file | |
124 AddInterval(item.source_index); | |
125 current_depth[item.source_index] = no_coverage_value; | |
126 current_non_zero_inputs--; | |
127 break; | |
128 default: | |
129 assert(0); | |
130 } | |
131 } | |
132 | |
133 | |
134 void UnionBedGraphs::PrintHeader() { | |
135 output << "chrom\tstart\tend" ; | |
136 for (size_t i=0;i<titles.size();++i) | |
137 output << "\t" <<titles[i]; | |
138 output << endl; | |
139 } | |
140 | |
141 | |
142 void UnionBedGraphs::PrintCoverage(CHRPOS start, CHRPOS end) { | |
143 if ( current_non_zero_inputs == 0 && ! print_empty_regions ) | |
144 return ; | |
145 | |
146 output << current_chrom << "\t" | |
147 << start << "\t" | |
148 << end; | |
149 | |
150 for (size_t i=0;i<current_depth.size();++i) | |
151 output << "\t" << current_depth[i] ; | |
152 | |
153 output << endl; | |
154 } | |
155 | |
156 | |
157 void UnionBedGraphs::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { | |
158 output << current_chrom << "\t" | |
159 << start << "\t" | |
160 << end; | |
161 | |
162 for (size_t i=0;i<current_depth.size();++i) | |
163 output << "\t" << no_coverage_value ; | |
164 | |
165 output << endl; | |
166 } | |
167 | |
168 | |
169 void UnionBedGraphs::LoadNextBedgraphItem(int index) { | |
170 assert(static_cast<unsigned int>(index) < bedgraph_files.size()); | |
171 | |
172 current_bedgraph_item[index].chrom=""; | |
173 | |
174 BedGraphFile *file = bedgraph_files[index]; | |
175 BEDGRAPH_STR bg; | |
176 int lineNum = 0; | |
177 BedGraphLineStatus status; | |
178 | |
179 while ( (status = file->GetNextBedGraph(bg, lineNum)) != BEDGRAPH_INVALID ) { | |
180 if (status != BEDGRAPH_VALID) | |
181 continue; | |
182 | |
183 current_bedgraph_item[index] = bg ; | |
184 break; | |
185 } | |
186 } | |
187 | |
188 | |
189 bool UnionBedGraphs::AllFilesDone() { | |
190 for (size_t i=0;i<current_bedgraph_item.size();++i) | |
191 if (!current_bedgraph_item[i].chrom.empty()) | |
192 return false; | |
193 return true; | |
194 } | |
195 | |
196 | |
197 string UnionBedGraphs::DetermineNextChrom() { | |
198 string next_chrom; | |
199 for (size_t i=0;i<current_bedgraph_item.size();++i) { | |
200 if (current_bedgraph_item[i].chrom.empty()) | |
201 continue; | |
202 | |
203 if (next_chrom.empty()) | |
204 next_chrom = current_bedgraph_item[i].chrom; | |
205 else | |
206 if (current_bedgraph_item[i].chrom < next_chrom) | |
207 next_chrom = current_bedgraph_item[i].chrom ; | |
208 } | |
209 return next_chrom; | |
210 } | |
211 | |
212 | |
213 void UnionBedGraphs::AddInterval(int index) { | |
214 assert(static_cast<unsigned int>(index) < bedgraph_files.size()); | |
215 | |
216 //This file has no more intervals | |
217 if (current_bedgraph_item[index].chrom.empty()) | |
218 return ; | |
219 | |
220 //If the next interval belongs to a different chrom, don't add it | |
221 if (current_bedgraph_item[index].chrom!=current_chrom) | |
222 return ; | |
223 | |
224 const BEDGRAPH_STR &bg(current_bedgraph_item[index]); | |
225 | |
226 IntervalItem start_item(index, START, bg.start, bg.depth); | |
227 IntervalItem end_item(index, END, bg.end, bg.depth); | |
228 | |
229 queue.push(start_item); | |
230 queue.push(end_item); | |
231 | |
232 LoadNextBedgraphItem(index); | |
233 } | |
234 | |
235 | |
236 void UnionBedGraphs::OpenBedgraphFiles() { | |
237 for (size_t i=0;i<filenames.size();++i) { | |
238 BedGraphFile *file = new BedGraphFile(filenames[i]); | |
239 file->Open(); | |
240 bedgraph_files.push_back(file); | |
241 | |
242 current_depth.push_back(no_coverage_value); | |
243 } | |
244 current_bedgraph_item.resize(filenames.size()); | |
245 } | |
246 | |
247 | |
248 void UnionBedGraphs::CloseBedgraphFiles() { | |
249 for (size_t i=0;i<bedgraph_files.size();++i) { | |
250 BedGraphFile *file = bedgraph_files[i]; | |
251 delete file; | |
252 bedgraph_files[i] = NULL ; | |
253 } | |
254 bedgraph_files.clear(); | |
255 } |