Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/multiIntersectBed/multiIntersectBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 unionBedGraphs.cpp | |
3 | |
4 (c) 2010 - Assaf Gordon, CSHL | |
5 - Aaron Quinlan, UVA | |
6 Hall Laboratory | |
7 Department of Biochemistry and Molecular Genetics | |
8 University of Virginia | |
9 aaronquinlan@gmail.com | |
10 | |
11 Licenced under the GNU General Public License 2.0 license. | |
12 ******************************************************************************/ | |
13 #include <cassert> | |
14 #include <cstring> | |
15 #include <cstdlib> | |
16 #include <iostream> | |
17 #include <algorithm> | |
18 | |
19 #include "bedFile.h" | |
20 #include "multiIntersectBed.h" | |
21 | |
22 using namespace std; | |
23 | |
24 | |
25 MultiIntersectBed::MultiIntersectBed(std::ostream& _output, | |
26 const vector<string>& _filenames, | |
27 const vector<string>& _titles, | |
28 bool _print_empty_regions, | |
29 const std::string& _genome_size_filename, | |
30 const std::string& _no_coverage_value ) : | |
31 filenames(_filenames), | |
32 titles(_titles), | |
33 output(_output), | |
34 current_non_zero_inputs(0), | |
35 print_empty_regions(_print_empty_regions), | |
36 haveTitles(false), | |
37 genome_sizes(NULL), | |
38 no_coverage_value(_no_coverage_value) | |
39 { | |
40 if (print_empty_regions) { | |
41 assert(!_genome_size_filename.empty()); | |
42 | |
43 genome_sizes = new GenomeFile(_genome_size_filename); | |
44 } | |
45 | |
46 if (titles.size() > 0) { | |
47 haveTitles = true; | |
48 } | |
49 } | |
50 | |
51 | |
52 MultiIntersectBed::~MultiIntersectBed() { | |
53 CloseFiles(); | |
54 if (genome_sizes) { | |
55 delete genome_sizes; | |
56 genome_sizes = NULL ; | |
57 } | |
58 } | |
59 | |
60 | |
61 void MultiIntersectBed::MultiIntersect() { | |
62 OpenFiles(); | |
63 | |
64 // Add the first interval from each file | |
65 for(size_t i = 0;i < input_files.size(); ++i) | |
66 LoadNextItem(i); | |
67 | |
68 // Chromosome loop - once per chromosome | |
69 do { | |
70 // Find the first chromosome to use | |
71 current_chrom = DetermineNextChrom(); | |
72 | |
73 // Populate the queue with initial values from all files | |
74 // (if they belong to the correct chromosome) | |
75 for(size_t i = 0; i < input_files.size(); ++i) | |
76 AddInterval(i); | |
77 | |
78 CHRPOS current_start = ConsumeNextCoordinate(); | |
79 | |
80 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage | |
81 if (print_empty_regions && current_start > 0) | |
82 PrintEmptyCoverage(0,current_start); | |
83 | |
84 // Intervals loop - until all intervals (of current chromosome) from all files are used. | |
85 do { | |
86 CHRPOS current_end = queue.top().coord; | |
87 PrintCoverage(current_start, current_end); | |
88 current_start = ConsumeNextCoordinate(); | |
89 } while (!queue.empty()); | |
90 | |
91 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome | |
92 // print a dummy empty coverage | |
93 if (print_empty_regions) { | |
94 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); | |
95 if (current_start < chrom_size) | |
96 PrintEmptyCoverage(current_start, chrom_size); | |
97 } | |
98 | |
99 } while (!AllFilesDone()); | |
100 } | |
101 | |
102 | |
103 CHRPOS MultiIntersectBed::ConsumeNextCoordinate() { | |
104 assert(!queue.empty()); | |
105 | |
106 CHRPOS new_position = queue.top().coord; | |
107 do { | |
108 IntervalItem item = queue.top(); | |
109 UpdateInformation(item); | |
110 queue.pop(); | |
111 } while (!queue.empty() && queue.top().coord == new_position); | |
112 | |
113 return new_position; | |
114 } | |
115 | |
116 | |
117 void MultiIntersectBed::UpdateInformation(const IntervalItem &item) { | |
118 // Update the depth coverage for this file | |
119 | |
120 // Which coordinate is it - start or end? | |
121 switch (item.coord_type) | |
122 { | |
123 case START: | |
124 current_depth[item.source_index] = 1; | |
125 current_non_zero_inputs++; | |
126 files_with_coverage[item.source_index] = true; | |
127 break; | |
128 case END: | |
129 //Read the next interval from this file | |
130 AddInterval(item.source_index); | |
131 current_depth[item.source_index] = 0; | |
132 current_non_zero_inputs--; | |
133 files_with_coverage.erase(item.source_index); | |
134 break; | |
135 default: | |
136 assert(0); | |
137 } | |
138 } | |
139 | |
140 | |
141 void MultiIntersectBed::AddInterval(int index) { | |
142 assert(static_cast<unsigned int>(index) < input_files.size()); | |
143 | |
144 //This file has no more intervals | |
145 if (current_item[index].chrom.empty()) | |
146 return; | |
147 | |
148 //If the next interval belongs to a different chrom, don't add it | |
149 if (current_item[index].chrom!=current_chrom) | |
150 return; | |
151 | |
152 const BED &bed(current_item[index]); | |
153 | |
154 IntervalItem start_item(index, START, bed.start); | |
155 IntervalItem end_item(index, END, bed.end); | |
156 | |
157 queue.push(start_item); | |
158 queue.push(end_item); | |
159 | |
160 LoadNextItem(index); | |
161 } | |
162 | |
163 | |
164 void MultiIntersectBed::PrintHeader() { | |
165 output << "chrom\tstart\tend\tnum\tlist" ; | |
166 for (size_t i=0;i<titles.size();++i) | |
167 output << "\t" <<titles[i]; | |
168 output << endl; | |
169 } | |
170 | |
171 | |
172 void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) { | |
173 if ( current_non_zero_inputs == 0 && ! print_empty_regions ) | |
174 return ; | |
175 | |
176 output << current_chrom << "\t" | |
177 << start << "\t" | |
178 << end << "\t" | |
179 << current_non_zero_inputs << "\t"; | |
180 | |
181 ostringstream file_list_string; | |
182 ostringstream file_bool_string; | |
183 int depth_count = 0; | |
184 for (size_t i = 0; i < current_depth.size(); ++i) | |
185 { | |
186 if (current_depth[i] > 0) { | |
187 if (depth_count < current_non_zero_inputs - 1) { | |
188 if (!haveTitles) | |
189 file_list_string << i+1 << ","; | |
190 else | |
191 file_list_string << titles[i] << ","; | |
192 } | |
193 else { | |
194 if (!haveTitles) | |
195 file_list_string << i+1; | |
196 else | |
197 file_list_string << titles[i]; | |
198 } | |
199 depth_count++; | |
200 } | |
201 file_bool_string << "\t" << current_depth[i]; | |
202 } | |
203 if (current_non_zero_inputs > 0) { | |
204 cout << file_list_string.str() << file_bool_string.str() << endl; | |
205 } | |
206 else { | |
207 cout << "none" << file_bool_string.str() << endl; | |
208 } | |
209 } | |
210 | |
211 | |
212 void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { | |
213 output << current_chrom << "\t" | |
214 << start << "\t" | |
215 << end << "\t" | |
216 << "0" << "\t" << "none"; | |
217 | |
218 for (size_t i=0;i<current_depth.size();++i) | |
219 output << "\t0"; | |
220 | |
221 output << endl; | |
222 } | |
223 | |
224 | |
225 void MultiIntersectBed::LoadNextItem(int index) { | |
226 assert(static_cast<unsigned int>(index) < input_files.size()); | |
227 | |
228 current_item[index].chrom=""; | |
229 | |
230 BedFile *file = input_files[index]; | |
231 BED merged_bed; | |
232 int lineNum = 0; | |
233 // | |
234 // TO DO: Do the mergeing on the fly. How best to do this? | |
235 // | |
236 // IDEA: Implement a Merge class with GetNextMerge element. | |
237 // | |
238 | |
239 while (file->GetNextMergedBed(merged_bed, lineNum)) | |
240 { | |
241 current_item[index] = merged_bed; | |
242 break; | |
243 } | |
244 } | |
245 | |
246 | |
247 bool MultiIntersectBed::AllFilesDone() { | |
248 for (size_t i=0;i<current_item.size();++i) | |
249 if (!current_item[i].chrom.empty()) | |
250 return false; | |
251 return true; | |
252 } | |
253 | |
254 | |
255 string MultiIntersectBed::DetermineNextChrom() { | |
256 string next_chrom; | |
257 for (size_t i=0;i<current_item.size();++i) { | |
258 if (current_item[i].chrom.empty()) | |
259 continue; | |
260 | |
261 if (next_chrom.empty()) | |
262 next_chrom = current_item[i].chrom; | |
263 else | |
264 if (current_item[i].chrom < next_chrom) | |
265 next_chrom = current_item[i].chrom ; | |
266 } | |
267 return next_chrom; | |
268 } | |
269 | |
270 | |
271 void MultiIntersectBed::OpenFiles() { | |
272 for (size_t i = 0; i < filenames.size(); ++i) { | |
273 BedFile *file = new BedFile(filenames[i]); | |
274 file->Open(); | |
275 input_files.push_back(file); | |
276 current_depth.push_back(0); | |
277 } | |
278 current_item.resize(filenames.size()); | |
279 } | |
280 | |
281 | |
282 void MultiIntersectBed::CloseFiles() { | |
283 for (size_t i=0; i < input_files.size(); ++i) { | |
284 BedFile *file = input_files[i]; | |
285 delete file; | |
286 input_files[i] = NULL ; | |
287 } | |
288 input_files.clear(); | |
289 } |