0
|
1 /*****************************************************************************
|
|
2 unionBedGraphs.cpp
|
|
3
|
|
4 (c) 2010 - Assaf Gordon, CSHL
|
|
5 - Aaron Quinlan, UVA
|
|
6 Hall Laboratory
|
|
7 Department of Biochemistry and Molecular Genetics
|
|
8 University of Virginia
|
|
9 aaronquinlan@gmail.com
|
|
10
|
|
11 Licenced under the GNU General Public License 2.0 license.
|
|
12 ******************************************************************************/
|
|
13 #include <cassert>
|
|
14 #include <cstring>
|
|
15 #include <cstdlib>
|
|
16 #include <iostream>
|
|
17 #include <algorithm>
|
|
18
|
|
19 #include "bedFile.h"
|
|
20 #include "multiIntersectBed.h"
|
|
21
|
|
22 using namespace std;
|
|
23
|
|
24
|
|
25 MultiIntersectBed::MultiIntersectBed(std::ostream& _output,
|
|
26 const vector<string>& _filenames,
|
|
27 const vector<string>& _titles,
|
|
28 bool _print_empty_regions,
|
|
29 const std::string& _genome_size_filename,
|
|
30 const std::string& _no_coverage_value ) :
|
|
31 filenames(_filenames),
|
|
32 titles(_titles),
|
|
33 output(_output),
|
|
34 current_non_zero_inputs(0),
|
|
35 print_empty_regions(_print_empty_regions),
|
|
36 haveTitles(false),
|
|
37 genome_sizes(NULL),
|
|
38 no_coverage_value(_no_coverage_value)
|
|
39 {
|
|
40 if (print_empty_regions) {
|
|
41 assert(!_genome_size_filename.empty());
|
|
42
|
|
43 genome_sizes = new GenomeFile(_genome_size_filename);
|
|
44 }
|
|
45
|
|
46 if (titles.size() > 0) {
|
|
47 haveTitles = true;
|
|
48 }
|
|
49 }
|
|
50
|
|
51
|
|
52 MultiIntersectBed::~MultiIntersectBed() {
|
|
53 CloseFiles();
|
|
54 if (genome_sizes) {
|
|
55 delete genome_sizes;
|
|
56 genome_sizes = NULL ;
|
|
57 }
|
|
58 }
|
|
59
|
|
60
|
|
61 void MultiIntersectBed::MultiIntersect() {
|
|
62 OpenFiles();
|
|
63
|
|
64 // Add the first interval from each file
|
|
65 for(size_t i = 0;i < input_files.size(); ++i)
|
|
66 LoadNextItem(i);
|
|
67
|
|
68 // Chromosome loop - once per chromosome
|
|
69 do {
|
|
70 // Find the first chromosome to use
|
|
71 current_chrom = DetermineNextChrom();
|
|
72
|
|
73 // Populate the queue with initial values from all files
|
|
74 // (if they belong to the correct chromosome)
|
|
75 for(size_t i = 0; i < input_files.size(); ++i)
|
|
76 AddInterval(i);
|
|
77
|
|
78 CHRPOS current_start = ConsumeNextCoordinate();
|
|
79
|
|
80 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
|
|
81 if (print_empty_regions && current_start > 0)
|
|
82 PrintEmptyCoverage(0,current_start);
|
|
83
|
|
84 // Intervals loop - until all intervals (of current chromosome) from all files are used.
|
|
85 do {
|
|
86 CHRPOS current_end = queue.top().coord;
|
|
87 PrintCoverage(current_start, current_end);
|
|
88 current_start = ConsumeNextCoordinate();
|
|
89 } while (!queue.empty());
|
|
90
|
|
91 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
|
|
92 // print a dummy empty coverage
|
|
93 if (print_empty_regions) {
|
|
94 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
|
|
95 if (current_start < chrom_size)
|
|
96 PrintEmptyCoverage(current_start, chrom_size);
|
|
97 }
|
|
98
|
|
99 } while (!AllFilesDone());
|
|
100 }
|
|
101
|
|
102
|
|
103 CHRPOS MultiIntersectBed::ConsumeNextCoordinate() {
|
|
104 assert(!queue.empty());
|
|
105
|
|
106 CHRPOS new_position = queue.top().coord;
|
|
107 do {
|
|
108 IntervalItem item = queue.top();
|
|
109 UpdateInformation(item);
|
|
110 queue.pop();
|
|
111 } while (!queue.empty() && queue.top().coord == new_position);
|
|
112
|
|
113 return new_position;
|
|
114 }
|
|
115
|
|
116
|
|
117 void MultiIntersectBed::UpdateInformation(const IntervalItem &item) {
|
|
118 // Update the depth coverage for this file
|
|
119
|
|
120 // Which coordinate is it - start or end?
|
|
121 switch (item.coord_type)
|
|
122 {
|
|
123 case START:
|
|
124 current_depth[item.source_index] = 1;
|
|
125 current_non_zero_inputs++;
|
|
126 files_with_coverage[item.source_index] = true;
|
|
127 break;
|
|
128 case END:
|
|
129 //Read the next interval from this file
|
|
130 AddInterval(item.source_index);
|
|
131 current_depth[item.source_index] = 0;
|
|
132 current_non_zero_inputs--;
|
|
133 files_with_coverage.erase(item.source_index);
|
|
134 break;
|
|
135 default:
|
|
136 assert(0);
|
|
137 }
|
|
138 }
|
|
139
|
|
140
|
|
141 void MultiIntersectBed::AddInterval(int index) {
|
|
142 assert(static_cast<unsigned int>(index) < input_files.size());
|
|
143
|
|
144 //This file has no more intervals
|
|
145 if (current_item[index].chrom.empty())
|
|
146 return;
|
|
147
|
|
148 //If the next interval belongs to a different chrom, don't add it
|
|
149 if (current_item[index].chrom!=current_chrom)
|
|
150 return;
|
|
151
|
|
152 const BED &bed(current_item[index]);
|
|
153
|
|
154 IntervalItem start_item(index, START, bed.start);
|
|
155 IntervalItem end_item(index, END, bed.end);
|
|
156
|
|
157 queue.push(start_item);
|
|
158 queue.push(end_item);
|
|
159
|
|
160 LoadNextItem(index);
|
|
161 }
|
|
162
|
|
163
|
|
164 void MultiIntersectBed::PrintHeader() {
|
|
165 output << "chrom\tstart\tend\tnum\tlist" ;
|
|
166 for (size_t i=0;i<titles.size();++i)
|
|
167 output << "\t" <<titles[i];
|
|
168 output << endl;
|
|
169 }
|
|
170
|
|
171
|
|
172 void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) {
|
|
173 if ( current_non_zero_inputs == 0 && ! print_empty_regions )
|
|
174 return ;
|
|
175
|
|
176 output << current_chrom << "\t"
|
|
177 << start << "\t"
|
|
178 << end << "\t"
|
|
179 << current_non_zero_inputs << "\t";
|
|
180
|
|
181 ostringstream file_list_string;
|
|
182 ostringstream file_bool_string;
|
|
183 int depth_count = 0;
|
|
184 for (size_t i = 0; i < current_depth.size(); ++i)
|
|
185 {
|
|
186 if (current_depth[i] > 0) {
|
|
187 if (depth_count < current_non_zero_inputs - 1) {
|
|
188 if (!haveTitles)
|
|
189 file_list_string << i+1 << ",";
|
|
190 else
|
|
191 file_list_string << titles[i] << ",";
|
|
192 }
|
|
193 else {
|
|
194 if (!haveTitles)
|
|
195 file_list_string << i+1;
|
|
196 else
|
|
197 file_list_string << titles[i];
|
|
198 }
|
|
199 depth_count++;
|
|
200 }
|
|
201 file_bool_string << "\t" << current_depth[i];
|
|
202 }
|
|
203 if (current_non_zero_inputs > 0) {
|
|
204 cout << file_list_string.str() << file_bool_string.str() << endl;
|
|
205 }
|
|
206 else {
|
|
207 cout << "none" << file_bool_string.str() << endl;
|
|
208 }
|
|
209 }
|
|
210
|
|
211
|
|
212 void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
|
|
213 output << current_chrom << "\t"
|
|
214 << start << "\t"
|
|
215 << end << "\t"
|
|
216 << "0" << "\t" << "none";
|
|
217
|
|
218 for (size_t i=0;i<current_depth.size();++i)
|
|
219 output << "\t0";
|
|
220
|
|
221 output << endl;
|
|
222 }
|
|
223
|
|
224
|
|
225 void MultiIntersectBed::LoadNextItem(int index) {
|
|
226 assert(static_cast<unsigned int>(index) < input_files.size());
|
|
227
|
|
228 current_item[index].chrom="";
|
|
229
|
|
230 BedFile *file = input_files[index];
|
|
231 BED merged_bed;
|
|
232 int lineNum = 0;
|
|
233 //
|
|
234 // TO DO: Do the mergeing on the fly. How best to do this?
|
|
235 //
|
|
236 // IDEA: Implement a Merge class with GetNextMerge element.
|
|
237 //
|
|
238
|
|
239 while (file->GetNextMergedBed(merged_bed, lineNum))
|
|
240 {
|
|
241 current_item[index] = merged_bed;
|
|
242 break;
|
|
243 }
|
|
244 }
|
|
245
|
|
246
|
|
247 bool MultiIntersectBed::AllFilesDone() {
|
|
248 for (size_t i=0;i<current_item.size();++i)
|
|
249 if (!current_item[i].chrom.empty())
|
|
250 return false;
|
|
251 return true;
|
|
252 }
|
|
253
|
|
254
|
|
255 string MultiIntersectBed::DetermineNextChrom() {
|
|
256 string next_chrom;
|
|
257 for (size_t i=0;i<current_item.size();++i) {
|
|
258 if (current_item[i].chrom.empty())
|
|
259 continue;
|
|
260
|
|
261 if (next_chrom.empty())
|
|
262 next_chrom = current_item[i].chrom;
|
|
263 else
|
|
264 if (current_item[i].chrom < next_chrom)
|
|
265 next_chrom = current_item[i].chrom ;
|
|
266 }
|
|
267 return next_chrom;
|
|
268 }
|
|
269
|
|
270
|
|
271 void MultiIntersectBed::OpenFiles() {
|
|
272 for (size_t i = 0; i < filenames.size(); ++i) {
|
|
273 BedFile *file = new BedFile(filenames[i]);
|
|
274 file->Open();
|
|
275 input_files.push_back(file);
|
|
276 current_depth.push_back(0);
|
|
277 }
|
|
278 current_item.resize(filenames.size());
|
|
279 }
|
|
280
|
|
281
|
|
282 void MultiIntersectBed::CloseFiles() {
|
|
283 for (size_t i=0; i < input_files.size(); ++i) {
|
|
284 BedFile *file = input_files[i];
|
|
285 delete file;
|
|
286 input_files[i] = NULL ;
|
|
287 }
|
|
288 input_files.clear();
|
|
289 }
|