comparison BEDTools-Version-2.14.3/src/multiIntersectBed/multiIntersectBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 unionBedGraphs.cpp
3
4 (c) 2010 - Assaf Gordon, CSHL
5 - Aaron Quinlan, UVA
6 Hall Laboratory
7 Department of Biochemistry and Molecular Genetics
8 University of Virginia
9 aaronquinlan@gmail.com
10
11 Licenced under the GNU General Public License 2.0 license.
12 ******************************************************************************/
13 #include <cassert>
14 #include <cstring>
15 #include <cstdlib>
16 #include <iostream>
17 #include <algorithm>
18
19 #include "bedFile.h"
20 #include "multiIntersectBed.h"
21
22 using namespace std;
23
24
25 MultiIntersectBed::MultiIntersectBed(std::ostream& _output,
26 const vector<string>& _filenames,
27 const vector<string>& _titles,
28 bool _print_empty_regions,
29 const std::string& _genome_size_filename,
30 const std::string& _no_coverage_value ) :
31 filenames(_filenames),
32 titles(_titles),
33 output(_output),
34 current_non_zero_inputs(0),
35 print_empty_regions(_print_empty_regions),
36 haveTitles(false),
37 genome_sizes(NULL),
38 no_coverage_value(_no_coverage_value)
39 {
40 if (print_empty_regions) {
41 assert(!_genome_size_filename.empty());
42
43 genome_sizes = new GenomeFile(_genome_size_filename);
44 }
45
46 if (titles.size() > 0) {
47 haveTitles = true;
48 }
49 }
50
51
52 MultiIntersectBed::~MultiIntersectBed() {
53 CloseFiles();
54 if (genome_sizes) {
55 delete genome_sizes;
56 genome_sizes = NULL ;
57 }
58 }
59
60
61 void MultiIntersectBed::MultiIntersect() {
62 OpenFiles();
63
64 // Add the first interval from each file
65 for(size_t i = 0;i < input_files.size(); ++i)
66 LoadNextItem(i);
67
68 // Chromosome loop - once per chromosome
69 do {
70 // Find the first chromosome to use
71 current_chrom = DetermineNextChrom();
72
73 // Populate the queue with initial values from all files
74 // (if they belong to the correct chromosome)
75 for(size_t i = 0; i < input_files.size(); ++i)
76 AddInterval(i);
77
78 CHRPOS current_start = ConsumeNextCoordinate();
79
80 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage
81 if (print_empty_regions && current_start > 0)
82 PrintEmptyCoverage(0,current_start);
83
84 // Intervals loop - until all intervals (of current chromosome) from all files are used.
85 do {
86 CHRPOS current_end = queue.top().coord;
87 PrintCoverage(current_start, current_end);
88 current_start = ConsumeNextCoordinate();
89 } while (!queue.empty());
90
91 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome
92 // print a dummy empty coverage
93 if (print_empty_regions) {
94 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom);
95 if (current_start < chrom_size)
96 PrintEmptyCoverage(current_start, chrom_size);
97 }
98
99 } while (!AllFilesDone());
100 }
101
102
103 CHRPOS MultiIntersectBed::ConsumeNextCoordinate() {
104 assert(!queue.empty());
105
106 CHRPOS new_position = queue.top().coord;
107 do {
108 IntervalItem item = queue.top();
109 UpdateInformation(item);
110 queue.pop();
111 } while (!queue.empty() && queue.top().coord == new_position);
112
113 return new_position;
114 }
115
116
117 void MultiIntersectBed::UpdateInformation(const IntervalItem &item) {
118 // Update the depth coverage for this file
119
120 // Which coordinate is it - start or end?
121 switch (item.coord_type)
122 {
123 case START:
124 current_depth[item.source_index] = 1;
125 current_non_zero_inputs++;
126 files_with_coverage[item.source_index] = true;
127 break;
128 case END:
129 //Read the next interval from this file
130 AddInterval(item.source_index);
131 current_depth[item.source_index] = 0;
132 current_non_zero_inputs--;
133 files_with_coverage.erase(item.source_index);
134 break;
135 default:
136 assert(0);
137 }
138 }
139
140
141 void MultiIntersectBed::AddInterval(int index) {
142 assert(static_cast<unsigned int>(index) < input_files.size());
143
144 //This file has no more intervals
145 if (current_item[index].chrom.empty())
146 return;
147
148 //If the next interval belongs to a different chrom, don't add it
149 if (current_item[index].chrom!=current_chrom)
150 return;
151
152 const BED &bed(current_item[index]);
153
154 IntervalItem start_item(index, START, bed.start);
155 IntervalItem end_item(index, END, bed.end);
156
157 queue.push(start_item);
158 queue.push(end_item);
159
160 LoadNextItem(index);
161 }
162
163
164 void MultiIntersectBed::PrintHeader() {
165 output << "chrom\tstart\tend\tnum\tlist" ;
166 for (size_t i=0;i<titles.size();++i)
167 output << "\t" <<titles[i];
168 output << endl;
169 }
170
171
172 void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) {
173 if ( current_non_zero_inputs == 0 && ! print_empty_regions )
174 return ;
175
176 output << current_chrom << "\t"
177 << start << "\t"
178 << end << "\t"
179 << current_non_zero_inputs << "\t";
180
181 ostringstream file_list_string;
182 ostringstream file_bool_string;
183 int depth_count = 0;
184 for (size_t i = 0; i < current_depth.size(); ++i)
185 {
186 if (current_depth[i] > 0) {
187 if (depth_count < current_non_zero_inputs - 1) {
188 if (!haveTitles)
189 file_list_string << i+1 << ",";
190 else
191 file_list_string << titles[i] << ",";
192 }
193 else {
194 if (!haveTitles)
195 file_list_string << i+1;
196 else
197 file_list_string << titles[i];
198 }
199 depth_count++;
200 }
201 file_bool_string << "\t" << current_depth[i];
202 }
203 if (current_non_zero_inputs > 0) {
204 cout << file_list_string.str() << file_bool_string.str() << endl;
205 }
206 else {
207 cout << "none" << file_bool_string.str() << endl;
208 }
209 }
210
211
212 void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) {
213 output << current_chrom << "\t"
214 << start << "\t"
215 << end << "\t"
216 << "0" << "\t" << "none";
217
218 for (size_t i=0;i<current_depth.size();++i)
219 output << "\t0";
220
221 output << endl;
222 }
223
224
225 void MultiIntersectBed::LoadNextItem(int index) {
226 assert(static_cast<unsigned int>(index) < input_files.size());
227
228 current_item[index].chrom="";
229
230 BedFile *file = input_files[index];
231 BED merged_bed;
232 int lineNum = 0;
233 //
234 // TO DO: Do the mergeing on the fly. How best to do this?
235 //
236 // IDEA: Implement a Merge class with GetNextMerge element.
237 //
238
239 while (file->GetNextMergedBed(merged_bed, lineNum))
240 {
241 current_item[index] = merged_bed;
242 break;
243 }
244 }
245
246
247 bool MultiIntersectBed::AllFilesDone() {
248 for (size_t i=0;i<current_item.size();++i)
249 if (!current_item[i].chrom.empty())
250 return false;
251 return true;
252 }
253
254
255 string MultiIntersectBed::DetermineNextChrom() {
256 string next_chrom;
257 for (size_t i=0;i<current_item.size();++i) {
258 if (current_item[i].chrom.empty())
259 continue;
260
261 if (next_chrom.empty())
262 next_chrom = current_item[i].chrom;
263 else
264 if (current_item[i].chrom < next_chrom)
265 next_chrom = current_item[i].chrom ;
266 }
267 return next_chrom;
268 }
269
270
271 void MultiIntersectBed::OpenFiles() {
272 for (size_t i = 0; i < filenames.size(); ++i) {
273 BedFile *file = new BedFile(filenames[i]);
274 file->Open();
275 input_files.push_back(file);
276 current_depth.push_back(0);
277 }
278 current_item.resize(filenames.size());
279 }
280
281
282 void MultiIntersectBed::CloseFiles() {
283 for (size_t i=0; i < input_files.size(); ++i) {
284 BedFile *file = input_files[i];
285 delete file;
286 input_files[i] = NULL ;
287 }
288 input_files.clear();
289 }