Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/multiIntersectBed/multiIntersectBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
| author | aaronquinlan |
|---|---|
| date | Sat, 19 Nov 2011 14:17:03 -0500 |
| parents | dfcd8b6c1bda |
| children |
comparison
equal
deleted
inserted
replaced
| 0:dfcd8b6c1bda | 1:bec36315bd12 |
|---|---|
| 1 /***************************************************************************** | |
| 2 unionBedGraphs.cpp | |
| 3 | |
| 4 (c) 2010 - Assaf Gordon, CSHL | |
| 5 - Aaron Quinlan, UVA | |
| 6 Hall Laboratory | |
| 7 Department of Biochemistry and Molecular Genetics | |
| 8 University of Virginia | |
| 9 aaronquinlan@gmail.com | |
| 10 | |
| 11 Licenced under the GNU General Public License 2.0 license. | |
| 12 ******************************************************************************/ | |
| 13 #include <cassert> | |
| 14 #include <cstring> | |
| 15 #include <cstdlib> | |
| 16 #include <iostream> | |
| 17 #include <algorithm> | |
| 18 | |
| 19 #include "bedFile.h" | |
| 20 #include "multiIntersectBed.h" | |
| 21 | |
| 22 using namespace std; | |
| 23 | |
| 24 | |
| 25 MultiIntersectBed::MultiIntersectBed(std::ostream& _output, | |
| 26 const vector<string>& _filenames, | |
| 27 const vector<string>& _titles, | |
| 28 bool _print_empty_regions, | |
| 29 const std::string& _genome_size_filename, | |
| 30 const std::string& _no_coverage_value ) : | |
| 31 filenames(_filenames), | |
| 32 titles(_titles), | |
| 33 output(_output), | |
| 34 current_non_zero_inputs(0), | |
| 35 print_empty_regions(_print_empty_regions), | |
| 36 haveTitles(false), | |
| 37 genome_sizes(NULL), | |
| 38 no_coverage_value(_no_coverage_value) | |
| 39 { | |
| 40 if (print_empty_regions) { | |
| 41 assert(!_genome_size_filename.empty()); | |
| 42 | |
| 43 genome_sizes = new GenomeFile(_genome_size_filename); | |
| 44 } | |
| 45 | |
| 46 if (titles.size() > 0) { | |
| 47 haveTitles = true; | |
| 48 } | |
| 49 } | |
| 50 | |
| 51 | |
| 52 MultiIntersectBed::~MultiIntersectBed() { | |
| 53 CloseFiles(); | |
| 54 if (genome_sizes) { | |
| 55 delete genome_sizes; | |
| 56 genome_sizes = NULL ; | |
| 57 } | |
| 58 } | |
| 59 | |
| 60 | |
| 61 void MultiIntersectBed::MultiIntersect() { | |
| 62 OpenFiles(); | |
| 63 | |
| 64 // Add the first interval from each file | |
| 65 for(size_t i = 0;i < input_files.size(); ++i) | |
| 66 LoadNextItem(i); | |
| 67 | |
| 68 // Chromosome loop - once per chromosome | |
| 69 do { | |
| 70 // Find the first chromosome to use | |
| 71 current_chrom = DetermineNextChrom(); | |
| 72 | |
| 73 // Populate the queue with initial values from all files | |
| 74 // (if they belong to the correct chromosome) | |
| 75 for(size_t i = 0; i < input_files.size(); ++i) | |
| 76 AddInterval(i); | |
| 77 | |
| 78 CHRPOS current_start = ConsumeNextCoordinate(); | |
| 79 | |
| 80 // User wanted empty regions, and the first coordinate is not 0 - print a dummy empty coverage | |
| 81 if (print_empty_regions && current_start > 0) | |
| 82 PrintEmptyCoverage(0,current_start); | |
| 83 | |
| 84 // Intervals loop - until all intervals (of current chromosome) from all files are used. | |
| 85 do { | |
| 86 CHRPOS current_end = queue.top().coord; | |
| 87 PrintCoverage(current_start, current_end); | |
| 88 current_start = ConsumeNextCoordinate(); | |
| 89 } while (!queue.empty()); | |
| 90 | |
| 91 // User wanted empty regions, and the last coordinate is not the last coordinate of the chromosome | |
| 92 // print a dummy empty coverage | |
| 93 if (print_empty_regions) { | |
| 94 CHRPOS chrom_size = genome_sizes->getChromSize(current_chrom); | |
| 95 if (current_start < chrom_size) | |
| 96 PrintEmptyCoverage(current_start, chrom_size); | |
| 97 } | |
| 98 | |
| 99 } while (!AllFilesDone()); | |
| 100 } | |
| 101 | |
| 102 | |
| 103 CHRPOS MultiIntersectBed::ConsumeNextCoordinate() { | |
| 104 assert(!queue.empty()); | |
| 105 | |
| 106 CHRPOS new_position = queue.top().coord; | |
| 107 do { | |
| 108 IntervalItem item = queue.top(); | |
| 109 UpdateInformation(item); | |
| 110 queue.pop(); | |
| 111 } while (!queue.empty() && queue.top().coord == new_position); | |
| 112 | |
| 113 return new_position; | |
| 114 } | |
| 115 | |
| 116 | |
| 117 void MultiIntersectBed::UpdateInformation(const IntervalItem &item) { | |
| 118 // Update the depth coverage for this file | |
| 119 | |
| 120 // Which coordinate is it - start or end? | |
| 121 switch (item.coord_type) | |
| 122 { | |
| 123 case START: | |
| 124 current_depth[item.source_index] = 1; | |
| 125 current_non_zero_inputs++; | |
| 126 files_with_coverage[item.source_index] = true; | |
| 127 break; | |
| 128 case END: | |
| 129 //Read the next interval from this file | |
| 130 AddInterval(item.source_index); | |
| 131 current_depth[item.source_index] = 0; | |
| 132 current_non_zero_inputs--; | |
| 133 files_with_coverage.erase(item.source_index); | |
| 134 break; | |
| 135 default: | |
| 136 assert(0); | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 | |
| 141 void MultiIntersectBed::AddInterval(int index) { | |
| 142 assert(static_cast<unsigned int>(index) < input_files.size()); | |
| 143 | |
| 144 //This file has no more intervals | |
| 145 if (current_item[index].chrom.empty()) | |
| 146 return; | |
| 147 | |
| 148 //If the next interval belongs to a different chrom, don't add it | |
| 149 if (current_item[index].chrom!=current_chrom) | |
| 150 return; | |
| 151 | |
| 152 const BED &bed(current_item[index]); | |
| 153 | |
| 154 IntervalItem start_item(index, START, bed.start); | |
| 155 IntervalItem end_item(index, END, bed.end); | |
| 156 | |
| 157 queue.push(start_item); | |
| 158 queue.push(end_item); | |
| 159 | |
| 160 LoadNextItem(index); | |
| 161 } | |
| 162 | |
| 163 | |
| 164 void MultiIntersectBed::PrintHeader() { | |
| 165 output << "chrom\tstart\tend\tnum\tlist" ; | |
| 166 for (size_t i=0;i<titles.size();++i) | |
| 167 output << "\t" <<titles[i]; | |
| 168 output << endl; | |
| 169 } | |
| 170 | |
| 171 | |
| 172 void MultiIntersectBed::PrintCoverage(CHRPOS start, CHRPOS end) { | |
| 173 if ( current_non_zero_inputs == 0 && ! print_empty_regions ) | |
| 174 return ; | |
| 175 | |
| 176 output << current_chrom << "\t" | |
| 177 << start << "\t" | |
| 178 << end << "\t" | |
| 179 << current_non_zero_inputs << "\t"; | |
| 180 | |
| 181 ostringstream file_list_string; | |
| 182 ostringstream file_bool_string; | |
| 183 int depth_count = 0; | |
| 184 for (size_t i = 0; i < current_depth.size(); ++i) | |
| 185 { | |
| 186 if (current_depth[i] > 0) { | |
| 187 if (depth_count < current_non_zero_inputs - 1) { | |
| 188 if (!haveTitles) | |
| 189 file_list_string << i+1 << ","; | |
| 190 else | |
| 191 file_list_string << titles[i] << ","; | |
| 192 } | |
| 193 else { | |
| 194 if (!haveTitles) | |
| 195 file_list_string << i+1; | |
| 196 else | |
| 197 file_list_string << titles[i]; | |
| 198 } | |
| 199 depth_count++; | |
| 200 } | |
| 201 file_bool_string << "\t" << current_depth[i]; | |
| 202 } | |
| 203 if (current_non_zero_inputs > 0) { | |
| 204 cout << file_list_string.str() << file_bool_string.str() << endl; | |
| 205 } | |
| 206 else { | |
| 207 cout << "none" << file_bool_string.str() << endl; | |
| 208 } | |
| 209 } | |
| 210 | |
| 211 | |
| 212 void MultiIntersectBed::PrintEmptyCoverage(CHRPOS start, CHRPOS end) { | |
| 213 output << current_chrom << "\t" | |
| 214 << start << "\t" | |
| 215 << end << "\t" | |
| 216 << "0" << "\t" << "none"; | |
| 217 | |
| 218 for (size_t i=0;i<current_depth.size();++i) | |
| 219 output << "\t0"; | |
| 220 | |
| 221 output << endl; | |
| 222 } | |
| 223 | |
| 224 | |
| 225 void MultiIntersectBed::LoadNextItem(int index) { | |
| 226 assert(static_cast<unsigned int>(index) < input_files.size()); | |
| 227 | |
| 228 current_item[index].chrom=""; | |
| 229 | |
| 230 BedFile *file = input_files[index]; | |
| 231 BED merged_bed; | |
| 232 int lineNum = 0; | |
| 233 // | |
| 234 // TO DO: Do the mergeing on the fly. How best to do this? | |
| 235 // | |
| 236 // IDEA: Implement a Merge class with GetNextMerge element. | |
| 237 // | |
| 238 | |
| 239 while (file->GetNextMergedBed(merged_bed, lineNum)) | |
| 240 { | |
| 241 current_item[index] = merged_bed; | |
| 242 break; | |
| 243 } | |
| 244 } | |
| 245 | |
| 246 | |
| 247 bool MultiIntersectBed::AllFilesDone() { | |
| 248 for (size_t i=0;i<current_item.size();++i) | |
| 249 if (!current_item[i].chrom.empty()) | |
| 250 return false; | |
| 251 return true; | |
| 252 } | |
| 253 | |
| 254 | |
| 255 string MultiIntersectBed::DetermineNextChrom() { | |
| 256 string next_chrom; | |
| 257 for (size_t i=0;i<current_item.size();++i) { | |
| 258 if (current_item[i].chrom.empty()) | |
| 259 continue; | |
| 260 | |
| 261 if (next_chrom.empty()) | |
| 262 next_chrom = current_item[i].chrom; | |
| 263 else | |
| 264 if (current_item[i].chrom < next_chrom) | |
| 265 next_chrom = current_item[i].chrom ; | |
| 266 } | |
| 267 return next_chrom; | |
| 268 } | |
| 269 | |
| 270 | |
| 271 void MultiIntersectBed::OpenFiles() { | |
| 272 for (size_t i = 0; i < filenames.size(); ++i) { | |
| 273 BedFile *file = new BedFile(filenames[i]); | |
| 274 file->Open(); | |
| 275 input_files.push_back(file); | |
| 276 current_depth.push_back(0); | |
| 277 } | |
| 278 current_item.resize(filenames.size()); | |
| 279 } | |
| 280 | |
| 281 | |
| 282 void MultiIntersectBed::CloseFiles() { | |
| 283 for (size_t i=0; i < input_files.size(); ++i) { | |
| 284 BedFile *file = input_files[i]; | |
| 285 delete file; | |
| 286 input_files[i] = NULL ; | |
| 287 } | |
| 288 input_files.clear(); | |
| 289 } |
