Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/genomeCoverageBed/genomeCoverageBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
| author | aaronquinlan |
|---|---|
| date | Sat, 19 Nov 2011 14:17:03 -0500 |
| parents | dfcd8b6c1bda |
| children |
comparison
equal
deleted
inserted
replaced
| 0:dfcd8b6c1bda | 1:bec36315bd12 |
|---|---|
| 1 /***************************************************************************** | |
| 2 genomeCoverage.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "genomeCoverageBed.h" | |
| 14 | |
| 15 | |
| 16 BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, | |
| 17 bool eachBase, bool startSites, | |
| 18 bool bedGraph, bool bedGraphAll, | |
| 19 int max, float scale, | |
| 20 bool bamInput, bool obeySplits, | |
| 21 bool filterByStrand, string requestedStrand, | |
| 22 bool only_5p_end, bool only_3p_end, | |
| 23 bool eachBaseZeroBased, | |
| 24 bool add_gb_track_line, string gb_track_line_opts) { | |
| 25 | |
| 26 _bedFile = bedFile; | |
| 27 _genomeFile = genomeFile; | |
| 28 _eachBase = eachBase; | |
| 29 _eachBaseZeroBased = eachBaseZeroBased; | |
| 30 _startSites = startSites; | |
| 31 _bedGraph = bedGraph; | |
| 32 _bedGraphAll = bedGraphAll; | |
| 33 _max = max; | |
| 34 _scale = scale; | |
| 35 _bamInput = bamInput; | |
| 36 _obeySplits = obeySplits; | |
| 37 _filterByStrand = filterByStrand; | |
| 38 _requestedStrand = requestedStrand; | |
| 39 _only_3p_end = only_3p_end; | |
| 40 _only_5p_end = only_5p_end; | |
| 41 _add_gb_track_line = add_gb_track_line; | |
| 42 _gb_track_line_opts = gb_track_line_opts; | |
| 43 _currChromName = ""; | |
| 44 _currChromSize = 0 ; | |
| 45 | |
| 46 | |
| 47 if (_bamInput == false) { | |
| 48 _genome = new GenomeFile(genomeFile); | |
| 49 } | |
| 50 | |
| 51 PrintTrackDefinitionLine(); | |
| 52 | |
| 53 if (_bamInput == false) { | |
| 54 _bed = new BedFile(bedFile); | |
| 55 CoverageBed(); | |
| 56 } | |
| 57 else { | |
| 58 CoverageBam(_bedFile); | |
| 59 } | |
| 60 } | |
| 61 | |
| 62 void BedGenomeCoverage::PrintTrackDefinitionLine() | |
| 63 { | |
| 64 //Print Track Definition line (if requested) | |
| 65 if ( (_bedGraph||_bedGraphAll) && _add_gb_track_line) { | |
| 66 string line = "track type=bedGraph"; | |
| 67 if (!_gb_track_line_opts.empty()) { | |
| 68 line += " " ; | |
| 69 line += _gb_track_line_opts ; | |
| 70 } | |
| 71 cout << line << endl; | |
| 72 } | |
| 73 | |
| 74 } | |
| 75 | |
| 76 | |
| 77 BedGenomeCoverage::~BedGenomeCoverage(void) { | |
| 78 delete _bed; | |
| 79 delete _genome; | |
| 80 } | |
| 81 | |
| 82 | |
| 83 void BedGenomeCoverage::ResetChromCoverage() { | |
| 84 _currChromName = ""; | |
| 85 _currChromSize = 0 ; | |
| 86 std::vector<DEPTH>().swap(_currChromCoverage); | |
| 87 } | |
| 88 | |
| 89 | |
| 90 void BedGenomeCoverage::StartNewChrom(const string& newChrom) { | |
| 91 // If we've moved beyond the first encountered chromosomes, | |
| 92 // process the results of the previous chromosome. | |
| 93 if (_currChromName.length() > 0) { | |
| 94 ReportChromCoverage(_currChromCoverage, _currChromSize, | |
| 95 _currChromName, _currChromDepthHist); | |
| 96 } | |
| 97 | |
| 98 // empty the previous chromosome and reserve new | |
| 99 std::vector<DEPTH>().swap(_currChromCoverage); | |
| 100 | |
| 101 if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) { | |
| 102 cerr << "Input error: Chromosome " << _currChromName | |
| 103 << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl; | |
| 104 | |
| 105 } | |
| 106 _visitedChromosomes.insert(newChrom); | |
| 107 | |
| 108 _currChromName = newChrom; | |
| 109 | |
| 110 // get the current chrom size and allocate space | |
| 111 _currChromSize = _genome->getChromSize(_currChromName); | |
| 112 | |
| 113 if (_currChromSize >= 0) | |
| 114 _currChromCoverage.resize(_currChromSize); | |
| 115 else { | |
| 116 cerr << "Input error: Chromosome " << _currChromName << " found in your input file but not in your genome file." << endl; | |
| 117 exit(1); | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 | |
| 122 void BedGenomeCoverage::AddCoverage(int start, int end) { | |
| 123 // process the first line for this chromosome. | |
| 124 // make sure the coordinates fit within the chrom | |
| 125 if (start < _currChromSize) | |
| 126 _currChromCoverage[start].starts++; | |
| 127 if (end < _currChromSize) | |
| 128 _currChromCoverage[end].ends++; | |
| 129 else | |
| 130 _currChromCoverage[_currChromSize-1].ends++; | |
| 131 } | |
| 132 | |
| 133 | |
| 134 void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) { | |
| 135 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
| 136 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
| 137 for (; bedItr != bedEnd; ++bedItr) { | |
| 138 // the end - 1 must be done because BamAncillary::getBamBlocks | |
| 139 // returns ends uncorrected for the genomeCoverageBed data structure. | |
| 140 // ugly, but necessary. | |
| 141 AddCoverage(bedItr->start, bedItr->end - 1); | |
| 142 } | |
| 143 } | |
| 144 | |
| 145 | |
| 146 void BedGenomeCoverage::CoverageBed() { | |
| 147 | |
| 148 BED a, nullBed; | |
| 149 int lineNum = 0; // current input line number | |
| 150 BedLineStatus bedStatus; | |
| 151 | |
| 152 ResetChromCoverage(); | |
| 153 | |
| 154 _bed->Open(); | |
| 155 while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) { | |
| 156 if (bedStatus == BED_VALID) { | |
| 157 if (_filterByStrand == true) { | |
| 158 if (a.strand.empty()) { | |
| 159 cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl; | |
| 160 exit(1); | |
| 161 } | |
| 162 if ( ! (a.strand == "-" || a.strand == "+") ) { | |
| 163 cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl; | |
| 164 exit(1); | |
| 165 } | |
| 166 // skip if the strand is not what the user requested. | |
| 167 if (a.strand != _requestedStrand) | |
| 168 continue; | |
| 169 } | |
| 170 | |
| 171 // are we on a new chromosome? | |
| 172 if (a.chrom != _currChromName) | |
| 173 StartNewChrom(a.chrom); | |
| 174 | |
| 175 if (_obeySplits == true) { | |
| 176 bedVector bedBlocks; // vec to store the discrete BED "blocks" | |
| 177 splitBedIntoBlocks(a, lineNum, bedBlocks); | |
| 178 AddBlockedCoverage(bedBlocks); | |
| 179 } | |
| 180 else if (_only_5p_end) { | |
| 181 int pos = ( a.strand=="+" ) ? a.start : a.end-1; | |
| 182 AddCoverage(pos,pos); | |
| 183 } | |
| 184 else if (_only_3p_end) { | |
| 185 int pos = ( a.strand=="-" ) ? a.start : a.end-1; | |
| 186 AddCoverage(pos,pos); | |
| 187 } | |
| 188 else | |
| 189 AddCoverage(a.start, a.end-1); | |
| 190 } | |
| 191 } | |
| 192 _bed->Close(); | |
| 193 PrintFinalCoverage(); | |
| 194 } | |
| 195 | |
| 196 | |
| 197 void BedGenomeCoverage::PrintFinalCoverage() | |
| 198 { | |
| 199 | |
| 200 | |
| 201 // process the results of the last chromosome. | |
| 202 ReportChromCoverage(_currChromCoverage, _currChromSize, | |
| 203 _currChromName, _currChromDepthHist); | |
| 204 if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { | |
| 205 ReportGenomeCoverage(_currChromDepthHist); | |
| 206 } | |
| 207 } | |
| 208 | |
| 209 | |
| 210 void BedGenomeCoverage::CoverageBam(string bamFile) { | |
| 211 | |
| 212 ResetChromCoverage(); | |
| 213 | |
| 214 // open the BAM file | |
| 215 BamReader reader; | |
| 216 reader.Open(bamFile); | |
| 217 | |
| 218 // get header & reference information | |
| 219 string header = reader.GetHeaderText(); | |
| 220 RefVector refs = reader.GetReferenceData(); | |
| 221 | |
| 222 // load the BAM header references into a BEDTools "genome file" | |
| 223 _genome = new GenomeFile(refs); | |
| 224 // convert each aligned BAM entry to BED | |
| 225 // and compute coverage on B | |
| 226 BamAlignment bam; | |
| 227 while (reader.GetNextAlignment(bam)) { | |
| 228 // skip if the read is unaligned | |
| 229 if (bam.IsMapped() == false) | |
| 230 continue; | |
| 231 | |
| 232 // skip if we care about strands and the strand isn't what | |
| 233 // the user wanted | |
| 234 if ( (_filterByStrand == true) && | |
| 235 ((_requestedStrand == "-") != bam.IsReverseStrand()) ) | |
| 236 continue; | |
| 237 | |
| 238 // extract the chrom, start and end from the BAM alignment | |
| 239 string chrom(refs.at(bam.RefID).RefName); | |
| 240 CHRPOS start = bam.Position; | |
| 241 CHRPOS end = bam.GetEndPosition(false, false) - 1; | |
| 242 | |
| 243 // are we on a new chromosome? | |
| 244 if ( chrom != _currChromName ) | |
| 245 StartNewChrom(chrom); | |
| 246 | |
| 247 // add coverage accordingly. | |
| 248 if (_obeySplits) { | |
| 249 bedVector bedBlocks; | |
| 250 // since we are counting coverage, we do want to split blocks when a | |
| 251 // deletion (D) CIGAR op is encountered (hence the true for the last parm) | |
| 252 getBamBlocks(bam, refs, bedBlocks, true); | |
| 253 AddBlockedCoverage(bedBlocks); | |
| 254 } | |
| 255 else if (_only_5p_end) { | |
| 256 int pos = ( !bam.IsReverseStrand() ) ? start : end; | |
| 257 AddCoverage(pos,pos); | |
| 258 } | |
| 259 else if (_only_3p_end) { | |
| 260 int pos = ( bam.IsReverseStrand() ) ? start : end; | |
| 261 AddCoverage(pos,pos); | |
| 262 } | |
| 263 else | |
| 264 AddCoverage(start, end); | |
| 265 } | |
| 266 // close the BAM | |
| 267 reader.Close(); | |
| 268 PrintFinalCoverage(); | |
| 269 } | |
| 270 | |
| 271 | |
| 272 void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) { | |
| 273 | |
| 274 if (_eachBase) { | |
| 275 int depth = 0; // initialize the depth | |
| 276 int offset = (_eachBaseZeroBased)?0:1; | |
| 277 for (int pos = 0; pos < chromSize; pos++) { | |
| 278 | |
| 279 depth += chromCov[pos].starts; | |
| 280 // report the depth for this position. | |
| 281 if (depth>0 || !_eachBaseZeroBased) | |
| 282 cout << chrom << "\t" << pos+offset << "\t" << depth * _scale << endl; | |
| 283 depth = depth - chromCov[pos].ends; | |
| 284 } | |
| 285 } | |
| 286 else if (_bedGraph == true || _bedGraphAll == true) { | |
| 287 ReportChromCoverageBedGraph(chromCov, chromSize, chrom); | |
| 288 } | |
| 289 else { | |
| 290 | |
| 291 int depth = 0; // initialize the depth | |
| 292 | |
| 293 for (int pos = 0; pos < chromSize; pos++) { | |
| 294 | |
| 295 depth += chromCov[pos].starts; | |
| 296 | |
| 297 // add the depth at this position to the depth histogram | |
| 298 // for this chromosome. if the depth is greater than the | |
| 299 // maximum bin requested, then readjust the depth to be the max | |
| 300 if (depth >= _max) { | |
| 301 chromDepthHist[chrom][_max]++; | |
| 302 } | |
| 303 else { | |
| 304 chromDepthHist[chrom][depth]++; | |
| 305 } | |
| 306 depth = depth - chromCov[pos].ends; | |
| 307 } | |
| 308 // report the histogram for each chromosome | |
| 309 histMap::const_iterator depthIt = chromDepthHist[chrom].begin(); | |
| 310 histMap::const_iterator depthEnd = chromDepthHist[chrom].end(); | |
| 311 for (; depthIt != depthEnd; ++depthIt) { | |
| 312 int depth = depthIt->first; | |
| 313 unsigned int numBasesAtDepth = depthIt->second; | |
| 314 cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t" | |
| 315 << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl; | |
| 316 } | |
| 317 } | |
| 318 } | |
| 319 | |
| 320 | |
| 321 | |
| 322 void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) { | |
| 323 | |
| 324 // get the list of chromosome names in the genome | |
| 325 vector<string> chromList = _genome->getChromList(); | |
| 326 | |
| 327 unsigned int genomeSize = 0; | |
| 328 vector<string>::const_iterator chromItr = chromList.begin(); | |
| 329 vector<string>::const_iterator chromEnd = chromList.end(); | |
| 330 for (; chromItr != chromEnd; ++chromItr) { | |
| 331 string chrom = *chromItr; | |
| 332 genomeSize += _genome->getChromSize(chrom); | |
| 333 // if there were no reads for a give chromosome, then | |
| 334 // add the length of the chrom to the 0 bin. | |
| 335 if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { | |
| 336 chromDepthHist[chrom][0] += _genome->getChromSize(chrom); | |
| 337 } | |
| 338 } | |
| 339 | |
| 340 histMap genomeHist; // depth histogram for the entire genome | |
| 341 | |
| 342 // loop through each chromosome and add the depth and number of bases at each depth | |
| 343 // to the aggregate histogram for the entire genome | |
| 344 for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) { | |
| 345 string chrom = chromIt->first; | |
| 346 for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) { | |
| 347 int depth = depthIt->first; | |
| 348 unsigned int numBasesAtDepth = depthIt->second; | |
| 349 genomeHist[depth] += numBasesAtDepth; | |
| 350 } | |
| 351 } | |
| 352 | |
| 353 // loop through the depths for the entire genome | |
| 354 // and report the number and fraction of bases in | |
| 355 // the entire genome that are at said depth. | |
| 356 for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) { | |
| 357 int depth = genomeDepthIt->first; | |
| 358 unsigned int numBasesAtDepth = genomeDepthIt->second; | |
| 359 | |
| 360 cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t" | |
| 361 << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl; | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 | |
| 366 void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) { | |
| 367 | |
| 368 int depth = 0; // initialize the depth | |
| 369 int lastStart = -1; | |
| 370 int lastDepth = -1; | |
| 371 | |
| 372 for (int pos = 0; pos < chromSize; pos++) { | |
| 373 depth += chromCov[pos].starts; | |
| 374 | |
| 375 if (depth != lastDepth) { | |
| 376 // Coverage depth has changed, print the last interval coverage (if any) | |
| 377 // Print if: | |
| 378 // (1) depth>0 (the default running mode), | |
| 379 // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll) | |
| 380 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { | |
| 381 cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth * _scale << endl; | |
| 382 } | |
| 383 //Set current position as the new interval start + depth | |
| 384 lastDepth = depth; | |
| 385 lastStart = pos; | |
| 386 } | |
| 387 // Default: the depth has not changed, so we will not print anything. | |
| 388 // Proceed until the depth changes. | |
| 389 // Update depth | |
| 390 depth = depth - chromCov[pos].ends; | |
| 391 } | |
| 392 //Print information about the last position | |
| 393 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { | |
| 394 cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth * _scale << endl; | |
| 395 } | |
| 396 } |
