comparison BEDTools-Version-2.14.3/src/genomeCoverageBed/genomeCoverageBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dfcd8b6c1bda
1 /*****************************************************************************
2 genomeCoverage.cpp
3
4 (c) 2009 - Aaron Quinlan
5 Hall Laboratory
6 Department of Biochemistry and Molecular Genetics
7 University of Virginia
8 aaronquinlan@gmail.com
9
10 Licenced under the GNU General Public License 2.0 license.
11 ******************************************************************************/
12 #include "lineFileUtilities.h"
13 #include "genomeCoverageBed.h"
14
15
16 BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile,
17 bool eachBase, bool startSites,
18 bool bedGraph, bool bedGraphAll,
19 int max, float scale,
20 bool bamInput, bool obeySplits,
21 bool filterByStrand, string requestedStrand,
22 bool only_5p_end, bool only_3p_end,
23 bool eachBaseZeroBased,
24 bool add_gb_track_line, string gb_track_line_opts) {
25
26 _bedFile = bedFile;
27 _genomeFile = genomeFile;
28 _eachBase = eachBase;
29 _eachBaseZeroBased = eachBaseZeroBased;
30 _startSites = startSites;
31 _bedGraph = bedGraph;
32 _bedGraphAll = bedGraphAll;
33 _max = max;
34 _scale = scale;
35 _bamInput = bamInput;
36 _obeySplits = obeySplits;
37 _filterByStrand = filterByStrand;
38 _requestedStrand = requestedStrand;
39 _only_3p_end = only_3p_end;
40 _only_5p_end = only_5p_end;
41 _add_gb_track_line = add_gb_track_line;
42 _gb_track_line_opts = gb_track_line_opts;
43 _currChromName = "";
44 _currChromSize = 0 ;
45
46
47 if (_bamInput == false) {
48 _genome = new GenomeFile(genomeFile);
49 }
50
51 PrintTrackDefinitionLine();
52
53 if (_bamInput == false) {
54 _bed = new BedFile(bedFile);
55 CoverageBed();
56 }
57 else {
58 CoverageBam(_bedFile);
59 }
60 }
61
62 void BedGenomeCoverage::PrintTrackDefinitionLine()
63 {
64 //Print Track Definition line (if requested)
65 if ( (_bedGraph||_bedGraphAll) && _add_gb_track_line) {
66 string line = "track type=bedGraph";
67 if (!_gb_track_line_opts.empty()) {
68 line += " " ;
69 line += _gb_track_line_opts ;
70 }
71 cout << line << endl;
72 }
73
74 }
75
76
77 BedGenomeCoverage::~BedGenomeCoverage(void) {
78 delete _bed;
79 delete _genome;
80 }
81
82
83 void BedGenomeCoverage::ResetChromCoverage() {
84 _currChromName = "";
85 _currChromSize = 0 ;
86 std::vector<DEPTH>().swap(_currChromCoverage);
87 }
88
89
90 void BedGenomeCoverage::StartNewChrom(const string& newChrom) {
91 // If we've moved beyond the first encountered chromosomes,
92 // process the results of the previous chromosome.
93 if (_currChromName.length() > 0) {
94 ReportChromCoverage(_currChromCoverage, _currChromSize,
95 _currChromName, _currChromDepthHist);
96 }
97
98 // empty the previous chromosome and reserve new
99 std::vector<DEPTH>().swap(_currChromCoverage);
100
101 if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) {
102 cerr << "Input error: Chromosome " << _currChromName
103 << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl;
104
105 }
106 _visitedChromosomes.insert(newChrom);
107
108 _currChromName = newChrom;
109
110 // get the current chrom size and allocate space
111 _currChromSize = _genome->getChromSize(_currChromName);
112
113 if (_currChromSize >= 0)
114 _currChromCoverage.resize(_currChromSize);
115 else {
116 cerr << "Input error: Chromosome " << _currChromName << " found in your input file but not in your genome file." << endl;
117 exit(1);
118 }
119 }
120
121
122 void BedGenomeCoverage::AddCoverage(int start, int end) {
123 // process the first line for this chromosome.
124 // make sure the coordinates fit within the chrom
125 if (start < _currChromSize)
126 _currChromCoverage[start].starts++;
127 if (end < _currChromSize)
128 _currChromCoverage[end].ends++;
129 else
130 _currChromCoverage[_currChromSize-1].ends++;
131 }
132
133
134 void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) {
135 vector<BED>::const_iterator bedItr = bedBlocks.begin();
136 vector<BED>::const_iterator bedEnd = bedBlocks.end();
137 for (; bedItr != bedEnd; ++bedItr) {
138 // the end - 1 must be done because BamAncillary::getBamBlocks
139 // returns ends uncorrected for the genomeCoverageBed data structure.
140 // ugly, but necessary.
141 AddCoverage(bedItr->start, bedItr->end - 1);
142 }
143 }
144
145
146 void BedGenomeCoverage::CoverageBed() {
147
148 BED a, nullBed;
149 int lineNum = 0; // current input line number
150 BedLineStatus bedStatus;
151
152 ResetChromCoverage();
153
154 _bed->Open();
155 while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) {
156 if (bedStatus == BED_VALID) {
157 if (_filterByStrand == true) {
158 if (a.strand.empty()) {
159 cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl;
160 exit(1);
161 }
162 if ( ! (a.strand == "-" || a.strand == "+") ) {
163 cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl;
164 exit(1);
165 }
166 // skip if the strand is not what the user requested.
167 if (a.strand != _requestedStrand)
168 continue;
169 }
170
171 // are we on a new chromosome?
172 if (a.chrom != _currChromName)
173 StartNewChrom(a.chrom);
174
175 if (_obeySplits == true) {
176 bedVector bedBlocks; // vec to store the discrete BED "blocks"
177 splitBedIntoBlocks(a, lineNum, bedBlocks);
178 AddBlockedCoverage(bedBlocks);
179 }
180 else if (_only_5p_end) {
181 int pos = ( a.strand=="+" ) ? a.start : a.end-1;
182 AddCoverage(pos,pos);
183 }
184 else if (_only_3p_end) {
185 int pos = ( a.strand=="-" ) ? a.start : a.end-1;
186 AddCoverage(pos,pos);
187 }
188 else
189 AddCoverage(a.start, a.end-1);
190 }
191 }
192 _bed->Close();
193 PrintFinalCoverage();
194 }
195
196
197 void BedGenomeCoverage::PrintFinalCoverage()
198 {
199
200
201 // process the results of the last chromosome.
202 ReportChromCoverage(_currChromCoverage, _currChromSize,
203 _currChromName, _currChromDepthHist);
204 if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) {
205 ReportGenomeCoverage(_currChromDepthHist);
206 }
207 }
208
209
210 void BedGenomeCoverage::CoverageBam(string bamFile) {
211
212 ResetChromCoverage();
213
214 // open the BAM file
215 BamReader reader;
216 reader.Open(bamFile);
217
218 // get header & reference information
219 string header = reader.GetHeaderText();
220 RefVector refs = reader.GetReferenceData();
221
222 // load the BAM header references into a BEDTools "genome file"
223 _genome = new GenomeFile(refs);
224 // convert each aligned BAM entry to BED
225 // and compute coverage on B
226 BamAlignment bam;
227 while (reader.GetNextAlignment(bam)) {
228 // skip if the read is unaligned
229 if (bam.IsMapped() == false)
230 continue;
231
232 // skip if we care about strands and the strand isn't what
233 // the user wanted
234 if ( (_filterByStrand == true) &&
235 ((_requestedStrand == "-") != bam.IsReverseStrand()) )
236 continue;
237
238 // extract the chrom, start and end from the BAM alignment
239 string chrom(refs.at(bam.RefID).RefName);
240 CHRPOS start = bam.Position;
241 CHRPOS end = bam.GetEndPosition(false, false) - 1;
242
243 // are we on a new chromosome?
244 if ( chrom != _currChromName )
245 StartNewChrom(chrom);
246
247 // add coverage accordingly.
248 if (_obeySplits) {
249 bedVector bedBlocks;
250 // since we are counting coverage, we do want to split blocks when a
251 // deletion (D) CIGAR op is encountered (hence the true for the last parm)
252 getBamBlocks(bam, refs, bedBlocks, true);
253 AddBlockedCoverage(bedBlocks);
254 }
255 else if (_only_5p_end) {
256 int pos = ( !bam.IsReverseStrand() ) ? start : end;
257 AddCoverage(pos,pos);
258 }
259 else if (_only_3p_end) {
260 int pos = ( bam.IsReverseStrand() ) ? start : end;
261 AddCoverage(pos,pos);
262 }
263 else
264 AddCoverage(start, end);
265 }
266 // close the BAM
267 reader.Close();
268 PrintFinalCoverage();
269 }
270
271
272 void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) {
273
274 if (_eachBase) {
275 int depth = 0; // initialize the depth
276 int offset = (_eachBaseZeroBased)?0:1;
277 for (int pos = 0; pos < chromSize; pos++) {
278
279 depth += chromCov[pos].starts;
280 // report the depth for this position.
281 if (depth>0 || !_eachBaseZeroBased)
282 cout << chrom << "\t" << pos+offset << "\t" << depth * _scale << endl;
283 depth = depth - chromCov[pos].ends;
284 }
285 }
286 else if (_bedGraph == true || _bedGraphAll == true) {
287 ReportChromCoverageBedGraph(chromCov, chromSize, chrom);
288 }
289 else {
290
291 int depth = 0; // initialize the depth
292
293 for (int pos = 0; pos < chromSize; pos++) {
294
295 depth += chromCov[pos].starts;
296
297 // add the depth at this position to the depth histogram
298 // for this chromosome. if the depth is greater than the
299 // maximum bin requested, then readjust the depth to be the max
300 if (depth >= _max) {
301 chromDepthHist[chrom][_max]++;
302 }
303 else {
304 chromDepthHist[chrom][depth]++;
305 }
306 depth = depth - chromCov[pos].ends;
307 }
308 // report the histogram for each chromosome
309 histMap::const_iterator depthIt = chromDepthHist[chrom].begin();
310 histMap::const_iterator depthEnd = chromDepthHist[chrom].end();
311 for (; depthIt != depthEnd; ++depthIt) {
312 int depth = depthIt->first;
313 unsigned int numBasesAtDepth = depthIt->second;
314 cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t"
315 << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl;
316 }
317 }
318 }
319
320
321
322 void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) {
323
324 // get the list of chromosome names in the genome
325 vector<string> chromList = _genome->getChromList();
326
327 unsigned int genomeSize = 0;
328 vector<string>::const_iterator chromItr = chromList.begin();
329 vector<string>::const_iterator chromEnd = chromList.end();
330 for (; chromItr != chromEnd; ++chromItr) {
331 string chrom = *chromItr;
332 genomeSize += _genome->getChromSize(chrom);
333 // if there were no reads for a give chromosome, then
334 // add the length of the chrom to the 0 bin.
335 if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) {
336 chromDepthHist[chrom][0] += _genome->getChromSize(chrom);
337 }
338 }
339
340 histMap genomeHist; // depth histogram for the entire genome
341
342 // loop through each chromosome and add the depth and number of bases at each depth
343 // to the aggregate histogram for the entire genome
344 for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) {
345 string chrom = chromIt->first;
346 for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) {
347 int depth = depthIt->first;
348 unsigned int numBasesAtDepth = depthIt->second;
349 genomeHist[depth] += numBasesAtDepth;
350 }
351 }
352
353 // loop through the depths for the entire genome
354 // and report the number and fraction of bases in
355 // the entire genome that are at said depth.
356 for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) {
357 int depth = genomeDepthIt->first;
358 unsigned int numBasesAtDepth = genomeDepthIt->second;
359
360 cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t"
361 << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl;
362 }
363 }
364
365
366 void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) {
367
368 int depth = 0; // initialize the depth
369 int lastStart = -1;
370 int lastDepth = -1;
371
372 for (int pos = 0; pos < chromSize; pos++) {
373 depth += chromCov[pos].starts;
374
375 if (depth != lastDepth) {
376 // Coverage depth has changed, print the last interval coverage (if any)
377 // Print if:
378 // (1) depth>0 (the default running mode),
379 // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll)
380 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) {
381 cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth * _scale << endl;
382 }
383 //Set current position as the new interval start + depth
384 lastDepth = depth;
385 lastStart = pos;
386 }
387 // Default: the depth has not changed, so we will not print anything.
388 // Proceed until the depth changes.
389 // Update depth
390 depth = depth - chromCov[pos].ends;
391 }
392 //Print information about the last position
393 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) {
394 cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth * _scale << endl;
395 }
396 }