0
|
1 /*****************************************************************************
|
|
2 genomeCoverage.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "genomeCoverageBed.h"
|
|
14
|
|
15
|
|
16 BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile,
|
|
17 bool eachBase, bool startSites,
|
|
18 bool bedGraph, bool bedGraphAll,
|
|
19 int max, float scale,
|
|
20 bool bamInput, bool obeySplits,
|
|
21 bool filterByStrand, string requestedStrand,
|
|
22 bool only_5p_end, bool only_3p_end,
|
|
23 bool eachBaseZeroBased,
|
|
24 bool add_gb_track_line, string gb_track_line_opts) {
|
|
25
|
|
26 _bedFile = bedFile;
|
|
27 _genomeFile = genomeFile;
|
|
28 _eachBase = eachBase;
|
|
29 _eachBaseZeroBased = eachBaseZeroBased;
|
|
30 _startSites = startSites;
|
|
31 _bedGraph = bedGraph;
|
|
32 _bedGraphAll = bedGraphAll;
|
|
33 _max = max;
|
|
34 _scale = scale;
|
|
35 _bamInput = bamInput;
|
|
36 _obeySplits = obeySplits;
|
|
37 _filterByStrand = filterByStrand;
|
|
38 _requestedStrand = requestedStrand;
|
|
39 _only_3p_end = only_3p_end;
|
|
40 _only_5p_end = only_5p_end;
|
|
41 _add_gb_track_line = add_gb_track_line;
|
|
42 _gb_track_line_opts = gb_track_line_opts;
|
|
43 _currChromName = "";
|
|
44 _currChromSize = 0 ;
|
|
45
|
|
46
|
|
47 if (_bamInput == false) {
|
|
48 _genome = new GenomeFile(genomeFile);
|
|
49 }
|
|
50
|
|
51 PrintTrackDefinitionLine();
|
|
52
|
|
53 if (_bamInput == false) {
|
|
54 _bed = new BedFile(bedFile);
|
|
55 CoverageBed();
|
|
56 }
|
|
57 else {
|
|
58 CoverageBam(_bedFile);
|
|
59 }
|
|
60 }
|
|
61
|
|
62 void BedGenomeCoverage::PrintTrackDefinitionLine()
|
|
63 {
|
|
64 //Print Track Definition line (if requested)
|
|
65 if ( (_bedGraph||_bedGraphAll) && _add_gb_track_line) {
|
|
66 string line = "track type=bedGraph";
|
|
67 if (!_gb_track_line_opts.empty()) {
|
|
68 line += " " ;
|
|
69 line += _gb_track_line_opts ;
|
|
70 }
|
|
71 cout << line << endl;
|
|
72 }
|
|
73
|
|
74 }
|
|
75
|
|
76
|
|
77 BedGenomeCoverage::~BedGenomeCoverage(void) {
|
|
78 delete _bed;
|
|
79 delete _genome;
|
|
80 }
|
|
81
|
|
82
|
|
83 void BedGenomeCoverage::ResetChromCoverage() {
|
|
84 _currChromName = "";
|
|
85 _currChromSize = 0 ;
|
|
86 std::vector<DEPTH>().swap(_currChromCoverage);
|
|
87 }
|
|
88
|
|
89
|
|
90 void BedGenomeCoverage::StartNewChrom(const string& newChrom) {
|
|
91 // If we've moved beyond the first encountered chromosomes,
|
|
92 // process the results of the previous chromosome.
|
|
93 if (_currChromName.length() > 0) {
|
|
94 ReportChromCoverage(_currChromCoverage, _currChromSize,
|
|
95 _currChromName, _currChromDepthHist);
|
|
96 }
|
|
97
|
|
98 // empty the previous chromosome and reserve new
|
|
99 std::vector<DEPTH>().swap(_currChromCoverage);
|
|
100
|
|
101 if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) {
|
|
102 cerr << "Input error: Chromosome " << _currChromName
|
|
103 << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl;
|
|
104
|
|
105 }
|
|
106 _visitedChromosomes.insert(newChrom);
|
|
107
|
|
108 _currChromName = newChrom;
|
|
109
|
|
110 // get the current chrom size and allocate space
|
|
111 _currChromSize = _genome->getChromSize(_currChromName);
|
|
112
|
|
113 if (_currChromSize >= 0)
|
|
114 _currChromCoverage.resize(_currChromSize);
|
|
115 else {
|
|
116 cerr << "Input error: Chromosome " << _currChromName << " found in your input file but not in your genome file." << endl;
|
|
117 exit(1);
|
|
118 }
|
|
119 }
|
|
120
|
|
121
|
|
122 void BedGenomeCoverage::AddCoverage(int start, int end) {
|
|
123 // process the first line for this chromosome.
|
|
124 // make sure the coordinates fit within the chrom
|
|
125 if (start < _currChromSize)
|
|
126 _currChromCoverage[start].starts++;
|
|
127 if (end < _currChromSize)
|
|
128 _currChromCoverage[end].ends++;
|
|
129 else
|
|
130 _currChromCoverage[_currChromSize-1].ends++;
|
|
131 }
|
|
132
|
|
133
|
|
134 void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) {
|
|
135 vector<BED>::const_iterator bedItr = bedBlocks.begin();
|
|
136 vector<BED>::const_iterator bedEnd = bedBlocks.end();
|
|
137 for (; bedItr != bedEnd; ++bedItr) {
|
|
138 // the end - 1 must be done because BamAncillary::getBamBlocks
|
|
139 // returns ends uncorrected for the genomeCoverageBed data structure.
|
|
140 // ugly, but necessary.
|
|
141 AddCoverage(bedItr->start, bedItr->end - 1);
|
|
142 }
|
|
143 }
|
|
144
|
|
145
|
|
146 void BedGenomeCoverage::CoverageBed() {
|
|
147
|
|
148 BED a, nullBed;
|
|
149 int lineNum = 0; // current input line number
|
|
150 BedLineStatus bedStatus;
|
|
151
|
|
152 ResetChromCoverage();
|
|
153
|
|
154 _bed->Open();
|
|
155 while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) {
|
|
156 if (bedStatus == BED_VALID) {
|
|
157 if (_filterByStrand == true) {
|
|
158 if (a.strand.empty()) {
|
|
159 cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl;
|
|
160 exit(1);
|
|
161 }
|
|
162 if ( ! (a.strand == "-" || a.strand == "+") ) {
|
|
163 cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl;
|
|
164 exit(1);
|
|
165 }
|
|
166 // skip if the strand is not what the user requested.
|
|
167 if (a.strand != _requestedStrand)
|
|
168 continue;
|
|
169 }
|
|
170
|
|
171 // are we on a new chromosome?
|
|
172 if (a.chrom != _currChromName)
|
|
173 StartNewChrom(a.chrom);
|
|
174
|
|
175 if (_obeySplits == true) {
|
|
176 bedVector bedBlocks; // vec to store the discrete BED "blocks"
|
|
177 splitBedIntoBlocks(a, lineNum, bedBlocks);
|
|
178 AddBlockedCoverage(bedBlocks);
|
|
179 }
|
|
180 else if (_only_5p_end) {
|
|
181 int pos = ( a.strand=="+" ) ? a.start : a.end-1;
|
|
182 AddCoverage(pos,pos);
|
|
183 }
|
|
184 else if (_only_3p_end) {
|
|
185 int pos = ( a.strand=="-" ) ? a.start : a.end-1;
|
|
186 AddCoverage(pos,pos);
|
|
187 }
|
|
188 else
|
|
189 AddCoverage(a.start, a.end-1);
|
|
190 }
|
|
191 }
|
|
192 _bed->Close();
|
|
193 PrintFinalCoverage();
|
|
194 }
|
|
195
|
|
196
|
|
197 void BedGenomeCoverage::PrintFinalCoverage()
|
|
198 {
|
|
199
|
|
200
|
|
201 // process the results of the last chromosome.
|
|
202 ReportChromCoverage(_currChromCoverage, _currChromSize,
|
|
203 _currChromName, _currChromDepthHist);
|
|
204 if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) {
|
|
205 ReportGenomeCoverage(_currChromDepthHist);
|
|
206 }
|
|
207 }
|
|
208
|
|
209
|
|
210 void BedGenomeCoverage::CoverageBam(string bamFile) {
|
|
211
|
|
212 ResetChromCoverage();
|
|
213
|
|
214 // open the BAM file
|
|
215 BamReader reader;
|
|
216 reader.Open(bamFile);
|
|
217
|
|
218 // get header & reference information
|
|
219 string header = reader.GetHeaderText();
|
|
220 RefVector refs = reader.GetReferenceData();
|
|
221
|
|
222 // load the BAM header references into a BEDTools "genome file"
|
|
223 _genome = new GenomeFile(refs);
|
|
224 // convert each aligned BAM entry to BED
|
|
225 // and compute coverage on B
|
|
226 BamAlignment bam;
|
|
227 while (reader.GetNextAlignment(bam)) {
|
|
228 // skip if the read is unaligned
|
|
229 if (bam.IsMapped() == false)
|
|
230 continue;
|
|
231
|
|
232 // skip if we care about strands and the strand isn't what
|
|
233 // the user wanted
|
|
234 if ( (_filterByStrand == true) &&
|
|
235 ((_requestedStrand == "-") != bam.IsReverseStrand()) )
|
|
236 continue;
|
|
237
|
|
238 // extract the chrom, start and end from the BAM alignment
|
|
239 string chrom(refs.at(bam.RefID).RefName);
|
|
240 CHRPOS start = bam.Position;
|
|
241 CHRPOS end = bam.GetEndPosition(false, false) - 1;
|
|
242
|
|
243 // are we on a new chromosome?
|
|
244 if ( chrom != _currChromName )
|
|
245 StartNewChrom(chrom);
|
|
246
|
|
247 // add coverage accordingly.
|
|
248 if (_obeySplits) {
|
|
249 bedVector bedBlocks;
|
|
250 // since we are counting coverage, we do want to split blocks when a
|
|
251 // deletion (D) CIGAR op is encountered (hence the true for the last parm)
|
|
252 getBamBlocks(bam, refs, bedBlocks, true);
|
|
253 AddBlockedCoverage(bedBlocks);
|
|
254 }
|
|
255 else if (_only_5p_end) {
|
|
256 int pos = ( !bam.IsReverseStrand() ) ? start : end;
|
|
257 AddCoverage(pos,pos);
|
|
258 }
|
|
259 else if (_only_3p_end) {
|
|
260 int pos = ( bam.IsReverseStrand() ) ? start : end;
|
|
261 AddCoverage(pos,pos);
|
|
262 }
|
|
263 else
|
|
264 AddCoverage(start, end);
|
|
265 }
|
|
266 // close the BAM
|
|
267 reader.Close();
|
|
268 PrintFinalCoverage();
|
|
269 }
|
|
270
|
|
271
|
|
272 void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) {
|
|
273
|
|
274 if (_eachBase) {
|
|
275 int depth = 0; // initialize the depth
|
|
276 int offset = (_eachBaseZeroBased)?0:1;
|
|
277 for (int pos = 0; pos < chromSize; pos++) {
|
|
278
|
|
279 depth += chromCov[pos].starts;
|
|
280 // report the depth for this position.
|
|
281 if (depth>0 || !_eachBaseZeroBased)
|
|
282 cout << chrom << "\t" << pos+offset << "\t" << depth * _scale << endl;
|
|
283 depth = depth - chromCov[pos].ends;
|
|
284 }
|
|
285 }
|
|
286 else if (_bedGraph == true || _bedGraphAll == true) {
|
|
287 ReportChromCoverageBedGraph(chromCov, chromSize, chrom);
|
|
288 }
|
|
289 else {
|
|
290
|
|
291 int depth = 0; // initialize the depth
|
|
292
|
|
293 for (int pos = 0; pos < chromSize; pos++) {
|
|
294
|
|
295 depth += chromCov[pos].starts;
|
|
296
|
|
297 // add the depth at this position to the depth histogram
|
|
298 // for this chromosome. if the depth is greater than the
|
|
299 // maximum bin requested, then readjust the depth to be the max
|
|
300 if (depth >= _max) {
|
|
301 chromDepthHist[chrom][_max]++;
|
|
302 }
|
|
303 else {
|
|
304 chromDepthHist[chrom][depth]++;
|
|
305 }
|
|
306 depth = depth - chromCov[pos].ends;
|
|
307 }
|
|
308 // report the histogram for each chromosome
|
|
309 histMap::const_iterator depthIt = chromDepthHist[chrom].begin();
|
|
310 histMap::const_iterator depthEnd = chromDepthHist[chrom].end();
|
|
311 for (; depthIt != depthEnd; ++depthIt) {
|
|
312 int depth = depthIt->first;
|
|
313 unsigned int numBasesAtDepth = depthIt->second;
|
|
314 cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t"
|
|
315 << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl;
|
|
316 }
|
|
317 }
|
|
318 }
|
|
319
|
|
320
|
|
321
|
|
322 void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) {
|
|
323
|
|
324 // get the list of chromosome names in the genome
|
|
325 vector<string> chromList = _genome->getChromList();
|
|
326
|
|
327 unsigned int genomeSize = 0;
|
|
328 vector<string>::const_iterator chromItr = chromList.begin();
|
|
329 vector<string>::const_iterator chromEnd = chromList.end();
|
|
330 for (; chromItr != chromEnd; ++chromItr) {
|
|
331 string chrom = *chromItr;
|
|
332 genomeSize += _genome->getChromSize(chrom);
|
|
333 // if there were no reads for a give chromosome, then
|
|
334 // add the length of the chrom to the 0 bin.
|
|
335 if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) {
|
|
336 chromDepthHist[chrom][0] += _genome->getChromSize(chrom);
|
|
337 }
|
|
338 }
|
|
339
|
|
340 histMap genomeHist; // depth histogram for the entire genome
|
|
341
|
|
342 // loop through each chromosome and add the depth and number of bases at each depth
|
|
343 // to the aggregate histogram for the entire genome
|
|
344 for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) {
|
|
345 string chrom = chromIt->first;
|
|
346 for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) {
|
|
347 int depth = depthIt->first;
|
|
348 unsigned int numBasesAtDepth = depthIt->second;
|
|
349 genomeHist[depth] += numBasesAtDepth;
|
|
350 }
|
|
351 }
|
|
352
|
|
353 // loop through the depths for the entire genome
|
|
354 // and report the number and fraction of bases in
|
|
355 // the entire genome that are at said depth.
|
|
356 for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) {
|
|
357 int depth = genomeDepthIt->first;
|
|
358 unsigned int numBasesAtDepth = genomeDepthIt->second;
|
|
359
|
|
360 cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t"
|
|
361 << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl;
|
|
362 }
|
|
363 }
|
|
364
|
|
365
|
|
366 void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) {
|
|
367
|
|
368 int depth = 0; // initialize the depth
|
|
369 int lastStart = -1;
|
|
370 int lastDepth = -1;
|
|
371
|
|
372 for (int pos = 0; pos < chromSize; pos++) {
|
|
373 depth += chromCov[pos].starts;
|
|
374
|
|
375 if (depth != lastDepth) {
|
|
376 // Coverage depth has changed, print the last interval coverage (if any)
|
|
377 // Print if:
|
|
378 // (1) depth>0 (the default running mode),
|
|
379 // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll)
|
|
380 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) {
|
|
381 cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth * _scale << endl;
|
|
382 }
|
|
383 //Set current position as the new interval start + depth
|
|
384 lastDepth = depth;
|
|
385 lastStart = pos;
|
|
386 }
|
|
387 // Default: the depth has not changed, so we will not print anything.
|
|
388 // Proceed until the depth changes.
|
|
389 // Update depth
|
|
390 depth = depth - chromCov[pos].ends;
|
|
391 }
|
|
392 //Print information about the last position
|
|
393 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) {
|
|
394 cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth * _scale << endl;
|
|
395 }
|
|
396 }
|