Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/genomeCoverageBed/genomeCoverageBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dfcd8b6c1bda |
---|---|
1 /***************************************************************************** | |
2 genomeCoverage.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "genomeCoverageBed.h" | |
14 | |
15 | |
16 BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, | |
17 bool eachBase, bool startSites, | |
18 bool bedGraph, bool bedGraphAll, | |
19 int max, float scale, | |
20 bool bamInput, bool obeySplits, | |
21 bool filterByStrand, string requestedStrand, | |
22 bool only_5p_end, bool only_3p_end, | |
23 bool eachBaseZeroBased, | |
24 bool add_gb_track_line, string gb_track_line_opts) { | |
25 | |
26 _bedFile = bedFile; | |
27 _genomeFile = genomeFile; | |
28 _eachBase = eachBase; | |
29 _eachBaseZeroBased = eachBaseZeroBased; | |
30 _startSites = startSites; | |
31 _bedGraph = bedGraph; | |
32 _bedGraphAll = bedGraphAll; | |
33 _max = max; | |
34 _scale = scale; | |
35 _bamInput = bamInput; | |
36 _obeySplits = obeySplits; | |
37 _filterByStrand = filterByStrand; | |
38 _requestedStrand = requestedStrand; | |
39 _only_3p_end = only_3p_end; | |
40 _only_5p_end = only_5p_end; | |
41 _add_gb_track_line = add_gb_track_line; | |
42 _gb_track_line_opts = gb_track_line_opts; | |
43 _currChromName = ""; | |
44 _currChromSize = 0 ; | |
45 | |
46 | |
47 if (_bamInput == false) { | |
48 _genome = new GenomeFile(genomeFile); | |
49 } | |
50 | |
51 PrintTrackDefinitionLine(); | |
52 | |
53 if (_bamInput == false) { | |
54 _bed = new BedFile(bedFile); | |
55 CoverageBed(); | |
56 } | |
57 else { | |
58 CoverageBam(_bedFile); | |
59 } | |
60 } | |
61 | |
62 void BedGenomeCoverage::PrintTrackDefinitionLine() | |
63 { | |
64 //Print Track Definition line (if requested) | |
65 if ( (_bedGraph||_bedGraphAll) && _add_gb_track_line) { | |
66 string line = "track type=bedGraph"; | |
67 if (!_gb_track_line_opts.empty()) { | |
68 line += " " ; | |
69 line += _gb_track_line_opts ; | |
70 } | |
71 cout << line << endl; | |
72 } | |
73 | |
74 } | |
75 | |
76 | |
77 BedGenomeCoverage::~BedGenomeCoverage(void) { | |
78 delete _bed; | |
79 delete _genome; | |
80 } | |
81 | |
82 | |
83 void BedGenomeCoverage::ResetChromCoverage() { | |
84 _currChromName = ""; | |
85 _currChromSize = 0 ; | |
86 std::vector<DEPTH>().swap(_currChromCoverage); | |
87 } | |
88 | |
89 | |
90 void BedGenomeCoverage::StartNewChrom(const string& newChrom) { | |
91 // If we've moved beyond the first encountered chromosomes, | |
92 // process the results of the previous chromosome. | |
93 if (_currChromName.length() > 0) { | |
94 ReportChromCoverage(_currChromCoverage, _currChromSize, | |
95 _currChromName, _currChromDepthHist); | |
96 } | |
97 | |
98 // empty the previous chromosome and reserve new | |
99 std::vector<DEPTH>().swap(_currChromCoverage); | |
100 | |
101 if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) { | |
102 cerr << "Input error: Chromosome " << _currChromName | |
103 << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl; | |
104 | |
105 } | |
106 _visitedChromosomes.insert(newChrom); | |
107 | |
108 _currChromName = newChrom; | |
109 | |
110 // get the current chrom size and allocate space | |
111 _currChromSize = _genome->getChromSize(_currChromName); | |
112 | |
113 if (_currChromSize >= 0) | |
114 _currChromCoverage.resize(_currChromSize); | |
115 else { | |
116 cerr << "Input error: Chromosome " << _currChromName << " found in your input file but not in your genome file." << endl; | |
117 exit(1); | |
118 } | |
119 } | |
120 | |
121 | |
122 void BedGenomeCoverage::AddCoverage(int start, int end) { | |
123 // process the first line for this chromosome. | |
124 // make sure the coordinates fit within the chrom | |
125 if (start < _currChromSize) | |
126 _currChromCoverage[start].starts++; | |
127 if (end < _currChromSize) | |
128 _currChromCoverage[end].ends++; | |
129 else | |
130 _currChromCoverage[_currChromSize-1].ends++; | |
131 } | |
132 | |
133 | |
134 void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) { | |
135 vector<BED>::const_iterator bedItr = bedBlocks.begin(); | |
136 vector<BED>::const_iterator bedEnd = bedBlocks.end(); | |
137 for (; bedItr != bedEnd; ++bedItr) { | |
138 // the end - 1 must be done because BamAncillary::getBamBlocks | |
139 // returns ends uncorrected for the genomeCoverageBed data structure. | |
140 // ugly, but necessary. | |
141 AddCoverage(bedItr->start, bedItr->end - 1); | |
142 } | |
143 } | |
144 | |
145 | |
146 void BedGenomeCoverage::CoverageBed() { | |
147 | |
148 BED a, nullBed; | |
149 int lineNum = 0; // current input line number | |
150 BedLineStatus bedStatus; | |
151 | |
152 ResetChromCoverage(); | |
153 | |
154 _bed->Open(); | |
155 while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) { | |
156 if (bedStatus == BED_VALID) { | |
157 if (_filterByStrand == true) { | |
158 if (a.strand.empty()) { | |
159 cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl; | |
160 exit(1); | |
161 } | |
162 if ( ! (a.strand == "-" || a.strand == "+") ) { | |
163 cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl; | |
164 exit(1); | |
165 } | |
166 // skip if the strand is not what the user requested. | |
167 if (a.strand != _requestedStrand) | |
168 continue; | |
169 } | |
170 | |
171 // are we on a new chromosome? | |
172 if (a.chrom != _currChromName) | |
173 StartNewChrom(a.chrom); | |
174 | |
175 if (_obeySplits == true) { | |
176 bedVector bedBlocks; // vec to store the discrete BED "blocks" | |
177 splitBedIntoBlocks(a, lineNum, bedBlocks); | |
178 AddBlockedCoverage(bedBlocks); | |
179 } | |
180 else if (_only_5p_end) { | |
181 int pos = ( a.strand=="+" ) ? a.start : a.end-1; | |
182 AddCoverage(pos,pos); | |
183 } | |
184 else if (_only_3p_end) { | |
185 int pos = ( a.strand=="-" ) ? a.start : a.end-1; | |
186 AddCoverage(pos,pos); | |
187 } | |
188 else | |
189 AddCoverage(a.start, a.end-1); | |
190 } | |
191 } | |
192 _bed->Close(); | |
193 PrintFinalCoverage(); | |
194 } | |
195 | |
196 | |
197 void BedGenomeCoverage::PrintFinalCoverage() | |
198 { | |
199 | |
200 | |
201 // process the results of the last chromosome. | |
202 ReportChromCoverage(_currChromCoverage, _currChromSize, | |
203 _currChromName, _currChromDepthHist); | |
204 if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { | |
205 ReportGenomeCoverage(_currChromDepthHist); | |
206 } | |
207 } | |
208 | |
209 | |
210 void BedGenomeCoverage::CoverageBam(string bamFile) { | |
211 | |
212 ResetChromCoverage(); | |
213 | |
214 // open the BAM file | |
215 BamReader reader; | |
216 reader.Open(bamFile); | |
217 | |
218 // get header & reference information | |
219 string header = reader.GetHeaderText(); | |
220 RefVector refs = reader.GetReferenceData(); | |
221 | |
222 // load the BAM header references into a BEDTools "genome file" | |
223 _genome = new GenomeFile(refs); | |
224 // convert each aligned BAM entry to BED | |
225 // and compute coverage on B | |
226 BamAlignment bam; | |
227 while (reader.GetNextAlignment(bam)) { | |
228 // skip if the read is unaligned | |
229 if (bam.IsMapped() == false) | |
230 continue; | |
231 | |
232 // skip if we care about strands and the strand isn't what | |
233 // the user wanted | |
234 if ( (_filterByStrand == true) && | |
235 ((_requestedStrand == "-") != bam.IsReverseStrand()) ) | |
236 continue; | |
237 | |
238 // extract the chrom, start and end from the BAM alignment | |
239 string chrom(refs.at(bam.RefID).RefName); | |
240 CHRPOS start = bam.Position; | |
241 CHRPOS end = bam.GetEndPosition(false, false) - 1; | |
242 | |
243 // are we on a new chromosome? | |
244 if ( chrom != _currChromName ) | |
245 StartNewChrom(chrom); | |
246 | |
247 // add coverage accordingly. | |
248 if (_obeySplits) { | |
249 bedVector bedBlocks; | |
250 // since we are counting coverage, we do want to split blocks when a | |
251 // deletion (D) CIGAR op is encountered (hence the true for the last parm) | |
252 getBamBlocks(bam, refs, bedBlocks, true); | |
253 AddBlockedCoverage(bedBlocks); | |
254 } | |
255 else if (_only_5p_end) { | |
256 int pos = ( !bam.IsReverseStrand() ) ? start : end; | |
257 AddCoverage(pos,pos); | |
258 } | |
259 else if (_only_3p_end) { | |
260 int pos = ( bam.IsReverseStrand() ) ? start : end; | |
261 AddCoverage(pos,pos); | |
262 } | |
263 else | |
264 AddCoverage(start, end); | |
265 } | |
266 // close the BAM | |
267 reader.Close(); | |
268 PrintFinalCoverage(); | |
269 } | |
270 | |
271 | |
272 void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) { | |
273 | |
274 if (_eachBase) { | |
275 int depth = 0; // initialize the depth | |
276 int offset = (_eachBaseZeroBased)?0:1; | |
277 for (int pos = 0; pos < chromSize; pos++) { | |
278 | |
279 depth += chromCov[pos].starts; | |
280 // report the depth for this position. | |
281 if (depth>0 || !_eachBaseZeroBased) | |
282 cout << chrom << "\t" << pos+offset << "\t" << depth * _scale << endl; | |
283 depth = depth - chromCov[pos].ends; | |
284 } | |
285 } | |
286 else if (_bedGraph == true || _bedGraphAll == true) { | |
287 ReportChromCoverageBedGraph(chromCov, chromSize, chrom); | |
288 } | |
289 else { | |
290 | |
291 int depth = 0; // initialize the depth | |
292 | |
293 for (int pos = 0; pos < chromSize; pos++) { | |
294 | |
295 depth += chromCov[pos].starts; | |
296 | |
297 // add the depth at this position to the depth histogram | |
298 // for this chromosome. if the depth is greater than the | |
299 // maximum bin requested, then readjust the depth to be the max | |
300 if (depth >= _max) { | |
301 chromDepthHist[chrom][_max]++; | |
302 } | |
303 else { | |
304 chromDepthHist[chrom][depth]++; | |
305 } | |
306 depth = depth - chromCov[pos].ends; | |
307 } | |
308 // report the histogram for each chromosome | |
309 histMap::const_iterator depthIt = chromDepthHist[chrom].begin(); | |
310 histMap::const_iterator depthEnd = chromDepthHist[chrom].end(); | |
311 for (; depthIt != depthEnd; ++depthIt) { | |
312 int depth = depthIt->first; | |
313 unsigned int numBasesAtDepth = depthIt->second; | |
314 cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t" | |
315 << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl; | |
316 } | |
317 } | |
318 } | |
319 | |
320 | |
321 | |
322 void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) { | |
323 | |
324 // get the list of chromosome names in the genome | |
325 vector<string> chromList = _genome->getChromList(); | |
326 | |
327 unsigned int genomeSize = 0; | |
328 vector<string>::const_iterator chromItr = chromList.begin(); | |
329 vector<string>::const_iterator chromEnd = chromList.end(); | |
330 for (; chromItr != chromEnd; ++chromItr) { | |
331 string chrom = *chromItr; | |
332 genomeSize += _genome->getChromSize(chrom); | |
333 // if there were no reads for a give chromosome, then | |
334 // add the length of the chrom to the 0 bin. | |
335 if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { | |
336 chromDepthHist[chrom][0] += _genome->getChromSize(chrom); | |
337 } | |
338 } | |
339 | |
340 histMap genomeHist; // depth histogram for the entire genome | |
341 | |
342 // loop through each chromosome and add the depth and number of bases at each depth | |
343 // to the aggregate histogram for the entire genome | |
344 for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) { | |
345 string chrom = chromIt->first; | |
346 for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) { | |
347 int depth = depthIt->first; | |
348 unsigned int numBasesAtDepth = depthIt->second; | |
349 genomeHist[depth] += numBasesAtDepth; | |
350 } | |
351 } | |
352 | |
353 // loop through the depths for the entire genome | |
354 // and report the number and fraction of bases in | |
355 // the entire genome that are at said depth. | |
356 for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) { | |
357 int depth = genomeDepthIt->first; | |
358 unsigned int numBasesAtDepth = genomeDepthIt->second; | |
359 | |
360 cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t" | |
361 << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl; | |
362 } | |
363 } | |
364 | |
365 | |
366 void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) { | |
367 | |
368 int depth = 0; // initialize the depth | |
369 int lastStart = -1; | |
370 int lastDepth = -1; | |
371 | |
372 for (int pos = 0; pos < chromSize; pos++) { | |
373 depth += chromCov[pos].starts; | |
374 | |
375 if (depth != lastDepth) { | |
376 // Coverage depth has changed, print the last interval coverage (if any) | |
377 // Print if: | |
378 // (1) depth>0 (the default running mode), | |
379 // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll) | |
380 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { | |
381 cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth * _scale << endl; | |
382 } | |
383 //Set current position as the new interval start + depth | |
384 lastDepth = depth; | |
385 lastStart = pos; | |
386 } | |
387 // Default: the depth has not changed, so we will not print anything. | |
388 // Proceed until the depth changes. | |
389 // Update depth | |
390 depth = depth - chromCov[pos].ends; | |
391 } | |
392 //Print information about the last position | |
393 if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { | |
394 cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth * _scale << endl; | |
395 } | |
396 } |