0
|
1 /*****************************************************************************
|
|
2 coverageBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "coverageBed.h"
|
|
14
|
|
15 // build
|
|
16 BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
|
|
17 bool writeHistogram, bool bamInput, bool obeySplits,
|
|
18 bool eachBase, bool countsOnly) {
|
|
19
|
|
20 _bedAFile = bedAFile;
|
|
21 _bedBFile = bedBFile;
|
|
22
|
|
23 _bedA = new BedFile(bedAFile);
|
|
24 _bedB = new BedFile(bedBFile);
|
|
25
|
|
26 _sameStrand = sameStrand;
|
|
27 _diffStrand = diffStrand;
|
|
28 _obeySplits = obeySplits;
|
|
29 _eachBase = eachBase;
|
|
30 _writeHistogram = writeHistogram;
|
|
31 _bamInput = bamInput;
|
|
32 _countsOnly = countsOnly;
|
|
33
|
|
34
|
|
35 if (_bamInput == false)
|
|
36 CollectCoverageBed();
|
|
37 else
|
|
38 CollectCoverageBam(_bedA->bedFile);
|
|
39 }
|
|
40
|
|
41 // destroy
|
|
42 BedCoverage::~BedCoverage(void) {
|
|
43 delete _bedA;
|
|
44 delete _bedB;
|
|
45 }
|
|
46
|
|
47
|
|
48 void BedCoverage::CollectCoverageBed() {
|
|
49
|
|
50 // load the "B" bed file into a map so
|
|
51 // that we can easily compare "A" to it for overlaps
|
|
52 _bedB->loadBedCovFileIntoMap();
|
|
53
|
|
54 int lineNum = 0; // current input line number
|
|
55 BED a, nullBed;
|
|
56 BedLineStatus bedStatus;
|
|
57
|
|
58 _bedA->Open();
|
|
59 // process each entry in A
|
|
60 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
61 if (bedStatus == BED_VALID) {
|
|
62 // process the BED entry as a single block
|
|
63 if (_obeySplits == false)
|
|
64 _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
|
|
65 // split the BED into discrete blocksand process each independently.
|
|
66 else {
|
|
67 bedVector bedBlocks;
|
|
68 splitBedIntoBlocks(a, lineNum, bedBlocks);
|
|
69
|
|
70 // use countSplitHits to avoid over-counting each split chunk
|
|
71 // as distinct read coverage.
|
|
72 _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
|
|
73 }
|
|
74 a = nullBed;
|
|
75 }
|
|
76 }
|
|
77 _bedA->Close();
|
|
78
|
|
79 // report the coverage (summary or histogram) for BED B.
|
|
80 if (_countsOnly == true)
|
|
81 ReportCounts();
|
|
82 else
|
|
83 ReportCoverage();
|
|
84 }
|
|
85
|
|
86
|
|
87 void BedCoverage::CollectCoverageBam(string bamFile) {
|
|
88
|
|
89 // load the "B" bed file into a map so
|
|
90 // that we can easily compare "A" to it for overlaps
|
|
91 _bedB->loadBedCovFileIntoMap();
|
|
92
|
|
93 // open the BAM file
|
|
94 BamReader reader;
|
|
95 reader.Open(bamFile);
|
|
96
|
|
97 // get header & reference information
|
|
98 string header = reader.GetHeaderText();
|
|
99 RefVector refs = reader.GetReferenceData();
|
|
100
|
|
101 // convert each aligned BAM entry to BED
|
|
102 // and compute coverage on B
|
|
103 BamAlignment bam;
|
|
104 while (reader.GetNextAlignment(bam)) {
|
|
105 if (bam.IsMapped()) {
|
|
106 // treat the BAM alignment as a single "block"
|
|
107 if (_obeySplits == false) {
|
|
108 // construct a new BED entry from the current BAM alignment.
|
|
109 BED a;
|
|
110 a.chrom = refs.at(bam.RefID).RefName;
|
|
111 a.start = bam.Position;
|
|
112 a.end = bam.GetEndPosition(false, false);
|
|
113 a.strand = "+";
|
|
114 if (bam.IsReverseStrand()) a.strand = "-";
|
|
115
|
|
116 _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
|
|
117 }
|
|
118 // split the BAM alignment into discrete blocks and
|
|
119 // look for overlaps only within each block.
|
|
120 else {
|
|
121 // vec to store the discrete BED "blocks" from a
|
|
122 bedVector bedBlocks;
|
|
123 // since we are counting coverage, we do want to split blocks when a
|
|
124 // deletion (D) CIGAR op is encountered (hence the true for the last parm)
|
|
125 getBamBlocks(bam, refs, bedBlocks, true);
|
|
126 // use countSplitHits to avoid over-counting each split chunk
|
|
127 // as distinct read coverage.
|
|
128 _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
|
|
129 }
|
|
130 }
|
|
131 }
|
|
132 // report the coverage (summary or histogram) for BED B.
|
|
133 if (_countsOnly == true)
|
|
134 ReportCounts();
|
|
135 else
|
|
136 ReportCoverage();
|
|
137 // close the BAM file
|
|
138 reader.Close();
|
|
139 }
|
|
140
|
|
141
|
|
142 void BedCoverage::ReportCounts() {
|
|
143
|
|
144
|
|
145 // process each chromosome
|
|
146 masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
|
|
147 masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
|
|
148 for (; chromItr != chromEnd; ++chromItr)
|
|
149 {
|
|
150 // for each chrom, process each bin
|
|
151 binsToBedCovs::const_iterator binItr = chromItr->second.begin();
|
|
152 binsToBedCovs::const_iterator binEnd = chromItr->second.end();
|
|
153 for (; binItr != binEnd; ++binItr)
|
|
154 {
|
|
155 // for each chrom & bin, compute and report
|
|
156 // the observed coverage for each feature
|
|
157 vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
|
|
158 vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
|
|
159 for (; bedItr != bedEnd; ++bedItr)
|
|
160 {
|
|
161 _bedB->reportBedTab(*bedItr);
|
|
162 printf("%d\n", bedItr->count);
|
|
163 }
|
|
164 }
|
|
165 }
|
|
166 }
|
|
167
|
|
168 void BedCoverage::ReportCoverage() {
|
|
169
|
|
170 map<unsigned int, unsigned int> allDepthHist;
|
|
171 unsigned int totalLength = 0;
|
|
172
|
|
173 // process each chromosome
|
|
174 masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
|
|
175 masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
|
|
176 for (; chromItr != chromEnd; ++chromItr)
|
|
177 {
|
|
178 // for each chrom, process each bin
|
|
179 binsToBedCovs::const_iterator binItr = chromItr->second.begin();
|
|
180 binsToBedCovs::const_iterator binEnd = chromItr->second.end();
|
|
181 for (; binItr != binEnd; ++binItr)
|
|
182 {
|
|
183 // for each chrom & bin, compute and report
|
|
184 // the observed coverage for each feature
|
|
185 vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
|
|
186 vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
|
|
187 for (; bedItr != bedEnd; ++bedItr)
|
|
188 {
|
|
189 int zeroDepthCount = 0; // number of bases with zero depth
|
|
190 int depth = 0; // tracks the depth at the current base
|
|
191
|
|
192 // the start is either the first base in the feature OR
|
|
193 // the leftmost position of an overlapping feature. e.g. (s = start):
|
|
194 // A ----------
|
|
195 // B s ------------
|
|
196 int start = min(bedItr->minOverlapStart, bedItr->start);
|
|
197
|
|
198 // track the number of bases in the feature covered by
|
|
199 // 0, 1, 2, ... n features in A
|
|
200 map<unsigned int, unsigned int> depthHist;
|
|
201 map<unsigned int, DEPTH>::const_iterator depthItr;
|
|
202
|
|
203 // compute the coverage observed at each base in the feature marching from start to end.
|
|
204 for (CHRPOS pos = start+1; pos <= bedItr->end; pos++)
|
|
205 {
|
|
206 // map pointer grabbing the starts and ends observed at this position
|
|
207 depthItr = bedItr->depthMap.find(pos);
|
|
208 // increment coverage if starts observed at this position.
|
|
209 if (depthItr != bedItr->depthMap.end())
|
|
210 depth += depthItr->second.starts;
|
|
211 // update coverage assuming the current position is within the current B feature
|
|
212 if ((pos > bedItr->start) && (pos <= bedItr->end)) {
|
|
213 if (depth == 0) zeroDepthCount++;
|
|
214 // update our histograms, assuming we are not reporting "per-base" coverage.
|
|
215 if (_eachBase == false) {
|
|
216 depthHist[depth]++;
|
|
217 allDepthHist[depth]++;
|
|
218 }
|
|
219 else if ((_eachBase == true) && (bedItr->zeroLength == false))
|
|
220 {
|
|
221 _bedB->reportBedTab(*bedItr);
|
|
222 printf("%d\t%d\n", pos-bedItr->start, depth);
|
|
223 }
|
|
224 }
|
|
225 // decrement coverage if ends observed at this position.
|
|
226 if (depthItr != bedItr->depthMap.end())
|
|
227 depth = depth - depthItr->second.ends;
|
|
228 }
|
|
229
|
|
230 // handle the special case where the user wants "per-base" depth
|
|
231 // but the current feature is length = 0.
|
|
232 if ((_eachBase == true) && (bedItr->zeroLength == true)) {
|
|
233 _bedB->reportBedTab(*bedItr);
|
|
234 printf("1\t%d\n",depth);
|
|
235 }
|
|
236 // Summarize the coverage for the current interval,
|
|
237 // assuming the user has not requested "per-base" coverage.
|
|
238 else if (_eachBase == false)
|
|
239 {
|
|
240 CHRPOS length = bedItr->end - bedItr->start;
|
|
241 if (bedItr->zeroLength == true) {
|
|
242 length = 0;
|
|
243 }
|
|
244 totalLength += length;
|
|
245 int nonZeroBases = (length - zeroDepthCount);
|
|
246
|
|
247 float fractCovered = 0.0;
|
|
248 if (bedItr->zeroLength == false) {
|
|
249 fractCovered = (float) nonZeroBases / length;
|
|
250 }
|
|
251
|
|
252 // print a summary of the coverage
|
|
253 if (_writeHistogram == false) {
|
|
254 _bedB->reportBedTab(*bedItr);
|
|
255 printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered);
|
|
256 }
|
|
257 // HISTOGRAM
|
|
258 // report the number of bases with coverage == x
|
|
259 else {
|
|
260 // produce a histogram when not a zero length feature.
|
|
261 if (bedItr->zeroLength == false) {
|
|
262 map<unsigned int, unsigned int>::const_iterator histItr = depthHist.begin();
|
|
263 map<unsigned int, unsigned int>::const_iterator histEnd = depthHist.end();
|
|
264 for (; histItr != histEnd; ++histItr)
|
|
265 {
|
|
266 float fractAtThisDepth = (float) histItr->second / length;
|
|
267 _bedB->reportBedTab(*bedItr);
|
|
268 printf("%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, length, fractAtThisDepth);
|
|
269 }
|
|
270 }
|
|
271 // special case when it is a zero length feauture.
|
|
272 else {
|
|
273 _bedB->reportBedTab(*bedItr);
|
|
274 printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, 0, 0, 1.0000000);
|
|
275 }
|
|
276 }
|
|
277 }
|
|
278 }
|
|
279 }
|
|
280 }
|
|
281 // report a histogram of coverage among _all_
|
|
282 // features in B.
|
|
283 if (_writeHistogram == true) {
|
|
284 map<unsigned int, unsigned int>::const_iterator histItr = allDepthHist.begin();
|
|
285 map<unsigned int, unsigned int>::const_iterator histEnd = allDepthHist.end();
|
|
286 for (; histItr != histEnd; ++histItr) {
|
|
287 float fractAtThisDepth = (float) histItr->second / totalLength;
|
|
288 printf("all\t%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, totalLength, fractAtThisDepth);
|
|
289 }
|
|
290 }
|
|
291 }
|
|
292
|
|
293
|