diff BEDTools-Version-2.14.3/src/coverageBed/coverageBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/BEDTools-Version-2.14.3/src/coverageBed/coverageBed.cpp	Thu Nov 03 10:25:04 2011 -0400
@@ -0,0 +1,293 @@
+/*****************************************************************************
+  coverageBed.cpp
+
+  (c) 2009 - Aaron Quinlan
+  Hall Laboratory
+  Department of Biochemistry and Molecular Genetics
+  University of Virginia
+  aaronquinlan@gmail.com
+
+  Licenced under the GNU General Public License 2.0 license.
+******************************************************************************/
+#include "lineFileUtilities.h"
+#include "coverageBed.h"
+
+// build
+BedCoverage::BedCoverage(string &bedAFile, string &bedBFile, bool sameStrand, bool diffStrand,
+                         bool writeHistogram, bool bamInput, bool obeySplits, 
+                         bool eachBase, bool countsOnly) {
+
+    _bedAFile       = bedAFile;
+    _bedBFile       = bedBFile;
+
+    _bedA           = new BedFile(bedAFile);
+    _bedB           = new BedFile(bedBFile);
+
+    _sameStrand     = sameStrand;
+    _diffStrand     = diffStrand;
+    _obeySplits     = obeySplits;
+    _eachBase       = eachBase;
+    _writeHistogram = writeHistogram;
+    _bamInput       = bamInput;
+    _countsOnly     = countsOnly;
+
+
+    if (_bamInput == false)
+        CollectCoverageBed();
+    else
+        CollectCoverageBam(_bedA->bedFile);
+}
+
+// destroy
+BedCoverage::~BedCoverage(void) {
+    delete _bedA;
+    delete _bedB;
+}
+
+
+void BedCoverage::CollectCoverageBed() {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB->loadBedCovFileIntoMap();
+
+    int lineNum = 0;                    // current input line number
+    BED a, nullBed;
+    BedLineStatus bedStatus;
+
+    _bedA->Open();
+    // process each entry in A
+    while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
+        if (bedStatus == BED_VALID) {
+            // process the BED entry as a single block
+            if (_obeySplits == false)
+                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
+            // split the BED into discrete blocksand process each independently.
+            else {
+                bedVector bedBlocks;
+                splitBedIntoBlocks(a, lineNum, bedBlocks);
+
+                // use countSplitHits to avoid over-counting each split chunk
+                // as distinct read coverage.
+                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
+            }
+            a = nullBed;
+        }
+    }
+    _bedA->Close();
+
+    // report the coverage (summary or histogram) for BED B.
+    if (_countsOnly == true)
+        ReportCounts();
+    else 
+        ReportCoverage();
+}
+
+
+void BedCoverage::CollectCoverageBam(string bamFile) {
+
+    // load the "B" bed file into a map so
+    // that we can easily compare "A" to it for overlaps
+    _bedB->loadBedCovFileIntoMap();
+
+    // open the BAM file
+    BamReader reader;
+    reader.Open(bamFile);
+
+    // get header & reference information
+    string header = reader.GetHeaderText();
+    RefVector refs = reader.GetReferenceData();
+
+    // convert each aligned BAM entry to BED
+    // and compute coverage on B
+    BamAlignment bam;
+    while (reader.GetNextAlignment(bam)) {
+        if (bam.IsMapped()) {
+            // treat the BAM alignment as a single "block"
+            if (_obeySplits == false) {
+                // construct a new BED entry from the current BAM alignment.
+                BED a;
+                a.chrom  = refs.at(bam.RefID).RefName;
+                a.start  = bam.Position;
+                a.end    = bam.GetEndPosition(false, false);
+                a.strand = "+";
+                if (bam.IsReverseStrand()) a.strand = "-";
+
+                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
+            }
+            // split the BAM alignment into discrete blocks and
+            // look for overlaps only within each block.
+            else {
+                // vec to store the discrete BED "blocks" from a
+                bedVector bedBlocks;
+                // since we are counting coverage, we do want to split blocks when a
+                // deletion (D) CIGAR op is encountered (hence the true for the last parm)
+                getBamBlocks(bam, refs, bedBlocks, true);
+                // use countSplitHits to avoid over-counting each split chunk
+                // as distinct read coverage.
+                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
+            }
+        }
+    }
+    // report the coverage (summary or histogram) for BED B.
+    if (_countsOnly == true)
+        ReportCounts();
+    else 
+        ReportCoverage();
+    // close the BAM file
+    reader.Close();
+}
+
+
+void BedCoverage::ReportCounts() {
+
+
+    // process each chromosome
+    masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
+    masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
+    for (; chromItr != chromEnd; ++chromItr)
+    {
+        // for each chrom, process each bin
+        binsToBedCovs::const_iterator binItr = chromItr->second.begin();
+        binsToBedCovs::const_iterator binEnd = chromItr->second.end();
+        for (; binItr != binEnd; ++binItr)
+        {
+            // for each chrom & bin, compute and report
+            // the observed coverage for each feature
+            vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
+            vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
+            for (; bedItr != bedEnd; ++bedItr)
+            {
+                _bedB->reportBedTab(*bedItr);
+                printf("%d\n", bedItr->count);
+            }
+        }
+    }
+}
+
+void BedCoverage::ReportCoverage() {
+
+    map<unsigned int, unsigned int> allDepthHist;
+    unsigned int totalLength = 0;
+
+    // process each chromosome
+    masterBedCovMap::const_iterator chromItr = _bedB->bedCovMap.begin();
+    masterBedCovMap::const_iterator chromEnd = _bedB->bedCovMap.end();
+    for (; chromItr != chromEnd; ++chromItr)
+    {
+        // for each chrom, process each bin
+        binsToBedCovs::const_iterator binItr = chromItr->second.begin();
+        binsToBedCovs::const_iterator binEnd = chromItr->second.end();
+        for (; binItr != binEnd; ++binItr)
+        {
+            // for each chrom & bin, compute and report
+            // the observed coverage for each feature
+            vector<BEDCOV>::const_iterator bedItr = binItr->second.begin();
+            vector<BEDCOV>::const_iterator bedEnd = binItr->second.end();
+            for (; bedItr != bedEnd; ++bedItr)
+            {
+                int zeroDepthCount = 0; // number of bases with zero depth
+                int depth          = 0; // tracks the depth at the current base
+
+                // the start is either the first base in the feature OR
+                // the leftmost position of an overlapping feature. e.g. (s = start):
+                // A    ----------
+                // B    s    ------------
+                int start          = min(bedItr->minOverlapStart, bedItr->start);
+
+                // track the number of bases in the feature covered by
+                // 0, 1, 2, ... n features in A
+                map<unsigned int, unsigned int> depthHist;
+                map<unsigned int, DEPTH>::const_iterator depthItr;
+
+                // compute the coverage observed at each base in the feature marching from start to end.
+                for (CHRPOS pos = start+1; pos <= bedItr->end; pos++)
+                {
+                    // map pointer grabbing the starts and ends observed at this position
+                    depthItr = bedItr->depthMap.find(pos);
+                    // increment coverage if starts observed at this position.
+                    if (depthItr != bedItr->depthMap.end())
+                        depth += depthItr->second.starts;
+                    // update coverage assuming the current position is within the current B feature
+                    if ((pos > bedItr->start) && (pos <= bedItr->end)) {
+                        if (depth == 0) zeroDepthCount++;
+                        // update our histograms, assuming we are not reporting "per-base" coverage.
+                        if (_eachBase == false) {
+                            depthHist[depth]++;
+                            allDepthHist[depth]++;
+                        }
+                        else if ((_eachBase == true) && (bedItr->zeroLength == false))
+                        {
+                            _bedB->reportBedTab(*bedItr);
+                            printf("%d\t%d\n", pos-bedItr->start, depth);
+                        }
+                    }
+                    // decrement coverage if ends observed at this position.
+                    if (depthItr != bedItr->depthMap.end())
+                        depth = depth - depthItr->second.ends;
+                }
+
+                // handle the special case where the user wants "per-base" depth
+                // but the current feature is length = 0.
+                if ((_eachBase == true) && (bedItr->zeroLength == true)) {
+                    _bedB->reportBedTab(*bedItr);
+                    printf("1\t%d\n",depth);
+                }
+                // Summarize the coverage for the current interval,
+                // assuming the user has not requested "per-base" coverage.
+                else if (_eachBase == false) 
+                {
+                    CHRPOS length     = bedItr->end - bedItr->start;
+                    if (bedItr->zeroLength == true) {
+                        length = 0;
+                    }
+                    totalLength       += length;
+                    int nonZeroBases   = (length - zeroDepthCount);
+                    
+                    float fractCovered = 0.0;
+                    if (bedItr->zeroLength == false) {
+                        fractCovered = (float) nonZeroBases / length;
+                    }
+                    
+                    // print a summary of the coverage
+                    if (_writeHistogram == false) {
+                        _bedB->reportBedTab(*bedItr);
+                        printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, nonZeroBases, length, fractCovered);
+                    }
+                    // HISTOGRAM
+                    // report the number of bases with coverage == x
+                    else {
+                        // produce a histogram when not a zero length feature.
+                        if (bedItr->zeroLength == false) {
+                            map<unsigned int, unsigned int>::const_iterator histItr = depthHist.begin();
+                            map<unsigned int, unsigned int>::const_iterator histEnd = depthHist.end();
+                            for (; histItr != histEnd; ++histItr)
+                            {
+                                float fractAtThisDepth = (float) histItr->second / length;
+                                _bedB->reportBedTab(*bedItr);
+                                printf("%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, length, fractAtThisDepth);
+                            }
+                        }
+                        // special case when it is a zero length feauture.
+                        else {
+                            _bedB->reportBedTab(*bedItr);
+                            printf("%d\t%d\t%d\t%0.7f\n", bedItr->count, 0, 0, 1.0000000);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // report a histogram of coverage among _all_
+    // features in B.
+    if (_writeHistogram == true) {
+        map<unsigned int, unsigned int>::const_iterator histItr = allDepthHist.begin();
+        map<unsigned int, unsigned int>::const_iterator histEnd = allDepthHist.end();
+        for (; histItr != histEnd; ++histItr) {
+            float fractAtThisDepth = (float) histItr->second / totalLength;
+            printf("all\t%d\t%d\t%d\t%0.7f\n", histItr->first, histItr->second, totalLength, fractAtThisDepth);
+        }
+    }
+}
+
+