Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/genomeCoverageBed/genomeCoverageBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/genomeCoverageBed/genomeCoverageBed.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,396 @@ +/***************************************************************************** +genomeCoverage.cpp + +(c) 2009 - Aaron Quinlan +Hall Laboratory +Department of Biochemistry and Molecular Genetics +University of Virginia +aaronquinlan@gmail.com + +Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "genomeCoverageBed.h" + + +BedGenomeCoverage::BedGenomeCoverage(string bedFile, string genomeFile, + bool eachBase, bool startSites, + bool bedGraph, bool bedGraphAll, + int max, float scale, + bool bamInput, bool obeySplits, + bool filterByStrand, string requestedStrand, + bool only_5p_end, bool only_3p_end, + bool eachBaseZeroBased, + bool add_gb_track_line, string gb_track_line_opts) { + + _bedFile = bedFile; + _genomeFile = genomeFile; + _eachBase = eachBase; + _eachBaseZeroBased = eachBaseZeroBased; + _startSites = startSites; + _bedGraph = bedGraph; + _bedGraphAll = bedGraphAll; + _max = max; + _scale = scale; + _bamInput = bamInput; + _obeySplits = obeySplits; + _filterByStrand = filterByStrand; + _requestedStrand = requestedStrand; + _only_3p_end = only_3p_end; + _only_5p_end = only_5p_end; + _add_gb_track_line = add_gb_track_line; + _gb_track_line_opts = gb_track_line_opts; + _currChromName = ""; + _currChromSize = 0 ; + + + if (_bamInput == false) { + _genome = new GenomeFile(genomeFile); + } + + PrintTrackDefinitionLine(); + + if (_bamInput == false) { + _bed = new BedFile(bedFile); + CoverageBed(); + } + else { + CoverageBam(_bedFile); + } +} + +void BedGenomeCoverage::PrintTrackDefinitionLine() +{ + //Print Track Definition line (if requested) + if ( (_bedGraph||_bedGraphAll) && _add_gb_track_line) { + string line = "track type=bedGraph"; + if (!_gb_track_line_opts.empty()) { + line += " " ; + line += _gb_track_line_opts ; + } + cout << line << endl; + } + +} + + +BedGenomeCoverage::~BedGenomeCoverage(void) { + delete _bed; + delete _genome; +} + + +void BedGenomeCoverage::ResetChromCoverage() { + _currChromName = ""; + _currChromSize = 0 ; + std::vector<DEPTH>().swap(_currChromCoverage); +} + + +void BedGenomeCoverage::StartNewChrom(const string& newChrom) { + // If we've moved beyond the first encountered chromosomes, + // process the results of the previous chromosome. + if (_currChromName.length() > 0) { + ReportChromCoverage(_currChromCoverage, _currChromSize, + _currChromName, _currChromDepthHist); + } + + // empty the previous chromosome and reserve new + std::vector<DEPTH>().swap(_currChromCoverage); + + if (_visitedChromosomes.find(newChrom) != _visitedChromosomes.end()) { + cerr << "Input error: Chromosome " << _currChromName + << " found in non-sequential lines. This suggests that the input file is not sorted correctly." << endl; + + } + _visitedChromosomes.insert(newChrom); + + _currChromName = newChrom; + + // get the current chrom size and allocate space + _currChromSize = _genome->getChromSize(_currChromName); + + if (_currChromSize >= 0) + _currChromCoverage.resize(_currChromSize); + else { + cerr << "Input error: Chromosome " << _currChromName << " found in your input file but not in your genome file." << endl; + exit(1); + } +} + + +void BedGenomeCoverage::AddCoverage(int start, int end) { + // process the first line for this chromosome. + // make sure the coordinates fit within the chrom + if (start < _currChromSize) + _currChromCoverage[start].starts++; + if (end < _currChromSize) + _currChromCoverage[end].ends++; + else + _currChromCoverage[_currChromSize-1].ends++; +} + + +void BedGenomeCoverage::AddBlockedCoverage(const vector<BED> &bedBlocks) { + vector<BED>::const_iterator bedItr = bedBlocks.begin(); + vector<BED>::const_iterator bedEnd = bedBlocks.end(); + for (; bedItr != bedEnd; ++bedItr) { + // the end - 1 must be done because BamAncillary::getBamBlocks + // returns ends uncorrected for the genomeCoverageBed data structure. + // ugly, but necessary. + AddCoverage(bedItr->start, bedItr->end - 1); + } +} + + +void BedGenomeCoverage::CoverageBed() { + + BED a, nullBed; + int lineNum = 0; // current input line number + BedLineStatus bedStatus; + + ResetChromCoverage(); + + _bed->Open(); + while ( (bedStatus = _bed->GetNextBed(a, lineNum)) != BED_INVALID ) { + if (bedStatus == BED_VALID) { + if (_filterByStrand == true) { + if (a.strand.empty()) { + cerr << "Input error: Interval is missing a strand value on line " << lineNum << "." <<endl; + exit(1); + } + if ( ! (a.strand == "-" || a.strand == "+") ) { + cerr << "Input error: Invalid strand value (" << a.strand << ") on line " << lineNum << "." << endl; + exit(1); + } + // skip if the strand is not what the user requested. + if (a.strand != _requestedStrand) + continue; + } + + // are we on a new chromosome? + if (a.chrom != _currChromName) + StartNewChrom(a.chrom); + + if (_obeySplits == true) { + bedVector bedBlocks; // vec to store the discrete BED "blocks" + splitBedIntoBlocks(a, lineNum, bedBlocks); + AddBlockedCoverage(bedBlocks); + } + else if (_only_5p_end) { + int pos = ( a.strand=="+" ) ? a.start : a.end-1; + AddCoverage(pos,pos); + } + else if (_only_3p_end) { + int pos = ( a.strand=="-" ) ? a.start : a.end-1; + AddCoverage(pos,pos); + } + else + AddCoverage(a.start, a.end-1); + } + } + _bed->Close(); + PrintFinalCoverage(); +} + + +void BedGenomeCoverage::PrintFinalCoverage() +{ + + + // process the results of the last chromosome. + ReportChromCoverage(_currChromCoverage, _currChromSize, + _currChromName, _currChromDepthHist); + if (_eachBase == false && _bedGraph == false && _bedGraphAll == false) { + ReportGenomeCoverage(_currChromDepthHist); + } +} + + +void BedGenomeCoverage::CoverageBam(string bamFile) { + + ResetChromCoverage(); + + // open the BAM file + BamReader reader; + reader.Open(bamFile); + + // get header & reference information + string header = reader.GetHeaderText(); + RefVector refs = reader.GetReferenceData(); + + // load the BAM header references into a BEDTools "genome file" + _genome = new GenomeFile(refs); + // convert each aligned BAM entry to BED + // and compute coverage on B + BamAlignment bam; + while (reader.GetNextAlignment(bam)) { + // skip if the read is unaligned + if (bam.IsMapped() == false) + continue; + + // skip if we care about strands and the strand isn't what + // the user wanted + if ( (_filterByStrand == true) && + ((_requestedStrand == "-") != bam.IsReverseStrand()) ) + continue; + + // extract the chrom, start and end from the BAM alignment + string chrom(refs.at(bam.RefID).RefName); + CHRPOS start = bam.Position; + CHRPOS end = bam.GetEndPosition(false, false) - 1; + + // are we on a new chromosome? + if ( chrom != _currChromName ) + StartNewChrom(chrom); + + // add coverage accordingly. + if (_obeySplits) { + bedVector bedBlocks; + // since we are counting coverage, we do want to split blocks when a + // deletion (D) CIGAR op is encountered (hence the true for the last parm) + getBamBlocks(bam, refs, bedBlocks, true); + AddBlockedCoverage(bedBlocks); + } + else if (_only_5p_end) { + int pos = ( !bam.IsReverseStrand() ) ? start : end; + AddCoverage(pos,pos); + } + else if (_only_3p_end) { + int pos = ( bam.IsReverseStrand() ) ? start : end; + AddCoverage(pos,pos); + } + else + AddCoverage(start, end); + } + // close the BAM + reader.Close(); + PrintFinalCoverage(); +} + + +void BedGenomeCoverage::ReportChromCoverage(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom, chromHistMap &chromDepthHist) { + + if (_eachBase) { + int depth = 0; // initialize the depth + int offset = (_eachBaseZeroBased)?0:1; + for (int pos = 0; pos < chromSize; pos++) { + + depth += chromCov[pos].starts; + // report the depth for this position. + if (depth>0 || !_eachBaseZeroBased) + cout << chrom << "\t" << pos+offset << "\t" << depth * _scale << endl; + depth = depth - chromCov[pos].ends; + } + } + else if (_bedGraph == true || _bedGraphAll == true) { + ReportChromCoverageBedGraph(chromCov, chromSize, chrom); + } + else { + + int depth = 0; // initialize the depth + + for (int pos = 0; pos < chromSize; pos++) { + + depth += chromCov[pos].starts; + + // add the depth at this position to the depth histogram + // for this chromosome. if the depth is greater than the + // maximum bin requested, then readjust the depth to be the max + if (depth >= _max) { + chromDepthHist[chrom][_max]++; + } + else { + chromDepthHist[chrom][depth]++; + } + depth = depth - chromCov[pos].ends; + } + // report the histogram for each chromosome + histMap::const_iterator depthIt = chromDepthHist[chrom].begin(); + histMap::const_iterator depthEnd = chromDepthHist[chrom].end(); + for (; depthIt != depthEnd; ++depthIt) { + int depth = depthIt->first; + unsigned int numBasesAtDepth = depthIt->second; + cout << chrom << "\t" << depth << "\t" << numBasesAtDepth << "\t" + << chromSize << "\t" << (float) ((float)numBasesAtDepth / (float)chromSize) << endl; + } + } +} + + + +void BedGenomeCoverage::ReportGenomeCoverage(chromHistMap &chromDepthHist) { + + // get the list of chromosome names in the genome + vector<string> chromList = _genome->getChromList(); + + unsigned int genomeSize = 0; + vector<string>::const_iterator chromItr = chromList.begin(); + vector<string>::const_iterator chromEnd = chromList.end(); + for (; chromItr != chromEnd; ++chromItr) { + string chrom = *chromItr; + genomeSize += _genome->getChromSize(chrom); + // if there were no reads for a give chromosome, then + // add the length of the chrom to the 0 bin. + if ( chromDepthHist.find(chrom) == chromDepthHist.end() ) { + chromDepthHist[chrom][0] += _genome->getChromSize(chrom); + } + } + + histMap genomeHist; // depth histogram for the entire genome + + // loop through each chromosome and add the depth and number of bases at each depth + // to the aggregate histogram for the entire genome + for (chromHistMap::iterator chromIt = chromDepthHist.begin(); chromIt != chromDepthHist.end(); ++chromIt) { + string chrom = chromIt->first; + for (histMap::iterator depthIt = chromDepthHist[chrom].begin(); depthIt != chromDepthHist[chrom].end(); ++depthIt) { + int depth = depthIt->first; + unsigned int numBasesAtDepth = depthIt->second; + genomeHist[depth] += numBasesAtDepth; + } + } + + // loop through the depths for the entire genome + // and report the number and fraction of bases in + // the entire genome that are at said depth. + for (histMap::iterator genomeDepthIt = genomeHist.begin(); genomeDepthIt != genomeHist.end(); ++genomeDepthIt) { + int depth = genomeDepthIt->first; + unsigned int numBasesAtDepth = genomeDepthIt->second; + + cout << "genome" << "\t" << depth << "\t" << numBasesAtDepth << "\t" + << genomeSize << "\t" << (float) ((float)numBasesAtDepth / (float)genomeSize) << endl; + } +} + + +void BedGenomeCoverage::ReportChromCoverageBedGraph(const vector<DEPTH> &chromCov, const int &chromSize, const string &chrom) { + + int depth = 0; // initialize the depth + int lastStart = -1; + int lastDepth = -1; + + for (int pos = 0; pos < chromSize; pos++) { + depth += chromCov[pos].starts; + + if (depth != lastDepth) { + // Coverage depth has changed, print the last interval coverage (if any) + // Print if: + // (1) depth>0 (the default running mode), + // (2) depth==0 and the user requested to print zero covered regions (_bedGraphAll) + if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { + cout << chrom << "\t" << lastStart << "\t" << pos << "\t" << lastDepth * _scale << endl; + } + //Set current position as the new interval start + depth + lastDepth = depth; + lastStart = pos; + } + // Default: the depth has not changed, so we will not print anything. + // Proceed until the depth changes. + // Update depth + depth = depth - chromCov[pos].ends; + } + //Print information about the last position + if ( (lastDepth != -1) && (lastDepth > 0 || _bedGraphAll) ) { + cout << chrom << "\t" << lastStart << "\t" << chromSize << "\t" << lastDepth * _scale << endl; + } +}