Mercurial > repos > aaronquinlan > multi_intersect
diff BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp @ 0:dfcd8b6c1bda
Uploaded
author | aaronquinlan |
---|---|
date | Thu, 03 Nov 2011 10:25:04 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp Thu Nov 03 10:25:04 2011 -0400 @@ -0,0 +1,158 @@ +/***************************************************************************** + nucBed.cpp + + (c) 2009 - Aaron Quinlan + Hall Laboratory + Department of Biochemistry and Molecular Genetics + University of Virginia + aaronquinlan@gmail.com + + Licenced under the GNU General Public License 2.0 license. +******************************************************************************/ +#include "lineFileUtilities.h" +#include "nucBed.h" + + +NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, + bool hasPattern, const string &pattern, bool forceStrand) { + + _dbFile = dbFile; + _bedFile = bedFile; + _printSeq = printSeq; + _hasPattern = hasPattern; + _pattern = pattern; + _forceStrand = forceStrand; + + _bed = new BedFile(_bedFile); + + // Compute the DNA content in each BED/GFF/VCF interval + ProfileDNA(); +} + + +NucBed::~NucBed(void) +{} + + +void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength) +{ + int a,c,g,t,n,other,userPatternCount; + a = c = g = t = n = other = userPatternCount = 0; + + getDnaContent(sequence,a,c,g,t,n,other); + + if (_hasPattern) + userPatternCount = countPattern(sequence, _pattern); + + + // report the original interval + _bed->reportBedTab(bed); + // report AT and GC content + printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength); + // report raw nucleotide counts + printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength); + // add the original sequence if requested. + + if (_printSeq) + printf("\t%s",sequence.c_str()); + if (_hasPattern) + printf("\t%d",userPatternCount); + printf("\n"); + +} + + +void NucBed::PrintHeader(void) { + printf("#"); + + int numOrigColumns = (int) _bed->bedType; + for (int i = 1; i <= numOrigColumns; ++i) { + printf("%d_usercol\t", i); + } + printf("%d_pct_at\t", numOrigColumns + 1); + printf("%d_pct_gc\t", numOrigColumns + 2); + printf("%d_num_A\t", numOrigColumns + 3); + printf("%d_num_C\t", numOrigColumns + 4); + printf("%d_num_G\t", numOrigColumns + 5); + printf("%d_num_T\t", numOrigColumns + 6); + printf("%d_num_N\t", numOrigColumns + 7); + printf("%d_num_oth\t", numOrigColumns + 8); + printf("%d_seq_len\t", numOrigColumns + 9); + + if (_printSeq) + printf("%d_seq", numOrigColumns + 10); + if (_hasPattern && !_printSeq) + printf("%d_user_patt_count", numOrigColumns + 10); + else if (_hasPattern && _printSeq) + printf("\t%d_user_patt_count", numOrigColumns + 11); + printf("\n"); + +} + + +//****************************************************************************** +// ExtractDNA +//****************************************************************************** +void NucBed::ProfileDNA() { + + /* Make sure that we can oen all of the files successfully*/ + + // open the fasta database for reading + ifstream faDb(_dbFile.c_str(), ios::in); + if ( !faDb ) { + cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; + exit (1); + } + + // open and memory-map genome file + FastaReference fr; + bool memmap = true; + fr.open(_dbFile, memmap); + + bool headerReported = false; + BED bed, nullBed; + int lineNum = 0; + BedLineStatus bedStatus; + string sequence; + + _bed->Open(); + while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) { + if (bedStatus == BED_VALID) { + if (headerReported == false) { + PrintHeader(); + headerReported = true; + } + // make sure we are extracting >= 1 bp + if (bed.zeroLength == false) { + size_t seqLength = fr.sequenceLength(bed.chrom); + // make sure this feature will not exceed the end of the chromosome. + if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) + { + // grab the dna at this interval + int length = bed.end - bed.start; + // report the sequence's content + string dna = fr.getSubSequence(bed.chrom, bed.start, length); + // rev comp si necessaire + if ((_forceStrand == true) && (bed.strand == "-")) + reverseComplement(dna); + ReportDnaProfile(bed, dna, length); + bed = nullBed; + } + else + { + cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " + << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; + } + } + // handle zeroLength + else { + cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; + } + bed = nullBed; + } + } + _bed->Close(); +} + + +