Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/nucBed/nucBed.cpp @ 0:dfcd8b6c1bda
Uploaded
| author | aaronquinlan |
|---|---|
| date | Thu, 03 Nov 2011 10:25:04 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:dfcd8b6c1bda |
|---|---|
| 1 /***************************************************************************** | |
| 2 nucBed.cpp | |
| 3 | |
| 4 (c) 2009 - Aaron Quinlan | |
| 5 Hall Laboratory | |
| 6 Department of Biochemistry and Molecular Genetics | |
| 7 University of Virginia | |
| 8 aaronquinlan@gmail.com | |
| 9 | |
| 10 Licenced under the GNU General Public License 2.0 license. | |
| 11 ******************************************************************************/ | |
| 12 #include "lineFileUtilities.h" | |
| 13 #include "nucBed.h" | |
| 14 | |
| 15 | |
| 16 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq, | |
| 17 bool hasPattern, const string &pattern, bool forceStrand) { | |
| 18 | |
| 19 _dbFile = dbFile; | |
| 20 _bedFile = bedFile; | |
| 21 _printSeq = printSeq; | |
| 22 _hasPattern = hasPattern; | |
| 23 _pattern = pattern; | |
| 24 _forceStrand = forceStrand; | |
| 25 | |
| 26 _bed = new BedFile(_bedFile); | |
| 27 | |
| 28 // Compute the DNA content in each BED/GFF/VCF interval | |
| 29 ProfileDNA(); | |
| 30 } | |
| 31 | |
| 32 | |
| 33 NucBed::~NucBed(void) | |
| 34 {} | |
| 35 | |
| 36 | |
| 37 void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength) | |
| 38 { | |
| 39 int a,c,g,t,n,other,userPatternCount; | |
| 40 a = c = g = t = n = other = userPatternCount = 0; | |
| 41 | |
| 42 getDnaContent(sequence,a,c,g,t,n,other); | |
| 43 | |
| 44 if (_hasPattern) | |
| 45 userPatternCount = countPattern(sequence, _pattern); | |
| 46 | |
| 47 | |
| 48 // report the original interval | |
| 49 _bed->reportBedTab(bed); | |
| 50 // report AT and GC content | |
| 51 printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength); | |
| 52 // report raw nucleotide counts | |
| 53 printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength); | |
| 54 // add the original sequence if requested. | |
| 55 | |
| 56 if (_printSeq) | |
| 57 printf("\t%s",sequence.c_str()); | |
| 58 if (_hasPattern) | |
| 59 printf("\t%d",userPatternCount); | |
| 60 printf("\n"); | |
| 61 | |
| 62 } | |
| 63 | |
| 64 | |
| 65 void NucBed::PrintHeader(void) { | |
| 66 printf("#"); | |
| 67 | |
| 68 int numOrigColumns = (int) _bed->bedType; | |
| 69 for (int i = 1; i <= numOrigColumns; ++i) { | |
| 70 printf("%d_usercol\t", i); | |
| 71 } | |
| 72 printf("%d_pct_at\t", numOrigColumns + 1); | |
| 73 printf("%d_pct_gc\t", numOrigColumns + 2); | |
| 74 printf("%d_num_A\t", numOrigColumns + 3); | |
| 75 printf("%d_num_C\t", numOrigColumns + 4); | |
| 76 printf("%d_num_G\t", numOrigColumns + 5); | |
| 77 printf("%d_num_T\t", numOrigColumns + 6); | |
| 78 printf("%d_num_N\t", numOrigColumns + 7); | |
| 79 printf("%d_num_oth\t", numOrigColumns + 8); | |
| 80 printf("%d_seq_len\t", numOrigColumns + 9); | |
| 81 | |
| 82 if (_printSeq) | |
| 83 printf("%d_seq", numOrigColumns + 10); | |
| 84 if (_hasPattern && !_printSeq) | |
| 85 printf("%d_user_patt_count", numOrigColumns + 10); | |
| 86 else if (_hasPattern && _printSeq) | |
| 87 printf("\t%d_user_patt_count", numOrigColumns + 11); | |
| 88 printf("\n"); | |
| 89 | |
| 90 } | |
| 91 | |
| 92 | |
| 93 //****************************************************************************** | |
| 94 // ExtractDNA | |
| 95 //****************************************************************************** | |
| 96 void NucBed::ProfileDNA() { | |
| 97 | |
| 98 /* Make sure that we can oen all of the files successfully*/ | |
| 99 | |
| 100 // open the fasta database for reading | |
| 101 ifstream faDb(_dbFile.c_str(), ios::in); | |
| 102 if ( !faDb ) { | |
| 103 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl; | |
| 104 exit (1); | |
| 105 } | |
| 106 | |
| 107 // open and memory-map genome file | |
| 108 FastaReference fr; | |
| 109 bool memmap = true; | |
| 110 fr.open(_dbFile, memmap); | |
| 111 | |
| 112 bool headerReported = false; | |
| 113 BED bed, nullBed; | |
| 114 int lineNum = 0; | |
| 115 BedLineStatus bedStatus; | |
| 116 string sequence; | |
| 117 | |
| 118 _bed->Open(); | |
| 119 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) { | |
| 120 if (bedStatus == BED_VALID) { | |
| 121 if (headerReported == false) { | |
| 122 PrintHeader(); | |
| 123 headerReported = true; | |
| 124 } | |
| 125 // make sure we are extracting >= 1 bp | |
| 126 if (bed.zeroLength == false) { | |
| 127 size_t seqLength = fr.sequenceLength(bed.chrom); | |
| 128 // make sure this feature will not exceed the end of the chromosome. | |
| 129 if ( (bed.start <= seqLength) && (bed.end <= seqLength) ) | |
| 130 { | |
| 131 // grab the dna at this interval | |
| 132 int length = bed.end - bed.start; | |
| 133 // report the sequence's content | |
| 134 string dna = fr.getSubSequence(bed.chrom, bed.start, length); | |
| 135 // rev comp si necessaire | |
| 136 if ((_forceStrand == true) && (bed.strand == "-")) | |
| 137 reverseComplement(dna); | |
| 138 ReportDnaProfile(bed, dna, length); | |
| 139 bed = nullBed; | |
| 140 } | |
| 141 else | |
| 142 { | |
| 143 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of " | |
| 144 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl; | |
| 145 } | |
| 146 } | |
| 147 // handle zeroLength | |
| 148 else { | |
| 149 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl; | |
| 150 } | |
| 151 bed = nullBed; | |
| 152 } | |
| 153 } | |
| 154 _bed->Close(); | |
| 155 } | |
| 156 | |
| 157 | |
| 158 |
