0
|
1 /*****************************************************************************
|
|
2 nucBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "nucBed.h"
|
|
14
|
|
15
|
|
16 NucBed::NucBed(string &dbFile, string &bedFile, bool printSeq,
|
|
17 bool hasPattern, const string &pattern, bool forceStrand) {
|
|
18
|
|
19 _dbFile = dbFile;
|
|
20 _bedFile = bedFile;
|
|
21 _printSeq = printSeq;
|
|
22 _hasPattern = hasPattern;
|
|
23 _pattern = pattern;
|
|
24 _forceStrand = forceStrand;
|
|
25
|
|
26 _bed = new BedFile(_bedFile);
|
|
27
|
|
28 // Compute the DNA content in each BED/GFF/VCF interval
|
|
29 ProfileDNA();
|
|
30 }
|
|
31
|
|
32
|
|
33 NucBed::~NucBed(void)
|
|
34 {}
|
|
35
|
|
36
|
|
37 void NucBed::ReportDnaProfile(const BED& bed, const string &sequence, int seqLength)
|
|
38 {
|
|
39 int a,c,g,t,n,other,userPatternCount;
|
|
40 a = c = g = t = n = other = userPatternCount = 0;
|
|
41
|
|
42 getDnaContent(sequence,a,c,g,t,n,other);
|
|
43
|
|
44 if (_hasPattern)
|
|
45 userPatternCount = countPattern(sequence, _pattern);
|
|
46
|
|
47
|
|
48 // report the original interval
|
|
49 _bed->reportBedTab(bed);
|
|
50 // report AT and GC content
|
|
51 printf("%f\t%f\t",(float)(a+t)/seqLength, (float)(c+g)/seqLength);
|
|
52 // report raw nucleotide counts
|
|
53 printf("%d\t%d\t%d\t%d\t%d\t%d\t%d",a,c,g,t,n,other,seqLength);
|
|
54 // add the original sequence if requested.
|
|
55
|
|
56 if (_printSeq)
|
|
57 printf("\t%s",sequence.c_str());
|
|
58 if (_hasPattern)
|
|
59 printf("\t%d",userPatternCount);
|
|
60 printf("\n");
|
|
61
|
|
62 }
|
|
63
|
|
64
|
|
65 void NucBed::PrintHeader(void) {
|
|
66 printf("#");
|
|
67
|
|
68 int numOrigColumns = (int) _bed->bedType;
|
|
69 for (int i = 1; i <= numOrigColumns; ++i) {
|
|
70 printf("%d_usercol\t", i);
|
|
71 }
|
|
72 printf("%d_pct_at\t", numOrigColumns + 1);
|
|
73 printf("%d_pct_gc\t", numOrigColumns + 2);
|
|
74 printf("%d_num_A\t", numOrigColumns + 3);
|
|
75 printf("%d_num_C\t", numOrigColumns + 4);
|
|
76 printf("%d_num_G\t", numOrigColumns + 5);
|
|
77 printf("%d_num_T\t", numOrigColumns + 6);
|
|
78 printf("%d_num_N\t", numOrigColumns + 7);
|
|
79 printf("%d_num_oth\t", numOrigColumns + 8);
|
|
80 printf("%d_seq_len\t", numOrigColumns + 9);
|
|
81
|
|
82 if (_printSeq)
|
|
83 printf("%d_seq", numOrigColumns + 10);
|
|
84 if (_hasPattern && !_printSeq)
|
|
85 printf("%d_user_patt_count", numOrigColumns + 10);
|
|
86 else if (_hasPattern && _printSeq)
|
|
87 printf("\t%d_user_patt_count", numOrigColumns + 11);
|
|
88 printf("\n");
|
|
89
|
|
90 }
|
|
91
|
|
92
|
|
93 //******************************************************************************
|
|
94 // ExtractDNA
|
|
95 //******************************************************************************
|
|
96 void NucBed::ProfileDNA() {
|
|
97
|
|
98 /* Make sure that we can oen all of the files successfully*/
|
|
99
|
|
100 // open the fasta database for reading
|
|
101 ifstream faDb(_dbFile.c_str(), ios::in);
|
|
102 if ( !faDb ) {
|
|
103 cerr << "Error: The requested fasta database file (" << _dbFile << ") could not be opened. Exiting!" << endl;
|
|
104 exit (1);
|
|
105 }
|
|
106
|
|
107 // open and memory-map genome file
|
|
108 FastaReference fr;
|
|
109 bool memmap = true;
|
|
110 fr.open(_dbFile, memmap);
|
|
111
|
|
112 bool headerReported = false;
|
|
113 BED bed, nullBed;
|
|
114 int lineNum = 0;
|
|
115 BedLineStatus bedStatus;
|
|
116 string sequence;
|
|
117
|
|
118 _bed->Open();
|
|
119 while ((bedStatus = _bed->GetNextBed(bed, lineNum)) != BED_INVALID) {
|
|
120 if (bedStatus == BED_VALID) {
|
|
121 if (headerReported == false) {
|
|
122 PrintHeader();
|
|
123 headerReported = true;
|
|
124 }
|
|
125 // make sure we are extracting >= 1 bp
|
|
126 if (bed.zeroLength == false) {
|
|
127 size_t seqLength = fr.sequenceLength(bed.chrom);
|
|
128 // make sure this feature will not exceed the end of the chromosome.
|
|
129 if ( (bed.start <= seqLength) && (bed.end <= seqLength) )
|
|
130 {
|
|
131 // grab the dna at this interval
|
|
132 int length = bed.end - bed.start;
|
|
133 // report the sequence's content
|
|
134 string dna = fr.getSubSequence(bed.chrom, bed.start, length);
|
|
135 // rev comp si necessaire
|
|
136 if ((_forceStrand == true) && (bed.strand == "-"))
|
|
137 reverseComplement(dna);
|
|
138 ReportDnaProfile(bed, dna, length);
|
|
139 bed = nullBed;
|
|
140 }
|
|
141 else
|
|
142 {
|
|
143 cerr << "Feature (" << bed.chrom << ":" << bed.start << "-" << bed.end << ") beyond the length of "
|
|
144 << bed.chrom << " size (" << seqLength << " bp). Skipping." << endl;
|
|
145 }
|
|
146 }
|
|
147 // handle zeroLength
|
|
148 else {
|
|
149 cerr << "Feature (" << bed.chrom << ":" << bed.start+1 << "-" << bed.end-1 << ") has length = 0, Skipping." << endl;
|
|
150 }
|
|
151 bed = nullBed;
|
|
152 }
|
|
153 }
|
|
154 _bed->Close();
|
|
155 }
|
|
156
|
|
157
|
|
158
|