0
|
1 /*****************************************************************************
|
|
2 annotateBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "annotateBed.h"
|
|
14
|
|
15 // build
|
|
16 BedAnnotate::BedAnnotate(const string &mainFile, const vector<string> &annoFileNames,
|
|
17 const vector<string> &annoTitles, bool sameStrand, bool diffStrand, bool reportCounts, bool reportBoth) :
|
|
18
|
|
19 _mainFile(mainFile),
|
|
20 _annoFileNames(annoFileNames),
|
|
21 _annoTitles(annoTitles),
|
|
22 _sameStrand(sameStrand),
|
|
23 _diffStrand(diffStrand),
|
|
24 _reportCounts(reportCounts),
|
|
25 _reportBoth(reportBoth)
|
|
26 {
|
|
27 _bed = new BedFile(_mainFile);
|
|
28 }
|
|
29
|
|
30
|
|
31 // destroy and delete the open file pointers
|
|
32 BedAnnotate::~BedAnnotate(void) {
|
|
33 delete _bed;
|
|
34 CloseAnnoFiles();
|
|
35 }
|
|
36
|
|
37
|
|
38 void BedAnnotate::OpenAnnoFiles() {
|
|
39 for (size_t i=0; i < _annoFileNames.size(); ++i) {
|
|
40 BedFile *file = new BedFile(_annoFileNames[i]);
|
|
41 file->Open();
|
|
42 _annoFiles.push_back(file);
|
|
43 }
|
|
44 }
|
|
45
|
|
46
|
|
47 void BedAnnotate::CloseAnnoFiles() {
|
|
48 for (size_t i=0; i < _annoFiles.size(); ++i) {
|
|
49 BedFile *file = _annoFiles[i];
|
|
50 delete file;
|
|
51 _annoFiles[i] = NULL;
|
|
52 }
|
|
53 }
|
|
54
|
|
55
|
|
56 void BedAnnotate::PrintHeader() {
|
|
57 // print a hash to indicate header and then write a tab
|
|
58 // for each field in the main file.
|
|
59 printf("#");
|
|
60 for (size_t i = 0; i < _bed->bedType; ++i)
|
|
61 printf("\t");
|
|
62
|
|
63 // now print the label for each file.
|
|
64 if (_reportBoth == false) {
|
|
65 for (size_t i = 0; i < _annoTitles.size(); ++i)
|
|
66 printf("%s\t", _annoTitles[i].c_str());
|
|
67 printf("\n");
|
|
68 }
|
|
69 else {
|
|
70 for (size_t i = 0; i < _annoTitles.size(); ++i)
|
|
71 printf("%s_cnt\t%s_pct", _annoTitles[i].c_str(), _annoTitles[i].c_str());
|
|
72 printf("\n");
|
|
73 }
|
|
74 }
|
|
75
|
|
76
|
|
77 void BedAnnotate::InitializeMainFile() {
|
|
78 // process each chromosome
|
|
79 masterBedCovListMap::iterator chromItr = _bed->bedCovListMap.begin();
|
|
80 masterBedCovListMap::iterator chromEnd = _bed->bedCovListMap.end();
|
|
81 for (; chromItr != chromEnd; ++chromItr) {
|
|
82 // for each chrom, process each bin
|
|
83 binsToBedCovLists::iterator binItr = chromItr->second.begin();
|
|
84 binsToBedCovLists::iterator binEnd = chromItr->second.end();
|
|
85 for (; binItr != binEnd; ++binItr) {
|
|
86 // initialize BEDCOVLIST in this chrom/bin
|
|
87 vector<BEDCOVLIST>::iterator bedItr = binItr->second.begin();
|
|
88 vector<BEDCOVLIST>::iterator bedEnd = binItr->second.end();
|
|
89 for (; bedItr != bedEnd; ++bedItr) {
|
|
90 // initialize the depthMaps, counts, etc. for each anno file.
|
|
91 for (size_t i = 0; i < _annoFiles.size(); ++i) {
|
|
92 map<unsigned int, DEPTH> dummy;
|
|
93 bedItr->depthMapList.push_back(dummy);
|
|
94 bedItr->counts.push_back(0);
|
|
95 bedItr->minOverlapStarts.push_back(INT_MAX);
|
|
96 }
|
|
97 }
|
|
98 }
|
|
99 }
|
|
100 }
|
|
101
|
|
102
|
|
103 void BedAnnotate::AnnotateBed() {
|
|
104
|
|
105 // load the "main" bed file into a map so
|
|
106 // that we can easily compare each annoFile to it for overlaps
|
|
107 _bed->loadBedCovListFileIntoMap();
|
|
108 // open the annotations files for processing;
|
|
109 OpenAnnoFiles();
|
|
110 // initialize counters, depths, etc. for the main file
|
|
111 InitializeMainFile();
|
|
112
|
|
113 // annotate the main file with the coverage from the annotation files.
|
|
114 for (size_t annoIndex = 0; annoIndex < _annoFiles.size(); ++annoIndex) {
|
|
115 // grab the current annotation file.
|
|
116 BedFile *anno = _annoFiles[annoIndex];
|
|
117 int lineNum = 0;
|
|
118 BED a, nullBed;
|
|
119 BedLineStatus bedStatus;
|
|
120
|
|
121 // process each entry in the current anno file
|
|
122 while ((bedStatus = anno->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
123 if (bedStatus == BED_VALID) {
|
|
124 _bed->countListHits(a, annoIndex, _sameStrand, _diffStrand);
|
|
125 a = nullBed;
|
|
126 }
|
|
127 }
|
|
128 }
|
|
129
|
|
130 // report the annotations of the main file from the anno file.
|
|
131 ReportAnnotations();
|
|
132 // close the annotations files;
|
|
133 CloseAnnoFiles();
|
|
134 }
|
|
135
|
|
136
|
|
137 void BedAnnotate::ReportAnnotations() {
|
|
138
|
|
139 if (_annoTitles.size() > 0) {
|
|
140 PrintHeader();
|
|
141 }
|
|
142
|
|
143 // process each chromosome
|
|
144 masterBedCovListMap::const_iterator chromItr = _bed->bedCovListMap.begin();
|
|
145 masterBedCovListMap::const_iterator chromEnd = _bed->bedCovListMap.end();
|
|
146 for (; chromItr != chromEnd; ++chromItr) {
|
|
147 // for each chrom, process each bin
|
|
148 binsToBedCovLists::const_iterator binItr = chromItr->second.begin();
|
|
149 binsToBedCovLists::const_iterator binEnd = chromItr->second.end();
|
|
150 for (; binItr != binEnd; ++binItr) {
|
|
151 // for each chrom & bin, compute and report
|
|
152 // the observed coverage for each feature
|
|
153 vector<BEDCOVLIST>::const_iterator bedItr = binItr->second.begin();
|
|
154 vector<BEDCOVLIST>::const_iterator bedEnd = binItr->second.end();
|
|
155 for (; bedItr != bedEnd; ++bedItr) {
|
|
156 // print the main BED entry.
|
|
157 _bed->reportBedTab(*bedItr);
|
|
158
|
|
159 // now report the coverage from each annotation file.
|
|
160 for (size_t i = 0; i < _annoFiles.size(); ++i) {
|
|
161 unsigned int totalLength = 0;
|
|
162 int zeroDepthCount = 0; // number of bases with zero depth
|
|
163 int depth = 0; // tracks the depth at the current base
|
|
164
|
|
165 // the start is either the first base in the feature OR
|
|
166 // the leftmost position of an overlapping feature. e.g. (s = start):
|
|
167 // A ----------
|
|
168 // B s ------------
|
|
169 int start = min(bedItr->minOverlapStarts[i], bedItr->start);
|
|
170
|
|
171 map<unsigned int, DEPTH>::const_iterator depthItr;
|
|
172 map<unsigned int, DEPTH>::const_iterator depthEnd;
|
|
173
|
|
174 // compute the coverage observed at each base in the feature marching from start to end.
|
|
175 for (CHRPOS pos = start+1; pos <= bedItr->end; pos++) {
|
|
176 // map pointer grabbing the starts and ends observed at this position
|
|
177 depthItr = bedItr->depthMapList[i].find(pos);
|
|
178 depthEnd = bedItr->depthMapList[i].end();
|
|
179
|
|
180 // increment coverage if starts observed at this position.
|
|
181 if (depthItr != depthEnd)
|
|
182 depth += depthItr->second.starts;
|
|
183 // update zero depth
|
|
184 if ((pos > bedItr->start) && (pos <= bedItr->end) && (depth == 0))
|
|
185 zeroDepthCount++;
|
|
186 // decrement coverage if ends observed at this position.
|
|
187 if (depthItr != depthEnd)
|
|
188 depth = depth - depthItr->second.ends;
|
|
189 }
|
|
190 // Summarize the coverage for the current interval,
|
|
191 CHRPOS length = bedItr->end - bedItr->start;
|
|
192 totalLength += length;
|
|
193 int nonZeroBases = (length - zeroDepthCount);
|
|
194 float fractCovered = (float) nonZeroBases / length;
|
|
195 if (_reportCounts == false && _reportBoth == false)
|
|
196 printf("%f\t", fractCovered);
|
|
197 else if (_reportCounts == true && _reportBoth == false)
|
|
198 printf("%d\t", bedItr->counts[i]);
|
|
199 else if (_reportCounts == false && _reportBoth == true)
|
|
200 printf("%d\t%f\t", bedItr->counts[i], fractCovered);
|
|
201 }
|
|
202 // print newline for next feature.
|
|
203 printf("\n");
|
|
204 }
|
|
205 }
|
|
206 }
|
|
207 }
|
|
208
|
|
209
|