annotate BEDTools-Version-2.14.3/src/mergeBed/mergeBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 /*****************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 mergeBed.cpp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 (c) 2009 - Aaron Quinlan
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 Hall Laboratory
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 Department of Biochemistry and Molecular Genetics
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 University of Virginia
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8 aaronquinlan@gmail.com
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 Licenced under the GNU General Public License 2.0 license.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11 ******************************************************************************/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include "lineFileUtilities.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include "mergeBed.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 void BedMerge::ReportMergedNames(const vector<string> &names) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18 if (names.size() > 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 printf("\t");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20 vector<string>::const_iterator nameItr = names.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 vector<string>::const_iterator nameEnd = names.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 for (; nameItr != nameEnd; ++nameItr) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 if (nameItr < (nameEnd - 1))
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24 cout << *nameItr << ";";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26 cout << *nameItr;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
29 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
30 cerr << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
31 << "*****" << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
32 << "*****ERROR: No names found to report for the -names option. Exiting." << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
33 << "*****" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
34 exit(1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
35 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
36 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
37
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
38
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
39 void BedMerge::ReportMergedScores(const vector<string> &scores) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
40 if (scores.size() > 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
41 printf("\t");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43 // convert the scores to floats
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44 vector<float> data;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45 for (size_t i = 0 ; i < scores.size() ; i++) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46 data.push_back(atof(scores[i].c_str()));
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49 if (_scoreOp == "sum") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 printf("%.3f", accumulate(data.begin(), data.end(), 0.0));
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52 else if (_scoreOp == "min") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53 printf("%.3f", *min_element( data.begin(), data.end() ));
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55 else if (_scoreOp == "max") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56 printf("%.3f", *max_element( data.begin(), data.end() ));
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58 else if (_scoreOp == "mean") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 double total = accumulate(data.begin(), data.end(), 0.0);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60 double mean = total / data.size();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61 printf("%.3f", mean);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63 else if (_scoreOp == "median") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64 double median = 0.0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65 sort(data.begin(), data.end());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66 int totalLines = data.size();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67 if ((totalLines % 2) > 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68 long mid;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 mid = totalLines / 2;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 median = data[mid];
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 long midLow, midHigh;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 midLow = (totalLines / 2) - 1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 midHigh = (totalLines / 2);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76 median = (data[midLow] + data[midHigh]) / 2.0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78 printf("%.3f", median);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
79 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
80 else if ((_scoreOp == "mode") || (_scoreOp == "antimode")) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
81 // compute the frequency of each unique value
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
82 map<string, int> freqs;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
83 vector<string>::const_iterator dIt = scores.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
84 vector<string>::const_iterator dEnd = scores.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
85 for (; dIt != dEnd; ++dIt) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
86 freqs[*dIt]++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
87 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
88
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
89 // grab the mode and the anti mode
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
90 string mode, antiMode;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
91 int count = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
92 int minCount = INT_MAX;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
93 for(map<string,int>::const_iterator iter = freqs.begin(); iter != freqs.end(); ++iter) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
94 if (iter->second > count) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
95 mode = iter->first;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
96 count = iter->second;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
97 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
98 if (iter->second < minCount) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
99 antiMode = iter->first;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
100 minCount = iter->second;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
101 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
102 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
103 // report
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
104 if (_scoreOp == "mode") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
105 printf("%s", mode.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
106 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
107 else if (_scoreOp == "antimode") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
108 printf("%s", antiMode.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
109 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
110 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
111 else if (_scoreOp == "collapse") {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
112 vector<string>::const_iterator scoreItr = scores.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
113 vector<string>::const_iterator scoreEnd = scores.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
114 for (; scoreItr != scoreEnd; ++scoreItr) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
115 if (scoreItr < (scoreEnd - 1))
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
116 cout << *scoreItr << ";";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
117 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
118 cout << *scoreItr;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
119 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
120 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
121 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
122 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
123 cerr << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
124 << "*****" << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
125 << "*****ERROR: No scores found to report for the -scores option. Exiting." << endl
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
126 << "*****" << endl;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
127 exit(1);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
128 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
129 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
130
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
131 // ===============
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
132 // = Constructor =
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
133 // ===============
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
134 BedMerge::BedMerge(string &bedFile,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
135 bool numEntries,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
136 int maxDistance,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
137 bool forceStrand,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
138 bool reportNames,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
139 bool reportScores,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
140 const string &scoreOp) :
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
141 _bedFile(bedFile),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
142 _numEntries(numEntries),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
143 _forceStrand(forceStrand),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
144 _reportNames(reportNames),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
145 _reportScores(reportScores),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
146 _scoreOp(scoreOp),
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
147 _maxDistance(maxDistance)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
148 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
149 _bed = new BedFile(bedFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
150
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
151 if (_forceStrand == false)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
152 MergeBed();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
153 else
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
154 MergeBedStranded();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
155 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
156
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
157
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
158 // =================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
159 // = Destructor =
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
160 // =================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
161 BedMerge::~BedMerge(void) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
162 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
163
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
164
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
165 // ===============================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
166 // Convenience method for reporting merged blocks
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
167 // ================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
168 void BedMerge::Report(string chrom, int start, int end,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
169 const vector<string> &names, const vector<string> &scores, int mergeCount)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
170 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
171 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
172 //if (_bed->isZeroBased == false) {start++;}
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
173
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
174 printf("%s\t%d\t%d", chrom.c_str(), start, end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
175 // just the merged intervals
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
176 if (_numEntries == false && _reportNames == false && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
177 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
178 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
179 // merged intervals and counts
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
180 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
181 printf("\t%d\n", mergeCount);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
182 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
183 // merged intervals and names
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
184 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
185 ReportMergedNames(names);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
186 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
187 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
188 // merged intervals and scores
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
189 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
190 ReportMergedScores(scores);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
191 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
192 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
193 // merged intervals, names, and scores
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
194 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
195 ReportMergedNames(names);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
196 ReportMergedScores(scores);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
197 printf("\n");
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
198 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
199 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
200
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
201
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
202 // =========================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
203 // Convenience method for reporting merged blocks by strand
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
204 // =========================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
205 void BedMerge::ReportStranded(string chrom, int start, int end,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
206 const vector<string> &names, const vector<string> &scores,
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
207 int mergeCount, string strand)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
208 {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
209 // ARQ: removed to force all output to be zero-based, BED format, reagrdless of input type
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
210 //if (_bed->isZeroBased == false) {start++;}
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
211
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
212 printf("%s\t%d\t%d", chrom.c_str(), start, end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
213 // just the merged intervals
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
214 if (_numEntries == false && _reportNames == false && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
215 printf("\t%s\n", strand.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
216 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
217 // merged intervals and counts
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
218 else if (_numEntries == true && _reportNames == false && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
219 printf("\t%d\t%s\n", mergeCount, strand.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
220 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
221 // merged intervals and names
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
222 else if (_numEntries == false && _reportNames == true && _reportScores == false) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
223 ReportMergedNames(names);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
224 printf("\t%s\n", strand.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
225 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
226 // merged intervals and scores
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
227 else if (_numEntries == false && _reportNames == false && _reportScores == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
228 ReportMergedScores(scores);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
229 printf("\t%s\n", strand.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
230 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
231 // merged intervals, names, and scores
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
232 else if (_numEntries == false && _reportNames == true && _reportScores == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
233 ReportMergedNames(names);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
234 ReportMergedScores(scores);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
235 printf("\t%s\n", strand.c_str());
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
236 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
237 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
238
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
239
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
240 // =====================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
241 // = Merge overlapping BED entries into a single entry =
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
242 // =====================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
243 void BedMerge::MergeBed() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
244
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
245 // load the "B" bed file into a map so
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
246 // that we can easily compare "A" to it for overlaps
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
247 _bed->loadBedFileIntoMapNoBin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
248
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
249 // loop through each chromosome and merge their BED entries
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
250 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
251 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
252 for (; m != mEnd; ++m) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
253
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
254 // bedList is already sorted by start position.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
255 string chrom = m->first;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
256 vector<BED> bedList = m->second;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
257 int mergeCount = 1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
258 vector<string> names;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
259 vector<string> scores;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
260
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
261 // merge overlapping features for this chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
262 int start = -1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
263 int end = -1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
264 vector<BED>::const_iterator bedItr = bedList.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
265 vector<BED>::const_iterator bedEnd = bedList.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
266 for (; bedItr != bedEnd; ++bedItr) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
267 // new block, no overlap
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
268 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
269 if (start >= 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
270 Report(chrom, start, end, names, scores, mergeCount);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
271 // reset
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
272 mergeCount = 1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
273 names.clear();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
274 scores.clear();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
275 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
276 start = bedItr->start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
277 end = bedItr->end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
278 if (!bedItr->name.empty()) names.push_back(bedItr->name);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
279 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
280 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
281 // same block, overlaps
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
282 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
283 if ((int) bedItr-> end > end) end = bedItr->end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
284 mergeCount++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
285 if (!bedItr->name.empty()) names.push_back(bedItr->name);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
286 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
287 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
288 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
289 if (start >= 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
290 Report(chrom, start, end, names, scores, mergeCount);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
291 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
292 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
293 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
294
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
295
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
296 // ==================================================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
297 // = Merge overlapping BED entries into a single entry, accounting for strandedness =
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
298 // ==================================================================================
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
299 void BedMerge::MergeBedStranded() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
300
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
301 // load the "B" bed file into a map so
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
302 // that we can easily compare "A" to it for overlaps
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
303 _bed->loadBedFileIntoMapNoBin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
304
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
305 // loop through each chromosome and merge their BED entries
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
306 masterBedMapNoBin::const_iterator m = _bed->bedMapNoBin.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
307 masterBedMapNoBin::const_iterator mEnd = _bed->bedMapNoBin.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
308 for (; m != mEnd; ++m) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
309
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
310 // bedList is already sorted by start position.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
311 string chrom = m->first;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
312 vector<BED> bedList = m->second;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
313
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
314 // make a list of the two strands to merge separately.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
315 vector<string> strands(2);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
316 strands[0] = "+";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
317 strands[1] = "-";
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
318
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
319 // do two passes, one for each strand.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
320 for (unsigned int s = 0; s < strands.size(); s++) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
321
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
322 int mergeCount = 1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
323 int numOnStrand = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
324 vector<string> names;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
325 vector<string> scores;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
326
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
327 // merge overlapping features for this chromosome.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
328 int start = -1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
329 int end = -1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
330 vector<BED>::const_iterator bedItr = bedList.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
331 vector<BED>::const_iterator bedEnd = bedList.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
332 for (; bedItr != bedEnd; ++bedItr) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
333
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
334 // if forcing strandedness, move on if the hit
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
335 // is not on the current strand.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
336 if (bedItr->strand != strands[s]) { continue; }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
337 else { numOnStrand++; }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
338
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
339 if ( (((int) bedItr->start - end) > _maxDistance) || (end < 0)) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
340 if (start >= 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
341 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
342 // reset
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
343 mergeCount = 1;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
344 names.clear();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
345 scores.clear();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
346 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
347 start = bedItr->start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
348 end = bedItr->end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
349 if (!bedItr->name.empty()) names.push_back(bedItr->name);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
350 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
351 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
352 else {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
353 if ((int) bedItr-> end > end) end = bedItr->end;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
354 mergeCount++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
355 if (!bedItr->name.empty()) names.push_back(bedItr->name);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
356 if (!bedItr->score.empty()) scores.push_back(bedItr->score);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
357 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
358 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
359 if (start >= 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
360 ReportStranded(chrom, start, end, names, scores, mergeCount, strands[s]);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
361 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
362 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
363 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
364 }