annotate BEDTools-Version-2.14.3/src/subtractBed/subtractBed.cpp @ 0:dfcd8b6c1bda

Uploaded
author aaronquinlan
date Thu, 03 Nov 2011 10:25:04 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
1 /*****************************************************************************
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
2 subtractBed.cpp
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
3
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
4 (c) 2009 - Aaron Quinlan
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
5 Hall Laboratory
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
6 Department of Biochemistry and Molecular Genetics
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
7 University of Virginia
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
8 aaronquinlan@gmail.com
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
9
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
10 Licenced under the GNU General Public License 2.0 license.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
11 ******************************************************************************/
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
12 #include "lineFileUtilities.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
13 #include "subtractBed.h"
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
14
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
15
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
16 /*
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
17 Constructor
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
18 */
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
19 BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float overlapFraction, bool sameStrand, bool diffStrand) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
20
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
21 _bedAFile = bedAFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
22 _bedBFile = bedBFile;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
23 _overlapFraction = overlapFraction;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
24 _sameStrand = sameStrand;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
25 _diffStrand = diffStrand;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
26
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
27 _bedA = new BedFile(bedAFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
28 _bedB = new BedFile(bedBFile);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
29
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
30 SubtractBed();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
31 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
32
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
33
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
34 /*
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
35 Destructor
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
36 */
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
37 BedSubtract::~BedSubtract(void) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
38 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
39
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
40
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
41 void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
42
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
43 // find all of the overlaps between a and B.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
44 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
45
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
46 // is A completely spanned by an entry in B?
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
47 // if so, A should not be reported.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
48 int numConsumedByB = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
49 int numOverlaps = 0;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
50 vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
51
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
52 vector<BED>::const_iterator h = hits.begin();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
53 vector<BED>::const_iterator hitsEnd = hits.end();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
54 for (; h != hitsEnd; ++h) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
55
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
56 int s = max(a.start, h->start);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
57 int e = min(a.end, h->end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
58 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
59 int aLength = (a.end - a.start); // the length of a in b.p.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
60
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
61 if (s < e) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
62
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
63 // is there enough overlap (default ~ 1bp)
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
64 float overlap = ((float) overlapBases / (float) aLength);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
65
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
66 if (overlap >= 1.0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
67 numOverlaps++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
68 numConsumedByB++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
69 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
70 else if ( overlap >= _overlapFraction ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
71 numOverlaps++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
72 bOverlaps.push_back(*h);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
73 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
74 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
75 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
76
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
77 if (numOverlaps == 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
78 // no overlap found, so just report A as-is.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
79 _bedA->reportBedNewLine(a);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
80 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
81 else if (numOverlaps == 1) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
82 // one overlap found. only need to look at the single
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
83 // entry in bOverlaps.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
84
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
85 // if A was not "consumed" by any entry in B
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
86 if (numConsumedByB == 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
87
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
88 BED theHit = bOverlaps[0];
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
89
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
90 // A ++++++++++++
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
91 // B ----
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
92 // Res. ==== ====
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
93 if ( (theHit.start > a.start) && (theHit.end < a.end) ) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
94 _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
95 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
96 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
97 // A ++++++++++++
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
98 // B ----------
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
99 // Res. ==
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
100 else if (theHit.start == a.start) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
101 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
102 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
103 // A ++++++++++++
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
104 // B ----------
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
105 // Res. ====
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
106 else if (theHit.start < a.start) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
107 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
108 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
109 // A ++++++++++++
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
110 // B ----------
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
111 // Res. =======
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
112 else if (theHit.start > a.start) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
113 _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
114 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
115 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
116 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
117 else if (numOverlaps > 1) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
118 // multiple overlapz found. look at all the hits
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
119 // and figure out which bases in A survived. then
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
120 // report the contigous intervals that survived.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
121
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
122 vector<bool> aKeep(a.end - a.start, true);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
123
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
124 if (numConsumedByB == 0) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
125 // track the number of hit starts and ends at each position in A
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
126 for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
127 int s = max(a.start, h->start);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
128 int e = min(a.end, h->end);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
129
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
130 for (int i = s+1; i <= e; ++i) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
131 aKeep[i-a.start-1] = false;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
132 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
133 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
134 // report the remaining blocks.
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
135 for (unsigned int i = 0; i < aKeep.size(); ++i) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
136 if (aKeep[i] == true) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
137 CHRPOS blockStart = i + a.start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
138 while ((aKeep[i] == true) && (i < aKeep.size())) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
139 i++;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
140 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
141 CHRPOS blockEnd = i + a.start;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
142 blockEnd = min(a.end, blockEnd);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
143 _bedA->reportBedRangeNewLine(a,blockStart,blockEnd);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
144 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
145 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
146 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
147 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
148 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
149
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
150
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
151
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
152 void BedSubtract::SubtractBed() {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
153
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
154 // load the "B" bed file into a map so
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
155 // that we can easily compare "A" to it for overlaps
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
156 _bedB->loadBedFileIntoMap();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
157
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
158 BED a, nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
159 BedLineStatus bedStatus;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
160 int lineNum = 0; // current input line number
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
161 vector<BED> hits; // vector of potential hits
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
162 // reserve some space
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
163 hits.reserve(100);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
164
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
165 _bedA->Open();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
166 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
167 if (bedStatus == BED_VALID) {
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
168 FindAndSubtractOverlaps(a, hits);
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
169 hits.clear();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
170 a = nullBed;
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
171 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
172 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
173 _bedA->Close();
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
174
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
175 }
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
176 // END Intersect
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
177
dfcd8b6c1bda Uploaded
aaronquinlan
parents:
diff changeset
178