0
|
1 /*****************************************************************************
|
|
2 subtractBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "subtractBed.h"
|
|
14
|
|
15
|
|
16 /*
|
|
17 Constructor
|
|
18 */
|
|
19 BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float overlapFraction, bool sameStrand, bool diffStrand) {
|
|
20
|
|
21 _bedAFile = bedAFile;
|
|
22 _bedBFile = bedBFile;
|
|
23 _overlapFraction = overlapFraction;
|
|
24 _sameStrand = sameStrand;
|
|
25 _diffStrand = diffStrand;
|
|
26
|
|
27 _bedA = new BedFile(bedAFile);
|
|
28 _bedB = new BedFile(bedBFile);
|
|
29
|
|
30 SubtractBed();
|
|
31 }
|
|
32
|
|
33
|
|
34 /*
|
|
35 Destructor
|
|
36 */
|
|
37 BedSubtract::~BedSubtract(void) {
|
|
38 }
|
|
39
|
|
40
|
|
41 void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) {
|
|
42
|
|
43 // find all of the overlaps between a and B.
|
|
44 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand);
|
|
45
|
|
46 // is A completely spanned by an entry in B?
|
|
47 // if so, A should not be reported.
|
|
48 int numConsumedByB = 0;
|
|
49 int numOverlaps = 0;
|
|
50 vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple.
|
|
51
|
|
52 vector<BED>::const_iterator h = hits.begin();
|
|
53 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
54 for (; h != hitsEnd; ++h) {
|
|
55
|
|
56 int s = max(a.start, h->start);
|
|
57 int e = min(a.end, h->end);
|
|
58 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
59 int aLength = (a.end - a.start); // the length of a in b.p.
|
|
60
|
|
61 if (s < e) {
|
|
62
|
|
63 // is there enough overlap (default ~ 1bp)
|
|
64 float overlap = ((float) overlapBases / (float) aLength);
|
|
65
|
|
66 if (overlap >= 1.0) {
|
|
67 numOverlaps++;
|
|
68 numConsumedByB++;
|
|
69 }
|
|
70 else if ( overlap >= _overlapFraction ) {
|
|
71 numOverlaps++;
|
|
72 bOverlaps.push_back(*h);
|
|
73 }
|
|
74 }
|
|
75 }
|
|
76
|
|
77 if (numOverlaps == 0) {
|
|
78 // no overlap found, so just report A as-is.
|
|
79 _bedA->reportBedNewLine(a);
|
|
80 }
|
|
81 else if (numOverlaps == 1) {
|
|
82 // one overlap found. only need to look at the single
|
|
83 // entry in bOverlaps.
|
|
84
|
|
85 // if A was not "consumed" by any entry in B
|
|
86 if (numConsumedByB == 0) {
|
|
87
|
|
88 BED theHit = bOverlaps[0];
|
|
89
|
|
90 // A ++++++++++++
|
|
91 // B ----
|
|
92 // Res. ==== ====
|
|
93 if ( (theHit.start > a.start) && (theHit.end < a.end) ) {
|
|
94 _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
|
|
95 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
|
|
96 }
|
|
97 // A ++++++++++++
|
|
98 // B ----------
|
|
99 // Res. ==
|
|
100 else if (theHit.start == a.start) {
|
|
101 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
|
|
102 }
|
|
103 // A ++++++++++++
|
|
104 // B ----------
|
|
105 // Res. ====
|
|
106 else if (theHit.start < a.start) {
|
|
107 _bedA->reportBedRangeNewLine(a,theHit.end,a.end);
|
|
108 }
|
|
109 // A ++++++++++++
|
|
110 // B ----------
|
|
111 // Res. =======
|
|
112 else if (theHit.start > a.start) {
|
|
113 _bedA->reportBedRangeNewLine(a,a.start,theHit.start);
|
|
114 }
|
|
115 }
|
|
116 }
|
|
117 else if (numOverlaps > 1) {
|
|
118 // multiple overlapz found. look at all the hits
|
|
119 // and figure out which bases in A survived. then
|
|
120 // report the contigous intervals that survived.
|
|
121
|
|
122 vector<bool> aKeep(a.end - a.start, true);
|
|
123
|
|
124 if (numConsumedByB == 0) {
|
|
125 // track the number of hit starts and ends at each position in A
|
|
126 for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) {
|
|
127 int s = max(a.start, h->start);
|
|
128 int e = min(a.end, h->end);
|
|
129
|
|
130 for (int i = s+1; i <= e; ++i) {
|
|
131 aKeep[i-a.start-1] = false;
|
|
132 }
|
|
133 }
|
|
134 // report the remaining blocks.
|
|
135 for (unsigned int i = 0; i < aKeep.size(); ++i) {
|
|
136 if (aKeep[i] == true) {
|
|
137 CHRPOS blockStart = i + a.start;
|
|
138 while ((aKeep[i] == true) && (i < aKeep.size())) {
|
|
139 i++;
|
|
140 }
|
|
141 CHRPOS blockEnd = i + a.start;
|
|
142 blockEnd = min(a.end, blockEnd);
|
|
143 _bedA->reportBedRangeNewLine(a,blockStart,blockEnd);
|
|
144 }
|
|
145 }
|
|
146 }
|
|
147 }
|
|
148 }
|
|
149
|
|
150
|
|
151
|
|
152 void BedSubtract::SubtractBed() {
|
|
153
|
|
154 // load the "B" bed file into a map so
|
|
155 // that we can easily compare "A" to it for overlaps
|
|
156 _bedB->loadBedFileIntoMap();
|
|
157
|
|
158 BED a, nullBed;
|
|
159 BedLineStatus bedStatus;
|
|
160 int lineNum = 0; // current input line number
|
|
161 vector<BED> hits; // vector of potential hits
|
|
162 // reserve some space
|
|
163 hits.reserve(100);
|
|
164
|
|
165 _bedA->Open();
|
|
166 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
167 if (bedStatus == BED_VALID) {
|
|
168 FindAndSubtractOverlaps(a, hits);
|
|
169 hits.clear();
|
|
170 a = nullBed;
|
|
171 }
|
|
172 }
|
|
173 _bedA->Close();
|
|
174
|
|
175 }
|
|
176 // END Intersect
|
|
177
|
|
178
|