Mercurial > repos > aaronquinlan > multi_intersect
comparison BEDTools-Version-2.14.3/src/subtractBed/subtractBed.cpp @ 1:bec36315bd12 default tip
Deleted selected files
author | aaronquinlan |
---|---|
date | Sat, 19 Nov 2011 14:17:03 -0500 |
parents | dfcd8b6c1bda |
children |
comparison
equal
deleted
inserted
replaced
0:dfcd8b6c1bda | 1:bec36315bd12 |
---|---|
1 /***************************************************************************** | |
2 subtractBed.cpp | |
3 | |
4 (c) 2009 - Aaron Quinlan | |
5 Hall Laboratory | |
6 Department of Biochemistry and Molecular Genetics | |
7 University of Virginia | |
8 aaronquinlan@gmail.com | |
9 | |
10 Licenced under the GNU General Public License 2.0 license. | |
11 ******************************************************************************/ | |
12 #include "lineFileUtilities.h" | |
13 #include "subtractBed.h" | |
14 | |
15 | |
16 /* | |
17 Constructor | |
18 */ | |
19 BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float overlapFraction, bool sameStrand, bool diffStrand) { | |
20 | |
21 _bedAFile = bedAFile; | |
22 _bedBFile = bedBFile; | |
23 _overlapFraction = overlapFraction; | |
24 _sameStrand = sameStrand; | |
25 _diffStrand = diffStrand; | |
26 | |
27 _bedA = new BedFile(bedAFile); | |
28 _bedB = new BedFile(bedBFile); | |
29 | |
30 SubtractBed(); | |
31 } | |
32 | |
33 | |
34 /* | |
35 Destructor | |
36 */ | |
37 BedSubtract::~BedSubtract(void) { | |
38 } | |
39 | |
40 | |
41 void BedSubtract::FindAndSubtractOverlaps(BED &a, vector<BED> &hits) { | |
42 | |
43 // find all of the overlaps between a and B. | |
44 _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand); | |
45 | |
46 // is A completely spanned by an entry in B? | |
47 // if so, A should not be reported. | |
48 int numConsumedByB = 0; | |
49 int numOverlaps = 0; | |
50 vector<BED> bOverlaps; // list of hits in B. Special processing if there are multiple. | |
51 | |
52 vector<BED>::const_iterator h = hits.begin(); | |
53 vector<BED>::const_iterator hitsEnd = hits.end(); | |
54 for (; h != hitsEnd; ++h) { | |
55 | |
56 int s = max(a.start, h->start); | |
57 int e = min(a.end, h->end); | |
58 int overlapBases = (e - s); // the number of overlapping bases b/w a and b | |
59 int aLength = (a.end - a.start); // the length of a in b.p. | |
60 | |
61 if (s < e) { | |
62 | |
63 // is there enough overlap (default ~ 1bp) | |
64 float overlap = ((float) overlapBases / (float) aLength); | |
65 | |
66 if (overlap >= 1.0) { | |
67 numOverlaps++; | |
68 numConsumedByB++; | |
69 } | |
70 else if ( overlap >= _overlapFraction ) { | |
71 numOverlaps++; | |
72 bOverlaps.push_back(*h); | |
73 } | |
74 } | |
75 } | |
76 | |
77 if (numOverlaps == 0) { | |
78 // no overlap found, so just report A as-is. | |
79 _bedA->reportBedNewLine(a); | |
80 } | |
81 else if (numOverlaps == 1) { | |
82 // one overlap found. only need to look at the single | |
83 // entry in bOverlaps. | |
84 | |
85 // if A was not "consumed" by any entry in B | |
86 if (numConsumedByB == 0) { | |
87 | |
88 BED theHit = bOverlaps[0]; | |
89 | |
90 // A ++++++++++++ | |
91 // B ---- | |
92 // Res. ==== ==== | |
93 if ( (theHit.start > a.start) && (theHit.end < a.end) ) { | |
94 _bedA->reportBedRangeNewLine(a,a.start,theHit.start); | |
95 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
96 } | |
97 // A ++++++++++++ | |
98 // B ---------- | |
99 // Res. == | |
100 else if (theHit.start == a.start) { | |
101 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
102 } | |
103 // A ++++++++++++ | |
104 // B ---------- | |
105 // Res. ==== | |
106 else if (theHit.start < a.start) { | |
107 _bedA->reportBedRangeNewLine(a,theHit.end,a.end); | |
108 } | |
109 // A ++++++++++++ | |
110 // B ---------- | |
111 // Res. ======= | |
112 else if (theHit.start > a.start) { | |
113 _bedA->reportBedRangeNewLine(a,a.start,theHit.start); | |
114 } | |
115 } | |
116 } | |
117 else if (numOverlaps > 1) { | |
118 // multiple overlapz found. look at all the hits | |
119 // and figure out which bases in A survived. then | |
120 // report the contigous intervals that survived. | |
121 | |
122 vector<bool> aKeep(a.end - a.start, true); | |
123 | |
124 if (numConsumedByB == 0) { | |
125 // track the number of hit starts and ends at each position in A | |
126 for (vector<BED>::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) { | |
127 int s = max(a.start, h->start); | |
128 int e = min(a.end, h->end); | |
129 | |
130 for (int i = s+1; i <= e; ++i) { | |
131 aKeep[i-a.start-1] = false; | |
132 } | |
133 } | |
134 // report the remaining blocks. | |
135 for (unsigned int i = 0; i < aKeep.size(); ++i) { | |
136 if (aKeep[i] == true) { | |
137 CHRPOS blockStart = i + a.start; | |
138 while ((aKeep[i] == true) && (i < aKeep.size())) { | |
139 i++; | |
140 } | |
141 CHRPOS blockEnd = i + a.start; | |
142 blockEnd = min(a.end, blockEnd); | |
143 _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); | |
144 } | |
145 } | |
146 } | |
147 } | |
148 } | |
149 | |
150 | |
151 | |
152 void BedSubtract::SubtractBed() { | |
153 | |
154 // load the "B" bed file into a map so | |
155 // that we can easily compare "A" to it for overlaps | |
156 _bedB->loadBedFileIntoMap(); | |
157 | |
158 BED a, nullBed; | |
159 BedLineStatus bedStatus; | |
160 int lineNum = 0; // current input line number | |
161 vector<BED> hits; // vector of potential hits | |
162 // reserve some space | |
163 hits.reserve(100); | |
164 | |
165 _bedA->Open(); | |
166 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { | |
167 if (bedStatus == BED_VALID) { | |
168 FindAndSubtractOverlaps(a, hits); | |
169 hits.clear(); | |
170 a = nullBed; | |
171 } | |
172 } | |
173 _bedA->Close(); | |
174 | |
175 } | |
176 // END Intersect | |
177 | |
178 |