0
|
1 /*****************************************************************************
|
|
2 windowBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "windowBed.h"
|
|
14
|
|
15
|
|
16 /*
|
|
17 Constructor
|
|
18 */
|
|
19 BedWindow::BedWindow(string bedAFile, string bedBFile, int leftSlop, int rightSlop,
|
|
20 bool anyHit, bool noHit, bool writeCount, bool strandWindows,
|
|
21 bool matchOnSameStrand, bool matchOnDiffStrand, bool bamInput, bool bamOutput, bool isUncompressedBam) {
|
|
22
|
|
23 _bedAFile = bedAFile;
|
|
24 _bedBFile = bedBFile;
|
|
25
|
|
26 _leftSlop = leftSlop;
|
|
27 _rightSlop = rightSlop;
|
|
28
|
|
29 _anyHit = anyHit;
|
|
30 _noHit = noHit;
|
|
31 _writeCount = writeCount;
|
|
32 _strandWindows = strandWindows;
|
|
33 _matchOnSameStrand = matchOnSameStrand;
|
|
34 _matchOnDiffStrand = matchOnDiffStrand;
|
|
35 _bamInput = bamInput;
|
|
36 _bamOutput = bamOutput;
|
|
37 _isUncompressedBam = isUncompressedBam;
|
|
38
|
|
39 _bedA = new BedFile(bedAFile);
|
|
40 _bedB = new BedFile(bedBFile);
|
|
41
|
|
42 if (_bamInput == false)
|
|
43 WindowIntersectBed();
|
|
44 else
|
|
45 WindowIntersectBam(_bedAFile);
|
|
46 }
|
|
47
|
|
48
|
|
49
|
|
50 /*
|
|
51 Destructor
|
|
52 */
|
|
53 BedWindow::~BedWindow(void) {
|
|
54 }
|
|
55
|
|
56
|
|
57
|
|
58 void BedWindow::FindWindowOverlaps(const BED &a, vector<BED> &hits) {
|
|
59
|
|
60 /*
|
|
61 Adjust the start and end of a based on the requested window
|
|
62 */
|
|
63
|
|
64 // update the current feature's start and end
|
|
65 // according to the slop requested (slop = 0 by default)
|
|
66 CHRPOS aFudgeStart = 0;
|
|
67 CHRPOS aFudgeEnd;
|
|
68 AddWindow(a, aFudgeStart, aFudgeEnd);
|
|
69
|
|
70 /*
|
|
71 Now report the hits (if any) based on the window around a.
|
|
72 */
|
|
73 // get the hits in B for the A feature
|
|
74 _bedB->FindOverlapsPerBin(a.chrom, aFudgeStart, aFudgeEnd, a.strand, hits, _matchOnSameStrand, _matchOnDiffStrand);
|
|
75
|
|
76 int numOverlaps = 0;
|
|
77
|
|
78 // loop through the hits and report those that meet the user's criteria
|
|
79 vector<BED>::const_iterator h = hits.begin();
|
|
80 vector<BED>::const_iterator hitsEnd = hits.end();
|
|
81 for (; h != hitsEnd; ++h) {
|
|
82
|
|
83 int s = max(aFudgeStart, h->start);
|
|
84 int e = min(aFudgeEnd, h->end);
|
|
85 int overlapBases = (e - s); // the number of overlapping bases b/w a and b
|
|
86 int aLength = (a.end - a.start); // the length of a in b.p.
|
|
87
|
|
88 if (s < e) {
|
|
89 // is there enough overlap (default ~ 1bp)
|
|
90 if ( ((float) overlapBases / (float) aLength) > 0 ) {
|
|
91 numOverlaps++;
|
|
92 if (_anyHit == false && _noHit == false && _writeCount == false) {
|
|
93 _bedA->reportBedTab(a);
|
|
94 _bedB->reportBedNewLine(*h);
|
|
95 }
|
|
96 }
|
|
97 }
|
|
98 }
|
|
99 if (_anyHit == true && (numOverlaps >= 1)) {
|
|
100 _bedA->reportBedNewLine(a); }
|
|
101 else if (_writeCount == true) {
|
|
102 _bedA->reportBedTab(a); printf("\t%d\n", numOverlaps);
|
|
103 }
|
|
104 else if (_noHit == true && (numOverlaps == 0)) {
|
|
105 _bedA->reportBedNewLine(a);
|
|
106 }
|
|
107 }
|
|
108
|
|
109
|
|
110 bool BedWindow::FindOneOrMoreWindowOverlaps(const BED &a) {
|
|
111
|
|
112 // update the current feature's start and end
|
|
113 // according to the slop requested (slop = 0 by default)
|
|
114 CHRPOS aFudgeStart = 0;
|
|
115 CHRPOS aFudgeEnd;
|
|
116 AddWindow(a, aFudgeStart, aFudgeEnd);
|
|
117
|
|
118 bool overlapsFound = _bedB->FindOneOrMoreOverlapsPerBin(a.chrom, a.start, a.end, a.strand, _matchOnSameStrand, _matchOnDiffStrand);
|
|
119 return overlapsFound;
|
|
120 }
|
|
121
|
|
122
|
|
123 void BedWindow::WindowIntersectBed() {
|
|
124
|
|
125 // load the "B" bed file into a map so
|
|
126 // that we can easily compare "A" to it for overlaps
|
|
127 _bedB->loadBedFileIntoMap();
|
|
128
|
|
129 BED a, nullBed;
|
|
130 int lineNum = 0; // current input line number
|
|
131 BedLineStatus bedStatus;
|
|
132 vector<BED> hits; // vector of potential hits
|
|
133 hits.reserve(100);
|
|
134
|
|
135 _bedA->Open();
|
|
136 while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) {
|
|
137 if (bedStatus == BED_VALID) {
|
|
138 FindWindowOverlaps(a, hits);
|
|
139 hits.clear();
|
|
140 a = nullBed;
|
|
141 }
|
|
142 }
|
|
143 _bedA->Close();
|
|
144 }
|
|
145
|
|
146
|
|
147 void BedWindow::WindowIntersectBam(string bamFile) {
|
|
148
|
|
149 // load the "B" bed file into a map so
|
|
150 // that we can easily compare "A" to it for overlaps
|
|
151 _bedB->loadBedFileIntoMap();
|
|
152
|
|
153 // open the BAM file
|
|
154 BamReader reader;
|
|
155 BamWriter writer;
|
|
156 reader.Open(bamFile);
|
|
157
|
|
158 // get header & reference information
|
|
159 string bamHeader = reader.GetHeaderText();
|
|
160 RefVector refs = reader.GetReferenceData();
|
|
161
|
|
162 // open a BAM output to stdout if we are writing BAM
|
|
163 if (_bamOutput == true) {
|
|
164 // set compression mode
|
|
165 BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
|
|
166 if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
|
|
167 writer.SetCompressionMode(compressionMode);
|
|
168 // open our BAM writer
|
|
169 writer.Open("stdout", bamHeader, refs);
|
|
170 }
|
|
171
|
|
172 vector<BED> hits; // vector of potential hits
|
|
173 // reserve some space
|
|
174 hits.reserve(100);
|
|
175
|
|
176 _bedA->bedType = 6;
|
|
177 BamAlignment bam;
|
|
178 bool overlapsFound;
|
|
179 // get each set of alignments for each pair.
|
|
180 while (reader.GetNextAlignment(bam)) {
|
|
181
|
|
182 if (bam.IsMapped()) {
|
|
183 BED a;
|
|
184 a.chrom = refs.at(bam.RefID).RefName;
|
|
185 a.start = bam.Position;
|
|
186 a.end = bam.GetEndPosition(false, false);
|
|
187
|
|
188 // build the name field from the BAM alignment.
|
|
189 a.name = bam.Name;
|
|
190 if (bam.IsFirstMate()) a.name += "/1";
|
|
191 if (bam.IsSecondMate()) a.name += "/2";
|
|
192
|
|
193 a.score = ToString(bam.MapQuality);
|
|
194 a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";
|
|
195
|
|
196 if (_bamOutput == true) {
|
|
197 overlapsFound = FindOneOrMoreWindowOverlaps(a);
|
|
198 if (overlapsFound == true) {
|
|
199 if (_noHit == false)
|
|
200 writer.SaveAlignment(bam);
|
|
201 }
|
|
202 else {
|
|
203 if (_noHit == true)
|
|
204 writer.SaveAlignment(bam);
|
|
205 }
|
|
206 }
|
|
207 else {
|
|
208 FindWindowOverlaps(a, hits);
|
|
209 hits.clear();
|
|
210 }
|
|
211 }
|
|
212 // BAM IsMapped() is false
|
|
213 else if (_noHit == true) {
|
|
214 writer.SaveAlignment(bam);
|
|
215 }
|
|
216 }
|
|
217
|
|
218 // close the relevant BAM files.
|
|
219 reader.Close();
|
|
220 if (_bamOutput == true) {
|
|
221 writer.Close();
|
|
222 }
|
|
223 }
|
|
224
|
|
225
|
|
226 void BedWindow::AddWindow(const BED &a, CHRPOS &fudgeStart, CHRPOS &fudgeEnd) {
|
|
227 // Does the user want to treat the windows based on strand?
|
|
228 // If so,
|
|
229 // if "+", then left is left and right is right
|
|
230 // if "-", the left is right and right is left.
|
|
231 if (_strandWindows) {
|
|
232 if (a.strand == "+") {
|
|
233 if ((int) (a.start - _leftSlop) > 0)
|
|
234 fudgeStart = a.start - _leftSlop;
|
|
235 else fudgeStart = 0;
|
|
236 fudgeEnd = a.end + _rightSlop;
|
|
237 }
|
|
238 else {
|
|
239 if ((int) (a.start - _rightSlop) > 0)
|
|
240 fudgeStart = a.start - _rightSlop;
|
|
241 else fudgeStart = 0;
|
|
242 fudgeEnd = a.end + _leftSlop;
|
|
243 }
|
|
244 }
|
|
245 // If not, add the windows irrespective of strand
|
|
246 else {
|
|
247 if ((int) (a.start - _leftSlop) > 0)
|
|
248 fudgeStart = a.start - _leftSlop;
|
|
249 else fudgeStart = 0;
|
|
250 fudgeEnd = a.end + _rightSlop;
|
|
251 }
|
|
252 }
|
|
253
|