0
|
1 /*****************************************************************************
|
|
2 chromsweep.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "chromsweep.h"
|
|
14 #include <queue>
|
|
15
|
|
16 bool after(const BED &a, const BED &b);
|
|
17 void report_hits(const BED &curr_qy, const vector<BED> &hits);
|
|
18 vector<BED> scan_cache(const BED &curr_qy, BedLineStatus qy_status, const vector<BED> &db_cache, vector<BED> &hits);
|
|
19
|
|
20
|
|
21 /*
|
|
22 // constructor using existing BedFile pointers
|
|
23 */
|
|
24 ChromSweep::ChromSweep(BedFile *bedA, BedFile *bedB, bool sameStrand, bool diffStrand)
|
|
25 : _bedA(bedA)
|
|
26 , _bedB(bedB)
|
|
27 , _sameStrand(sameStrand)
|
|
28 , _diffStrand(diffStrand)
|
|
29 {
|
|
30 // prime the results pump.
|
|
31 _qy_lineNum = 0;
|
|
32 _db_lineNum = 0;
|
|
33
|
|
34 _hits.reserve(1000);
|
|
35 _cache.reserve(1000);
|
|
36
|
|
37 _bedA->Open();
|
|
38 _bedB->Open();
|
|
39 _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
|
|
40 _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
|
|
41 }
|
|
42
|
|
43 /*
|
|
44 Constructor with filenames
|
|
45 */
|
|
46 ChromSweep::ChromSweep(string &bedAFile, string &bedBFile)
|
|
47 {
|
|
48 // prime the results pump.
|
|
49 _qy_lineNum = 0;
|
|
50 _db_lineNum = 0;
|
|
51
|
|
52 _hits.reserve(100000);
|
|
53 _cache.reserve(100000);
|
|
54
|
|
55 _bedA = new BedFile(bedAFile);
|
|
56 _bedB = new BedFile(bedBFile);
|
|
57
|
|
58 _bedA->Open();
|
|
59 _bedB->Open();
|
|
60
|
|
61 _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
|
|
62 _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
|
|
63 }
|
|
64
|
|
65
|
|
66 /*
|
|
67 Destructor
|
|
68 */
|
|
69 ChromSweep::~ChromSweep(void) {
|
|
70 }
|
|
71
|
|
72
|
|
73 void ChromSweep::ScanCache() {
|
|
74 if (_qy_status != BED_INVALID) {
|
|
75 vector<BED>::iterator c = _cache.begin();
|
|
76 while (c != _cache.end())
|
|
77 {
|
|
78 if ((_curr_qy.chrom == c->chrom) && !(after(_curr_qy, *c))) {
|
|
79 if (IsValidHit(_curr_qy, *c)) {
|
|
80 _hits.push_back(*c);
|
|
81 }
|
|
82 ++c;
|
|
83 }
|
|
84 else {
|
|
85 c = _cache.erase(c);
|
|
86 }
|
|
87 }
|
|
88 }
|
|
89 }
|
|
90
|
|
91
|
|
92 bool ChromSweep::ChromChange()
|
|
93 {
|
|
94 // the files are on the same chrom
|
|
95 if ((_curr_qy.chrom == _curr_db.chrom) || (_db_status == BED_INVALID) || (_qy_status == BED_INVALID)) {
|
|
96 return false;
|
|
97 }
|
|
98 // the query is ahead of the database. fast-forward the database to catch-up.
|
|
99 else if (_curr_qy.chrom > _curr_db.chrom) {
|
|
100 while (!_bedB->Empty() && _curr_db.chrom < _curr_qy.chrom)
|
|
101 {
|
|
102 _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
|
|
103 }
|
|
104 _cache.clear();
|
|
105 return false;
|
|
106 }
|
|
107 // the database is ahead of the query.
|
|
108 else {
|
|
109 // 1. scan the cache for remaining hits on the query's current chrom.
|
|
110 if (_curr_qy.chrom == _curr_chrom)
|
|
111 {
|
|
112 ScanCache();
|
|
113 _results.push(make_pair(_curr_qy, _hits));
|
|
114 _hits.clear();
|
|
115 }
|
|
116 // 2. fast-forward until we catch up and report 0 hits until we do.
|
|
117 else if (_curr_qy.chrom < _curr_db.chrom)
|
|
118 {
|
|
119 _results.push(make_pair(_curr_qy, _no_hits));
|
|
120 _cache.clear();
|
|
121 }
|
|
122 _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
|
|
123 _curr_chrom = _curr_qy.chrom;
|
|
124 return true;
|
|
125 }
|
|
126 }
|
|
127
|
|
128 bool ChromSweep::IsValidHit(const BED &query, const BED &db) {
|
|
129 // do we have an overlap in the DB?
|
|
130 if (overlaps(query.start, query.end, db.start, db.end) > 0) {
|
|
131 // Now test for necessary strandedness.
|
|
132 bool strands_are_same = (query.strand == db.strand);
|
|
133 if ( (_sameStrand == false && _diffStrand == false)
|
|
134 ||
|
|
135 (_sameStrand == true && strands_are_same == true)
|
|
136 ||
|
|
137 (_diffStrand == true && strands_are_same == false)
|
|
138 )
|
|
139 {
|
|
140 return true;
|
|
141 }
|
|
142 }
|
|
143 return false;
|
|
144 }
|
|
145
|
|
146
|
|
147 bool ChromSweep::Next(pair<BED, vector<BED> > &next) {
|
|
148 if (!_bedA->Empty()) {
|
|
149 // have we changed chromosomes?
|
|
150 if (ChromChange() == false) {
|
|
151 // scan the database cache for hits
|
|
152 ScanCache();
|
|
153 // advance the db until we are ahead of the query. update hits and cache as necessary
|
|
154 while (!_bedB->Empty() && _curr_qy.chrom == _curr_db.chrom && !(after(_curr_db, _curr_qy)))
|
|
155 {
|
|
156 if (IsValidHit(_curr_qy, _curr_db)) {
|
|
157 _hits.push_back(_curr_db);
|
|
158 }
|
|
159 _cache.push_back(_curr_db);
|
|
160 _db_status = _bedB->GetNextBed(_curr_db, _db_lineNum);
|
|
161 }
|
|
162 // add the hits for this query to the pump
|
|
163 _results.push(make_pair(_curr_qy, _hits));
|
|
164 // reset for the next query
|
|
165 _hits.clear();
|
|
166 _curr_qy = _nullBed;
|
|
167 _qy_status = _bedA->GetNextBed(_curr_qy, _qy_lineNum);
|
|
168 _curr_chrom = _curr_qy.chrom;
|
|
169 }
|
|
170 }
|
|
171 // report the next set if hits if there are still overlaps in the pump
|
|
172 if (!_results.empty()) {
|
|
173 next = _results.front();
|
|
174 _results.pop();
|
|
175 return true;
|
|
176 }
|
|
177 // otherwise, the party is over.
|
|
178 else {return false;}
|
|
179 }
|
|
180
|