0
|
1 /*****************************************************************************
|
|
2 pairToPair.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "pairToPair.h"
|
|
14
|
|
15
|
|
16 /*
|
|
17 Constructor
|
|
18 */
|
|
19 PairToPair::PairToPair(string &bedAFilePE, string &bedBFilePE, float &overlapFraction,
|
|
20 string searchType, bool ignoreStrand, bool reqDiffNames, int slop, bool strandedSlop) {
|
|
21
|
|
22 _bedAFilePE = bedAFilePE;
|
|
23 _bedBFilePE = bedBFilePE;
|
|
24 _overlapFraction = overlapFraction;
|
|
25 _searchType = searchType;
|
|
26 _ignoreStrand = ignoreStrand;
|
|
27 _reqDiffNames = reqDiffNames;
|
|
28 _slop = slop;
|
|
29 _strandedSlop = strandedSlop;
|
|
30
|
|
31 _bedA = new BedFilePE(bedAFilePE);
|
|
32 _bedB = new BedFilePE(bedBFilePE);
|
|
33
|
|
34 IntersectPairs();
|
|
35 }
|
|
36
|
|
37
|
|
38 /*
|
|
39 Destructor
|
|
40 */
|
|
41 PairToPair::~PairToPair(void) {
|
|
42 }
|
|
43
|
|
44
|
|
45
|
|
46 void PairToPair::IntersectPairs() {
|
|
47
|
|
48 // load the "B" bed file into a map so
|
|
49 // that we can easily compare "A" to it for overlaps
|
|
50 _bedB->loadBedPEFileIntoMap();
|
|
51
|
|
52 int lineNum = 0;
|
|
53 BedLineStatus bedStatus;
|
|
54 BEDPE a, nullBedPE;
|
|
55
|
|
56 _bedA->Open();
|
|
57 while ((bedStatus = _bedA->GetNextBedPE(a, lineNum)) != BED_INVALID) {
|
|
58 if (bedStatus == BED_VALID) {
|
|
59 // identify overlaps b/w the pairs
|
|
60 FindOverlaps(a);
|
|
61 a = nullBedPE;
|
|
62 }
|
|
63 }
|
|
64 _bedA->Close();
|
|
65 }
|
|
66 // END IntersectPE
|
|
67
|
|
68
|
|
69
|
|
70 void PairToPair::FindOverlaps(const BEDPE &a) {
|
|
71 //
|
|
72 vector<MATE> hitsA1B1, hitsA1B2, hitsA2B1, hitsA2B2;
|
|
73
|
|
74 // add the appropriate slop to the starts and ends
|
|
75 int start1 = a.start1;
|
|
76 int end1 = a.end1;
|
|
77 int start2 = a.start2;
|
|
78 int end2 = a.end2;
|
|
79
|
|
80 if (_strandedSlop == true) {
|
|
81 if (a.strand1 == "+")
|
|
82 end1 += _slop;
|
|
83 else
|
|
84 start1 -= _slop;
|
|
85 if (a.strand2 == "+")
|
|
86 end2 += _slop;
|
|
87 else
|
|
88 start2 -= _slop;
|
|
89 }
|
|
90 else {
|
|
91 (start1 - _slop) >= 0 ? start1 -= _slop : start1 = 0;
|
|
92 (start2 - _slop) >= 0 ? start2 -= _slop : start2 = 0;
|
|
93 end1 += _slop;
|
|
94 end2 += _slop;
|
|
95 }
|
|
96
|
|
97 // Find the _potential_ hits between each end of A and B
|
|
98 _bedB->FindOverlapsPerBin(1, a.chrom1, start1, end1, a.name, a.strand1, hitsA1B1, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A1 & B1
|
|
99 _bedB->FindOverlapsPerBin(1, a.chrom2, start2, end2, a.name, a.strand2, hitsA2B1, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A2 & B1
|
|
100 _bedB->FindOverlapsPerBin(2, a.chrom1, start1, end1, a.name, a.strand1, hitsA1B2, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A1 & B2
|
|
101 _bedB->FindOverlapsPerBin(2, a.chrom2, start2, end2, a.name, a.strand2, hitsA2B2, _overlapFraction, !(_ignoreStrand), _reqDiffNames); // hits b/w A2 & B2
|
|
102
|
|
103 unsigned int matchCount1 = (hitsA1B1.size() + hitsA2B2.size());
|
|
104 unsigned int matchCount2 = (hitsA2B1.size() + hitsA1B2.size());
|
|
105
|
|
106
|
|
107 // report the fact that no hits were found iff _searchType is neither.
|
|
108 if ((matchCount1 == 0) && (matchCount2 == 0) && (_searchType == "neither")) {
|
|
109 _bedA->reportBedPENewLine(a);
|
|
110 }
|
|
111 else if (_searchType == "both") {
|
|
112 bool found1 = false;
|
|
113 bool found2 = false;
|
|
114 if ((hitsA1B1.size() > 0) || (hitsA2B2.size() > 0))
|
|
115 found1 = FindHitsOnBothEnds(a, hitsA1B1, hitsA2B2);
|
|
116 if ((hitsA2B1.size() > 0) || (hitsA1B2.size() > 0))
|
|
117 found2 = FindHitsOnBothEnds(a, hitsA2B1, hitsA1B2);
|
|
118 }
|
|
119 else if (_searchType == "notboth") {
|
|
120 bool found1 = false;
|
|
121 bool found2 = false;
|
|
122 if ((hitsA1B1.size() > 0) || (hitsA2B2.size() > 0))
|
|
123 found1 = FindHitsOnBothEnds(a, hitsA1B1, hitsA2B2);
|
|
124 if ((hitsA2B1.size() > 0) || (hitsA1B2.size() > 0))
|
|
125 found2 = FindHitsOnBothEnds(a, hitsA2B1, hitsA1B2);
|
|
126 if (found1 == false && found2 == false)
|
|
127 _bedA->reportBedPENewLine(a);
|
|
128 }
|
|
129 else if (_searchType == "either") {
|
|
130 FindHitsOnEitherEnd(a, hitsA1B1, hitsA2B2);
|
|
131 FindHitsOnEitherEnd(a, hitsA2B1, hitsA1B2);
|
|
132 }
|
|
133 }
|
|
134
|
|
135
|
|
136 bool PairToPair::FindHitsOnBothEnds(const BEDPE &a, const vector<MATE> &qualityHitsEnd1,
|
|
137 const vector<MATE> &qualityHitsEnd2) {
|
|
138
|
|
139 map<unsigned int, vector<MATE>, less<int> > hitsMap;
|
|
140
|
|
141 for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
|
|
142 hitsMap[h->lineNum].push_back(*h);
|
|
143 }
|
|
144 for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
|
|
145 hitsMap[h->lineNum].push_back(*h);
|
|
146 }
|
|
147
|
|
148
|
|
149 bool bothFound = false;
|
|
150 for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
|
|
151
|
|
152 // hits on both sides
|
|
153 if (m->second.size() >= 2) {
|
|
154 bothFound = true;
|
|
155 MATE b1 = m->second[0];
|
|
156 MATE b2 = m->second[1];
|
|
157
|
|
158 if (_searchType == "both") {
|
|
159 _bedA->reportBedPETab(a);
|
|
160 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
|
|
161 b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end,
|
|
162 b1.bed.name.c_str(), b1.bed.score.c_str(),
|
|
163 b1.bed.strand.c_str(), b2.bed.strand.c_str());
|
|
164 for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
|
|
165 printf("\t%s", b1.bed.otherFields[i].c_str());
|
|
166 printf("\n");
|
|
167 }
|
|
168 }
|
|
169 }
|
|
170 return bothFound;
|
|
171 }
|
|
172
|
|
173
|
|
174 void PairToPair::FindHitsOnEitherEnd(const BEDPE &a, const vector<MATE> &qualityHitsEnd1,
|
|
175 const vector<MATE> &qualityHitsEnd2) {
|
|
176
|
|
177 map<unsigned int, vector<MATE>, less<int> > hitsMap;
|
|
178
|
|
179 for (vector<MATE>::const_iterator h = qualityHitsEnd1.begin(); h != qualityHitsEnd1.end(); ++h) {
|
|
180 hitsMap[h->lineNum].push_back(*h);
|
|
181 }
|
|
182 for (vector<MATE>::const_iterator h = qualityHitsEnd2.begin(); h != qualityHitsEnd2.end(); ++h) {
|
|
183 hitsMap[h->lineNum].push_back(*h);
|
|
184 }
|
|
185
|
|
186 for (map<unsigned int, vector<MATE>, less<unsigned int> >::iterator m = hitsMap.begin(); m != hitsMap.end(); ++m) {
|
|
187 if (m->second.size() >= 1) {
|
|
188
|
|
189 if ((m->second.size()) == 2) {
|
|
190 MATE b1 = m->second[0];
|
|
191 MATE b2 = m->second[1];
|
|
192
|
|
193 _bedA->reportBedPETab(a);
|
|
194 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
|
|
195 b2.bed.chrom.c_str(), b2.bed.start, b2.bed.end,
|
|
196 b1.bed.name.c_str(), b1.bed.score.c_str(),
|
|
197 b1.bed.strand.c_str(), b2.bed.strand.c_str());
|
|
198 for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
|
|
199 printf("\t%s", b1.bed.otherFields[i].c_str());
|
|
200 printf("\n");
|
|
201 }
|
|
202 else {
|
|
203 MATE b1 = m->second[0];
|
|
204
|
|
205 _bedA->reportBedPETab(a);
|
|
206 printf("%s\t%d\t%d\t%s\t%d\t%d\t%s\t%s\t%s\t%s", b1.bed.chrom.c_str(), b1.bed.start, b1.bed.end,
|
|
207 b1.mate->bed.chrom.c_str(), b1.mate->bed.start, b1.mate->bed.end,
|
|
208 b1.bed.name.c_str(), b1.bed.score.c_str(),
|
|
209 b1.bed.strand.c_str(), b1.mate->bed.strand.c_str());
|
|
210 for (size_t i = 0; i < b1.bed.otherFields.size(); ++i)
|
|
211 printf("\t%s", b1.bed.otherFields[i].c_str());
|
|
212 printf("\n");
|
|
213 }
|
|
214 }
|
|
215 }
|
|
216 }
|