0
|
1 /*****************************************************************************
|
|
2 shuffleBed.cpp
|
|
3
|
|
4 (c) 2009 - Aaron Quinlan
|
|
5 Hall Laboratory
|
|
6 Department of Biochemistry and Molecular Genetics
|
|
7 University of Virginia
|
|
8 aaronquinlan@gmail.com
|
|
9
|
|
10 Licenced under the GNU General Public License 2.0 license.
|
|
11 ******************************************************************************/
|
|
12 #include "lineFileUtilities.h"
|
|
13 #include "shuffleBed.h"
|
|
14
|
|
15
|
|
16 BedShuffle::BedShuffle(string &bedFile, string &genomeFile, string &excludeFile, string &includeFile,
|
|
17 bool haveSeed, bool haveExclude, bool haveInclude, bool sameChrom,
|
|
18 float overlapFraction, int seed) {
|
|
19
|
|
20 _bedFile = bedFile;
|
|
21 _genomeFile = genomeFile;
|
|
22 _excludeFile = excludeFile;
|
|
23 _includeFile = includeFile;
|
|
24 _sameChrom = sameChrom;
|
|
25 _haveExclude = haveExclude;
|
|
26 _haveInclude = haveInclude;
|
|
27 _overlapFraction = overlapFraction;
|
|
28 _haveSeed = haveSeed;
|
|
29
|
|
30
|
|
31 // use the supplied seed for the random
|
|
32 // number generation if given. else,
|
|
33 // roll our own.
|
|
34 if (_haveSeed) {
|
|
35 _seed = seed;
|
|
36 srand(seed);
|
|
37 }
|
|
38 else {
|
|
39 // thanks to Rob Long for the tip.
|
|
40 _seed = (unsigned)time(0)+(unsigned)getpid();
|
|
41 srand(_seed);
|
|
42 }
|
|
43
|
|
44 _bed = new BedFile(bedFile);
|
|
45 _genome = new GenomeFile(genomeFile);
|
|
46 _chroms = _genome->getChromList();
|
|
47 _numChroms = _genome->getNumberOfChroms();
|
|
48
|
|
49 if (_haveExclude) {
|
|
50 _exclude = new BedFile(excludeFile);
|
|
51 _exclude->loadBedFileIntoMap();
|
|
52 }
|
|
53
|
|
54 if (_haveInclude) {
|
|
55 _include = new BedFile(includeFile);
|
|
56 _include->loadBedFileIntoMapNoBin();
|
|
57
|
|
58 _numIncludeChroms = 0;
|
|
59 masterBedMapNoBin::const_iterator it = _include->bedMapNoBin.begin();
|
|
60 masterBedMapNoBin::const_iterator itEnd = _include->bedMapNoBin.end();
|
|
61 for(; it != itEnd; ++it) {
|
|
62 _includeChroms.push_back(it->first);
|
|
63 _numIncludeChroms++;
|
|
64 }
|
|
65 }
|
|
66
|
|
67 if (_haveExclude == true && _haveInclude == false)
|
|
68 ShuffleWithExclusions();
|
|
69 else if (_haveExclude == false && _haveInclude == true)
|
|
70 ShuffleWithInclusions();
|
|
71 else
|
|
72 Shuffle();
|
|
73 }
|
|
74
|
|
75
|
|
76 BedShuffle::~BedShuffle(void) {
|
|
77
|
|
78 }
|
|
79
|
|
80
|
|
81 void BedShuffle::Shuffle() {
|
|
82
|
|
83 int lineNum = 0;
|
|
84 BED bedEntry, nullBed; // used to store the current BED line from the BED file.
|
|
85 BedLineStatus bedStatus;
|
|
86
|
|
87 _bed->Open();
|
|
88 while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
|
|
89 if (bedStatus == BED_VALID) {
|
|
90 ChooseLocus(bedEntry);
|
|
91 _bed->reportBedNewLine(bedEntry);
|
|
92 bedEntry = nullBed;
|
|
93 }
|
|
94 }
|
|
95 _bed->Close();
|
|
96 }
|
|
97
|
|
98
|
|
99
|
|
100 void BedShuffle::ShuffleWithExclusions() {
|
|
101
|
|
102 int lineNum = 0;
|
|
103 BED bedEntry, nullBed; // used to store the current BED line from the BED file.
|
|
104 BedLineStatus bedStatus;
|
|
105
|
|
106 _bed->Open();
|
|
107 while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
|
|
108 if (bedStatus == BED_VALID) {
|
|
109 // keep looking as long as the chosen
|
|
110 // locus happens to overlap with regions
|
|
111 // that the user wishes to exclude.
|
|
112 int tries = 0;
|
|
113 bool haveOverlap = false;
|
|
114 do
|
|
115 {
|
|
116 // choose a new locus
|
|
117 ChooseLocus(bedEntry);
|
|
118 haveOverlap = _exclude->FindOneOrMoreOverlapsPerBin(bedEntry.chrom, bedEntry.start, bedEntry.end,
|
|
119 bedEntry.strand, false, _overlapFraction);
|
|
120 tries++;
|
|
121 } while ((haveOverlap == true) && (tries <= MAX_TRIES));
|
|
122
|
|
123
|
|
124 if (tries > MAX_TRIES) {
|
|
125 cerr << "Error, line " << lineNum << ": tried " << MAX_TRIES << " potential loci for entry, but could not avoid excluded regions. Ignoring entry and moving on." << endl;
|
|
126 }
|
|
127 else {
|
|
128 _bed->reportBedNewLine(bedEntry);
|
|
129 }
|
|
130 }
|
|
131 bedEntry = nullBed;
|
|
132 }
|
|
133 _bed->Close();
|
|
134 }
|
|
135
|
|
136
|
|
137 void BedShuffle::ShuffleWithInclusions() {
|
|
138
|
|
139 int lineNum = 0;
|
|
140 BED bedEntry, nullBed; // used to store the current BED line from the BED file.
|
|
141 BedLineStatus bedStatus;
|
|
142
|
|
143 _bed->Open();
|
|
144 while ((bedStatus = _bed->GetNextBed(bedEntry, lineNum)) != BED_INVALID) {
|
|
145 if (bedStatus == BED_VALID) {
|
|
146 // choose a new locus
|
|
147 ChooseLocusFromInclusionFile(bedEntry);
|
|
148 _bed->reportBedNewLine(bedEntry);
|
|
149 }
|
|
150 bedEntry = nullBed;
|
|
151 }
|
|
152 _bed->Close();
|
|
153 }
|
|
154
|
|
155
|
|
156 void BedShuffle::ChooseLocus(BED &bedEntry) {
|
|
157
|
|
158 string chrom = bedEntry.chrom;
|
|
159 CHRPOS start = bedEntry.start;
|
|
160 CHRPOS end = bedEntry.end;
|
|
161 CHRPOS length = end - start;
|
|
162
|
|
163 string randomChrom;
|
|
164 CHRPOS randomStart;
|
|
165 CHRPOS chromSize;
|
|
166
|
|
167 if (_sameChrom == false) {
|
|
168 randomChrom = _chroms[rand() % _numChroms];
|
|
169 chromSize = _genome->getChromSize(randomChrom);
|
|
170 randomStart = rand() % chromSize;
|
|
171 bedEntry.chrom = randomChrom;
|
|
172 bedEntry.start = randomStart;
|
|
173 bedEntry.end = randomStart + length;
|
|
174 }
|
|
175 else {
|
|
176 chromSize = _genome->getChromSize(chrom);
|
|
177 randomStart = rand() % chromSize;
|
|
178 bedEntry.start = randomStart;
|
|
179 bedEntry.end = randomStart + length;
|
|
180 }
|
|
181
|
|
182 // ensure that the chosen location doesn't go past
|
|
183 // the length of the chromosome. if so, keep looking
|
|
184 // for a new spot.
|
|
185 while (bedEntry.end > chromSize) {
|
|
186 if (_sameChrom == false) {
|
|
187 randomChrom = _chroms[rand() % _numChroms];
|
|
188 chromSize = _genome->getChromSize(randomChrom);
|
|
189 randomStart = rand() % chromSize;
|
|
190 bedEntry.chrom = randomChrom;
|
|
191 bedEntry.start = randomStart;
|
|
192 bedEntry.end = randomStart + length;
|
|
193 }
|
|
194 else {
|
|
195 chromSize = _genome->getChromSize(chrom);
|
|
196 randomStart = rand() % chromSize;
|
|
197 bedEntry.start = randomStart;
|
|
198 bedEntry.end = randomStart + length;
|
|
199 }
|
|
200 }
|
|
201 }
|
|
202
|
|
203
|
|
204 void BedShuffle::ChooseLocusFromInclusionFile(BED &bedEntry) {
|
|
205
|
|
206 string chrom = bedEntry.chrom;
|
|
207 CHRPOS length = bedEntry.end - bedEntry.start;
|
|
208
|
|
209 string randomChrom;
|
|
210 CHRPOS randomStart;
|
|
211 BED includeInterval;
|
|
212
|
|
213 if (_sameChrom == false) {
|
|
214
|
|
215 // grab a random chromosome from the inclusion file.
|
|
216 randomChrom = _includeChroms[rand() % _numIncludeChroms];
|
|
217 // get the number of inclusion intervals for that chrom
|
|
218 size_t size = _include->bedMapNoBin[randomChrom].size();
|
|
219 // grab a random interval on the chosen chromosome.
|
|
220 size_t interval = rand() % size;
|
|
221 // retreive a ranom -incl interval on the selected chrom
|
|
222 includeInterval = _include->bedMapNoBin[randomChrom][interval];
|
|
223
|
|
224 bedEntry.chrom = randomChrom;
|
|
225 }
|
|
226 else {
|
|
227 // get the number of inclusion intervals for the original chrom
|
|
228 size_t size = _include->bedMapNoBin[chrom].size();
|
|
229 // grab a random interval on the chosen chromosome.
|
|
230 includeInterval = _include->bedMapNoBin[chrom][rand() % size];
|
|
231 }
|
|
232
|
|
233 randomStart = includeInterval.start + rand() % (includeInterval.size());
|
|
234 bedEntry.start = randomStart;
|
|
235 bedEntry.end = randomStart + length;
|
|
236
|
|
237 // use recursion to ensure that the chosen location
|
|
238 // doesn't go past the end of the chrom
|
|
239 if (bedEntry.end > ((size_t) _genome->getChromSize(chrom))) {
|
|
240 //bedEntry.end = _genome->getChromSize(chrom);
|
|
241 ChooseLocusFromInclusionFile(bedEntry);
|
|
242 }
|
|
243 }
|
|
244
|