Mercurial > repos > greg > genetrack
comparison genetrack_util.py @ 0:0368815ae4d5 draft
Uploaded
author | greg |
---|---|
date | Tue, 17 Nov 2015 14:19:45 -0500 |
parents | |
children | a952b6740fb9 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:0368815ae4d5 |
---|---|
1 import bisect | |
2 import math | |
3 import numpy | |
4 import re | |
5 import subprocess | |
6 import sys | |
7 import tempfile | |
8 | |
9 ROMAN = ['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', | |
10 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX', | |
11 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX', | |
12 'XXX'] | |
13 | |
14 | |
15 def noop(data): | |
16 return data | |
17 | |
18 | |
19 def zeropad_to_numeric(data): | |
20 return re.sub(r'chr0(\d)', r'chr\1', data) | |
21 | |
22 | |
23 def numeric_to_zeropad(data): | |
24 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data) | |
25 | |
26 | |
27 def roman_to_numeric(data): | |
28 def convert(match): | |
29 """ | |
30 Converts a single roman numeral to a number | |
31 """ | |
32 numeral = match.group(1) | |
33 numeral = numeral.upper() | |
34 if numeral not in ROMAN: | |
35 # Unable to convert detected Roman numeral | |
36 return match.group(0) | |
37 return 'chr'+str(ROMAN.index(numeral))+(match.group(2) or '') | |
38 r = re.compile('chr([IVX]+)([^IVX]|$)', flags=re.IGNORECASE) | |
39 data = r.sub(convert, data) | |
40 return data | |
41 | |
42 | |
43 def numeric_to_roman(data): | |
44 def convert(match): | |
45 """ | |
46 Converts a number to a roman numeral | |
47 """ | |
48 number = int(match.group(1)) | |
49 if number >= len(ROMAN): | |
50 # Number is out of range to convert to a Roman numeral | |
51 return match.group(0) | |
52 return 'chr'+ROMAN[number]+(match.group(2) or '') | |
53 r = re.compile('chr(\d+)([^\d]|$)') | |
54 data = r.sub(convert, data) | |
55 return data | |
56 | |
57 FORMATS = ['zeropad', 'numeric', 'roman'] | |
58 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'roman': roman_to_numeric, 'numeric': noop} | |
59 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'roman': numeric_to_roman, 'numeric': noop} | |
60 | |
61 | |
62 def conversion_functions(in_fmt, out_fmt): | |
63 """ | |
64 Returns the proper list of functions to apply to perform a conversion | |
65 """ | |
66 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]] | |
67 | |
68 | |
69 def autodetect_format(data): | |
70 if re.search('chr0\d', data): | |
71 fmt = 'zeropad' | |
72 elif re.search('chr[IVXivx]', data): | |
73 fmt = 'roman' | |
74 else: | |
75 fmt = 'numeric' | |
76 return fmt | |
77 | |
78 | |
79 def convert_data(data, in_fmt, out_fmt): | |
80 if in_fmt == 'autodetect': | |
81 in_fmt = autodetect_format(data) | |
82 for fn in conversion_functions(in_fmt, out_fmt): | |
83 data = fn(data) | |
84 return data | |
85 | |
86 | |
87 class ChromosomeManager(object): | |
88 """ | |
89 Manages a CSV reader of an index file to only load one chrom at a time | |
90 """ | |
91 | |
92 def __init__(self, reader): | |
93 self.done = False | |
94 self.reader = reader | |
95 self.processed_chromosomes = [] | |
96 self.current_index = 0 | |
97 self.next_valid() | |
98 | |
99 def next(self): | |
100 self.line = self.reader.next() | |
101 | |
102 def is_valid(self, line): | |
103 if len(line) not in [4, 5, 9]: | |
104 return False | |
105 try: | |
106 [int(i) for i in line[1:]] | |
107 self.format = 'ssccidx' | |
108 return True | |
109 except ValueError: | |
110 try: | |
111 if len(line) < 6: | |
112 return False | |
113 [int(line[4]), int(line[5])] | |
114 self.format = 'gff' | |
115 return True | |
116 except ValueError: | |
117 return False | |
118 | |
119 def next_valid(self): | |
120 """ | |
121 Advance to the next valid line in the reader | |
122 """ | |
123 self.line = self.reader.next() | |
124 s = 0 | |
125 while not self.is_valid(self.line): | |
126 self.line = self.reader.next() | |
127 s += 1 | |
128 if s > 0: | |
129 # Skip initial line(s) of file | |
130 pass | |
131 | |
132 def parse_line(self, line): | |
133 if self.format == 'ssccidx': | |
134 return [int(line[1]), int(line[2]), int(line[3])] | |
135 else: | |
136 return [int(line[3]), line[6], line[5]] | |
137 | |
138 def chromosome_name(self): | |
139 """ | |
140 Return the name of the chromosome about to be loaded | |
141 """ | |
142 return self.line[0] | |
143 | |
144 def load_chromosome(self, collect_data=True): | |
145 """ | |
146 Load the current chromosome into an array and return it | |
147 """ | |
148 cname = self.chromosome_name() | |
149 if cname in self.processed_chromosomes: | |
150 stop_err('File is not grouped by chromosome') | |
151 self.data = [] | |
152 while self.line[0] == cname: | |
153 if collect_data: | |
154 read = self.parse_line(self.line) | |
155 if read[0] < self.current_index: | |
156 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index) | |
157 stop_err(msg) | |
158 self.current_index = read[0] | |
159 self.add_read(read) | |
160 try: | |
161 self.next() | |
162 except StopIteration: | |
163 self.done = True | |
164 break | |
165 self.processed_chromosomes.append(cname) | |
166 self.current_index = 0 | |
167 data = self.data | |
168 # Don't retain reference anymore to save memory | |
169 del self.data | |
170 return data | |
171 | |
172 def add_read(self, read): | |
173 if self.format == 'ssccidx': | |
174 self.data.append(read) | |
175 else: | |
176 index, strand, value = read | |
177 if value == '' or value == '.': | |
178 value = 1 | |
179 else: | |
180 value = int(value) | |
181 if not self.data: | |
182 self.data.append([index, 0, 0]) | |
183 current_read = self.data[-1] | |
184 if self.data[-1][0] == index: | |
185 current_read = self.data[-1] | |
186 elif self.data[-1][0] < index: | |
187 self.data.append([index, 0, 0]) | |
188 current_read = self.data[-1] | |
189 else: | |
190 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index) | |
191 stop_err(msg) | |
192 if strand == '+': | |
193 current_read[1] += value | |
194 elif strand == '-': | |
195 current_read[2] += value | |
196 else: | |
197 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index) | |
198 stop_err(msg) | |
199 | |
200 def skip_chromosome(self): | |
201 """ | |
202 Skip the current chromosome, discarding data | |
203 """ | |
204 self.load_chromosome(collect_data=False) | |
205 | |
206 | |
207 class Peak(object): | |
208 def __init__(self, index, pos_width, neg_width): | |
209 self.index = index | |
210 self.start = index - neg_width | |
211 self.end = index + pos_width | |
212 self.value = 0 | |
213 self.deleted = False | |
214 self.safe = False | |
215 | |
216 def __repr__(self): | |
217 return '[%d] %d' % (self.index, self.value) | |
218 | |
219 | |
220 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
221 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
222 | |
223 | |
224 def gff_attrs(d): | |
225 if not d: | |
226 return '.' | |
227 return ';'.join('%s=%s' % item for item in d.items()) | |
228 | |
229 | |
230 def stop_err(msg): | |
231 sys.stderr.write(msg) | |
232 sys.exit(1) | |
233 | |
234 | |
235 def is_int(i): | |
236 try: | |
237 int(i) | |
238 return True | |
239 except ValueError: | |
240 return False | |
241 | |
242 | |
243 def make_keys(data): | |
244 return [read[0] for read in data] | |
245 | |
246 | |
247 def make_peak_keys(peaks): | |
248 return [peak.index for peak in peaks] | |
249 | |
250 | |
251 def get_window(data, start, end, keys): | |
252 """ | |
253 Returns all reads from the data set with index between the two indexes | |
254 """ | |
255 start_index = bisect.bisect_left(keys, start) | |
256 end_index = bisect.bisect_right(keys, end) | |
257 return data[start_index:end_index] | |
258 | |
259 | |
260 def get_index(value, keys): | |
261 """ | |
262 Returns the index of the value in the keys using bisect | |
263 """ | |
264 return bisect.bisect_left(keys, value) | |
265 | |
266 | |
267 def get_range(data): | |
268 lo = min([item[0] for item in data]) | |
269 hi = max([item[0] for item in data]) | |
270 return lo, hi | |
271 | |
272 | |
273 def get_chunks(lo, hi, size, overlap=500): | |
274 """ | |
275 Divides a range into chunks of maximum size size. Returns a list of | |
276 2-tuples (slice_range, process_range), each a 2-tuple (start, end). | |
277 process_range has zero overlap and should be given to process_chromosome | |
278 as-is, and slice_range is overlapped and should be used to slice the | |
279 data (using get_window) to be given to process_chromosome. | |
280 """ | |
281 chunks = [] | |
282 for start_index in range(lo, hi, size): | |
283 process_start = start_index | |
284 # Don't go over upper bound | |
285 process_end = min(start_index + size, hi) | |
286 # Don't go under lower bound | |
287 slice_start = max(process_start - overlap, lo) | |
288 # Don't go over upper bound | |
289 slice_end = min(process_end + overlap, hi) | |
290 chunks.append(((slice_start, slice_end), (process_start, process_end))) | |
291 return chunks | |
292 | |
293 | |
294 def allocate_array(data, width): | |
295 """ | |
296 Allocates a new array with the dimensions required to fit all reads in | |
297 the argument. The new array is totally empty. Returns the array and the | |
298 shift (number to add to a read index to get the position in the array it | |
299 should be at). | |
300 """ | |
301 lo, hi = get_range(data) | |
302 rng = hi - lo | |
303 shift = width - lo | |
304 return numpy.zeros(rng+width*2, numpy.float), shift | |
305 | |
306 | |
307 def normal_array(width, sigma, normalize=True): | |
308 """ | |
309 Returns an array of the normal distribution of the specified width | |
310 """ | |
311 sigma2 = float(sigma)**2 | |
312 | |
313 def normal_func(x): | |
314 return math.exp(-x * x / (2 * sigma2)) | |
315 | |
316 # width is the half of the distribution | |
317 values = map(normal_func, range(-width, width)) | |
318 values = numpy.array(values, numpy.float) | |
319 # normalization | |
320 if normalize: | |
321 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values | |
322 return values | |
323 | |
324 | |
325 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion): | |
326 peaks = [] | |
327 | |
328 def find_peaks(): | |
329 # Go through the array and call each peak | |
330 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1)) | |
331 indexes = numpy.where(results) | |
332 for index in indexes[0]: | |
333 pos = down_width or exclusion // 2 | |
334 neg = up_width or exclusion // 2 | |
335 # Reverse strand | |
336 if direction == 2: | |
337 # Swap positive and negative widths | |
338 pos, neg = neg, pos | |
339 peaks.append(Peak(int(index)-shift, pos, neg)) | |
340 find_peaks() | |
341 | |
342 def calculate_reads(): | |
343 # Calculate the number of reads in each peak | |
344 for peak in peaks: | |
345 reads = get_window(data, peak.start, peak.end, keys) | |
346 peak.value = sum([read[direction] for read in reads]) | |
347 # Flat list of indexes with frequency | |
348 indexes = [r for read in reads for r in [read[0]] * read[direction]] | |
349 peak.stddev = numpy.std(indexes) | |
350 calculate_reads() | |
351 | |
352 def perform_exclusion(): | |
353 # Process the exclusion zone | |
354 peak_keys = make_peak_keys(peaks) | |
355 peaks_by_value = peaks[:] | |
356 peaks_by_value.sort(key=lambda peak: -peak.value) | |
357 for peak in peaks_by_value: | |
358 peak.safe = True | |
359 window = get_window(peaks, | |
360 peak.index-exclusion//2, | |
361 peak.index+exclusion//2, | |
362 peak_keys) | |
363 for excluded in window: | |
364 if excluded.safe: | |
365 continue | |
366 i = get_index(excluded.index, peak_keys) | |
367 del peak_keys[i] | |
368 del peaks[i] | |
369 perform_exclusion() | |
370 return peaks | |
371 | |
372 | |
373 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter): | |
374 """ | |
375 Process a chromosome. Takes the chromosome name, list of reads, a CSV | |
376 writer to write processes results to, the bounds (2-tuple) to write | |
377 results in, and options. | |
378 """ | |
379 if not data: | |
380 return | |
381 keys = make_keys(data) | |
382 # Create the arrays that hold the sum of the normals | |
383 forward_array, forward_shift = allocate_array(data, width) | |
384 reverse_array, reverse_shift = allocate_array(data, width) | |
385 normal = normal_array(width, sigma) | |
386 | |
387 def populate_array(): | |
388 # Add each read's normal to the array | |
389 for read in data: | |
390 index, forward, reverse = read | |
391 # Add the normals to the appropriate regions | |
392 if forward: | |
393 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward | |
394 if reverse: | |
395 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse | |
396 populate_array() | |
397 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion) | |
398 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion) | |
399 # Convert chromosome name in preparation for writing output | |
400 cname = convert_data(cname, 'zeropad', 'numeric') | |
401 | |
402 def write(cname, strand, peak): | |
403 start = max(peak.start, 1) | |
404 end = peak.end | |
405 value = peak.value | |
406 stddev = peak.stddev | |
407 if value > filter: | |
408 # This version of genetrack outputs only gff files. | |
409 writer.writerow(gff_row(cname=cname, | |
410 source='genetrack', | |
411 start=start, | |
412 end=end, | |
413 score=value, | |
414 strand=strand, | |
415 attrs={'stddev': stddev})) | |
416 | |
417 for peak in forward_peaks: | |
418 if process_bounds[0] < peak.index < process_bounds[1]: | |
419 write(cname, '+', peak) | |
420 for peak in reverse_peaks: | |
421 if process_bounds[0] < peak.index < process_bounds[1]: | |
422 write(cname, '-', peak) | |
423 | |
424 | |
425 def sort_chromosome_reads_by_index( input_path ): | |
426 """ | |
427 Return a gff file with chromosome reads sorted by index. | |
428 """ | |
429 # Will this sort produce different results across platforms? | |
430 output_path = tempfile.NamedTemporaryFile( delete=False ).name | |
431 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path) | |
432 p = subprocess.Popen(command, shell=True) | |
433 p.wait() | |
434 return output_path |