comparison genetrack_util.py @ 0:0368815ae4d5 draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:19:45 -0500
parents
children a952b6740fb9
comparison
equal deleted inserted replaced
-1:000000000000 0:0368815ae4d5
1 import bisect
2 import math
3 import numpy
4 import re
5 import subprocess
6 import sys
7 import tempfile
8
9 ROMAN = ['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
10 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX',
11 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX',
12 'XXX']
13
14
15 def noop(data):
16 return data
17
18
19 def zeropad_to_numeric(data):
20 return re.sub(r'chr0(\d)', r'chr\1', data)
21
22
23 def numeric_to_zeropad(data):
24 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data)
25
26
27 def roman_to_numeric(data):
28 def convert(match):
29 """
30 Converts a single roman numeral to a number
31 """
32 numeral = match.group(1)
33 numeral = numeral.upper()
34 if numeral not in ROMAN:
35 # Unable to convert detected Roman numeral
36 return match.group(0)
37 return 'chr'+str(ROMAN.index(numeral))+(match.group(2) or '')
38 r = re.compile('chr([IVX]+)([^IVX]|$)', flags=re.IGNORECASE)
39 data = r.sub(convert, data)
40 return data
41
42
43 def numeric_to_roman(data):
44 def convert(match):
45 """
46 Converts a number to a roman numeral
47 """
48 number = int(match.group(1))
49 if number >= len(ROMAN):
50 # Number is out of range to convert to a Roman numeral
51 return match.group(0)
52 return 'chr'+ROMAN[number]+(match.group(2) or '')
53 r = re.compile('chr(\d+)([^\d]|$)')
54 data = r.sub(convert, data)
55 return data
56
57 FORMATS = ['zeropad', 'numeric', 'roman']
58 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'roman': roman_to_numeric, 'numeric': noop}
59 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'roman': numeric_to_roman, 'numeric': noop}
60
61
62 def conversion_functions(in_fmt, out_fmt):
63 """
64 Returns the proper list of functions to apply to perform a conversion
65 """
66 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]]
67
68
69 def autodetect_format(data):
70 if re.search('chr0\d', data):
71 fmt = 'zeropad'
72 elif re.search('chr[IVXivx]', data):
73 fmt = 'roman'
74 else:
75 fmt = 'numeric'
76 return fmt
77
78
79 def convert_data(data, in_fmt, out_fmt):
80 if in_fmt == 'autodetect':
81 in_fmt = autodetect_format(data)
82 for fn in conversion_functions(in_fmt, out_fmt):
83 data = fn(data)
84 return data
85
86
87 class ChromosomeManager(object):
88 """
89 Manages a CSV reader of an index file to only load one chrom at a time
90 """
91
92 def __init__(self, reader):
93 self.done = False
94 self.reader = reader
95 self.processed_chromosomes = []
96 self.current_index = 0
97 self.next_valid()
98
99 def next(self):
100 self.line = self.reader.next()
101
102 def is_valid(self, line):
103 if len(line) not in [4, 5, 9]:
104 return False
105 try:
106 [int(i) for i in line[1:]]
107 self.format = 'ssccidx'
108 return True
109 except ValueError:
110 try:
111 if len(line) < 6:
112 return False
113 [int(line[4]), int(line[5])]
114 self.format = 'gff'
115 return True
116 except ValueError:
117 return False
118
119 def next_valid(self):
120 """
121 Advance to the next valid line in the reader
122 """
123 self.line = self.reader.next()
124 s = 0
125 while not self.is_valid(self.line):
126 self.line = self.reader.next()
127 s += 1
128 if s > 0:
129 # Skip initial line(s) of file
130 pass
131
132 def parse_line(self, line):
133 if self.format == 'ssccidx':
134 return [int(line[1]), int(line[2]), int(line[3])]
135 else:
136 return [int(line[3]), line[6], line[5]]
137
138 def chromosome_name(self):
139 """
140 Return the name of the chromosome about to be loaded
141 """
142 return self.line[0]
143
144 def load_chromosome(self, collect_data=True):
145 """
146 Load the current chromosome into an array and return it
147 """
148 cname = self.chromosome_name()
149 if cname in self.processed_chromosomes:
150 stop_err('File is not grouped by chromosome')
151 self.data = []
152 while self.line[0] == cname:
153 if collect_data:
154 read = self.parse_line(self.line)
155 if read[0] < self.current_index:
156 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index)
157 stop_err(msg)
158 self.current_index = read[0]
159 self.add_read(read)
160 try:
161 self.next()
162 except StopIteration:
163 self.done = True
164 break
165 self.processed_chromosomes.append(cname)
166 self.current_index = 0
167 data = self.data
168 # Don't retain reference anymore to save memory
169 del self.data
170 return data
171
172 def add_read(self, read):
173 if self.format == 'ssccidx':
174 self.data.append(read)
175 else:
176 index, strand, value = read
177 if value == '' or value == '.':
178 value = 1
179 else:
180 value = int(value)
181 if not self.data:
182 self.data.append([index, 0, 0])
183 current_read = self.data[-1]
184 if self.data[-1][0] == index:
185 current_read = self.data[-1]
186 elif self.data[-1][0] < index:
187 self.data.append([index, 0, 0])
188 current_read = self.data[-1]
189 else:
190 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index)
191 stop_err(msg)
192 if strand == '+':
193 current_read[1] += value
194 elif strand == '-':
195 current_read[2] += value
196 else:
197 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index)
198 stop_err(msg)
199
200 def skip_chromosome(self):
201 """
202 Skip the current chromosome, discarding data
203 """
204 self.load_chromosome(collect_data=False)
205
206
207 class Peak(object):
208 def __init__(self, index, pos_width, neg_width):
209 self.index = index
210 self.start = index - neg_width
211 self.end = index + pos_width
212 self.value = 0
213 self.deleted = False
214 self.safe = False
215
216 def __repr__(self):
217 return '[%d] %d' % (self.index, self.value)
218
219
220 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
221 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
222
223
224 def gff_attrs(d):
225 if not d:
226 return '.'
227 return ';'.join('%s=%s' % item for item in d.items())
228
229
230 def stop_err(msg):
231 sys.stderr.write(msg)
232 sys.exit(1)
233
234
235 def is_int(i):
236 try:
237 int(i)
238 return True
239 except ValueError:
240 return False
241
242
243 def make_keys(data):
244 return [read[0] for read in data]
245
246
247 def make_peak_keys(peaks):
248 return [peak.index for peak in peaks]
249
250
251 def get_window(data, start, end, keys):
252 """
253 Returns all reads from the data set with index between the two indexes
254 """
255 start_index = bisect.bisect_left(keys, start)
256 end_index = bisect.bisect_right(keys, end)
257 return data[start_index:end_index]
258
259
260 def get_index(value, keys):
261 """
262 Returns the index of the value in the keys using bisect
263 """
264 return bisect.bisect_left(keys, value)
265
266
267 def get_range(data):
268 lo = min([item[0] for item in data])
269 hi = max([item[0] for item in data])
270 return lo, hi
271
272
273 def get_chunks(lo, hi, size, overlap=500):
274 """
275 Divides a range into chunks of maximum size size. Returns a list of
276 2-tuples (slice_range, process_range), each a 2-tuple (start, end).
277 process_range has zero overlap and should be given to process_chromosome
278 as-is, and slice_range is overlapped and should be used to slice the
279 data (using get_window) to be given to process_chromosome.
280 """
281 chunks = []
282 for start_index in range(lo, hi, size):
283 process_start = start_index
284 # Don't go over upper bound
285 process_end = min(start_index + size, hi)
286 # Don't go under lower bound
287 slice_start = max(process_start - overlap, lo)
288 # Don't go over upper bound
289 slice_end = min(process_end + overlap, hi)
290 chunks.append(((slice_start, slice_end), (process_start, process_end)))
291 return chunks
292
293
294 def allocate_array(data, width):
295 """
296 Allocates a new array with the dimensions required to fit all reads in
297 the argument. The new array is totally empty. Returns the array and the
298 shift (number to add to a read index to get the position in the array it
299 should be at).
300 """
301 lo, hi = get_range(data)
302 rng = hi - lo
303 shift = width - lo
304 return numpy.zeros(rng+width*2, numpy.float), shift
305
306
307 def normal_array(width, sigma, normalize=True):
308 """
309 Returns an array of the normal distribution of the specified width
310 """
311 sigma2 = float(sigma)**2
312
313 def normal_func(x):
314 return math.exp(-x * x / (2 * sigma2))
315
316 # width is the half of the distribution
317 values = map(normal_func, range(-width, width))
318 values = numpy.array(values, numpy.float)
319 # normalization
320 if normalize:
321 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values
322 return values
323
324
325 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion):
326 peaks = []
327
328 def find_peaks():
329 # Go through the array and call each peak
330 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1))
331 indexes = numpy.where(results)
332 for index in indexes[0]:
333 pos = down_width or exclusion // 2
334 neg = up_width or exclusion // 2
335 # Reverse strand
336 if direction == 2:
337 # Swap positive and negative widths
338 pos, neg = neg, pos
339 peaks.append(Peak(int(index)-shift, pos, neg))
340 find_peaks()
341
342 def calculate_reads():
343 # Calculate the number of reads in each peak
344 for peak in peaks:
345 reads = get_window(data, peak.start, peak.end, keys)
346 peak.value = sum([read[direction] for read in reads])
347 # Flat list of indexes with frequency
348 indexes = [r for read in reads for r in [read[0]] * read[direction]]
349 peak.stddev = numpy.std(indexes)
350 calculate_reads()
351
352 def perform_exclusion():
353 # Process the exclusion zone
354 peak_keys = make_peak_keys(peaks)
355 peaks_by_value = peaks[:]
356 peaks_by_value.sort(key=lambda peak: -peak.value)
357 for peak in peaks_by_value:
358 peak.safe = True
359 window = get_window(peaks,
360 peak.index-exclusion//2,
361 peak.index+exclusion//2,
362 peak_keys)
363 for excluded in window:
364 if excluded.safe:
365 continue
366 i = get_index(excluded.index, peak_keys)
367 del peak_keys[i]
368 del peaks[i]
369 perform_exclusion()
370 return peaks
371
372
373 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter):
374 """
375 Process a chromosome. Takes the chromosome name, list of reads, a CSV
376 writer to write processes results to, the bounds (2-tuple) to write
377 results in, and options.
378 """
379 if not data:
380 return
381 keys = make_keys(data)
382 # Create the arrays that hold the sum of the normals
383 forward_array, forward_shift = allocate_array(data, width)
384 reverse_array, reverse_shift = allocate_array(data, width)
385 normal = normal_array(width, sigma)
386
387 def populate_array():
388 # Add each read's normal to the array
389 for read in data:
390 index, forward, reverse = read
391 # Add the normals to the appropriate regions
392 if forward:
393 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward
394 if reverse:
395 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse
396 populate_array()
397 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion)
398 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion)
399 # Convert chromosome name in preparation for writing output
400 cname = convert_data(cname, 'zeropad', 'numeric')
401
402 def write(cname, strand, peak):
403 start = max(peak.start, 1)
404 end = peak.end
405 value = peak.value
406 stddev = peak.stddev
407 if value > filter:
408 # This version of genetrack outputs only gff files.
409 writer.writerow(gff_row(cname=cname,
410 source='genetrack',
411 start=start,
412 end=end,
413 score=value,
414 strand=strand,
415 attrs={'stddev': stddev}))
416
417 for peak in forward_peaks:
418 if process_bounds[0] < peak.index < process_bounds[1]:
419 write(cname, '+', peak)
420 for peak in reverse_peaks:
421 if process_bounds[0] < peak.index < process_bounds[1]:
422 write(cname, '-', peak)
423
424
425 def sort_chromosome_reads_by_index( input_path ):
426 """
427 Return a gff file with chromosome reads sorted by index.
428 """
429 # Will this sort produce different results across platforms?
430 output_path = tempfile.NamedTemporaryFile( delete=False ).name
431 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path)
432 p = subprocess.Popen(command, shell=True)
433 p.wait()
434 return output_path