0
|
1 import bisect
|
|
2 import math
|
|
3 import numpy
|
|
4 import re
|
|
5 import subprocess
|
|
6 import sys
|
|
7 import tempfile
|
|
8
|
4
|
9 GFF_EXT = 'gff'
|
|
10 SCIDX_EXT = 'scidx'
|
|
11
|
0
|
12 ROMAN = ['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
|
|
13 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX',
|
|
14 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX',
|
|
15 'XXX']
|
|
16
|
|
17
|
|
18 def noop(data):
|
|
19 return data
|
|
20
|
|
21
|
|
22 def zeropad_to_numeric(data):
|
|
23 return re.sub(r'chr0(\d)', r'chr\1', data)
|
|
24
|
|
25
|
|
26 def numeric_to_zeropad(data):
|
|
27 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data)
|
|
28
|
|
29
|
|
30 def roman_to_numeric(data):
|
|
31 def convert(match):
|
|
32 """
|
|
33 Converts a single roman numeral to a number
|
|
34 """
|
|
35 numeral = match.group(1)
|
|
36 numeral = numeral.upper()
|
|
37 if numeral not in ROMAN:
|
|
38 # Unable to convert detected Roman numeral
|
|
39 return match.group(0)
|
|
40 return 'chr'+str(ROMAN.index(numeral))+(match.group(2) or '')
|
|
41 r = re.compile('chr([IVX]+)([^IVX]|$)', flags=re.IGNORECASE)
|
|
42 data = r.sub(convert, data)
|
|
43 return data
|
|
44
|
|
45
|
|
46 def numeric_to_roman(data):
|
|
47 def convert(match):
|
|
48 """
|
|
49 Converts a number to a roman numeral
|
|
50 """
|
|
51 number = int(match.group(1))
|
|
52 if number >= len(ROMAN):
|
|
53 # Number is out of range to convert to a Roman numeral
|
|
54 return match.group(0)
|
|
55 return 'chr'+ROMAN[number]+(match.group(2) or '')
|
|
56 r = re.compile('chr(\d+)([^\d]|$)')
|
|
57 data = r.sub(convert, data)
|
|
58 return data
|
|
59
|
|
60 FORMATS = ['zeropad', 'numeric', 'roman']
|
|
61 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'roman': roman_to_numeric, 'numeric': noop}
|
|
62 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'roman': numeric_to_roman, 'numeric': noop}
|
|
63
|
|
64
|
|
65 def conversion_functions(in_fmt, out_fmt):
|
|
66 """
|
|
67 Returns the proper list of functions to apply to perform a conversion
|
|
68 """
|
|
69 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]]
|
|
70
|
|
71
|
|
72 def autodetect_format(data):
|
|
73 if re.search('chr0\d', data):
|
|
74 fmt = 'zeropad'
|
|
75 elif re.search('chr[IVXivx]', data):
|
|
76 fmt = 'roman'
|
|
77 else:
|
|
78 fmt = 'numeric'
|
|
79 return fmt
|
|
80
|
|
81
|
|
82 def convert_data(data, in_fmt, out_fmt):
|
|
83 if in_fmt == 'autodetect':
|
|
84 in_fmt = autodetect_format(data)
|
|
85 for fn in conversion_functions(in_fmt, out_fmt):
|
|
86 data = fn(data)
|
|
87 return data
|
|
88
|
|
89
|
|
90 class ChromosomeManager(object):
|
|
91 """
|
|
92 Manages a CSV reader of an index file to only load one chrom at a time
|
|
93 """
|
|
94
|
|
95 def __init__(self, reader):
|
|
96 self.done = False
|
|
97 self.reader = reader
|
|
98 self.processed_chromosomes = []
|
|
99 self.current_index = 0
|
|
100 self.next_valid()
|
|
101
|
|
102 def next(self):
|
|
103 self.line = self.reader.next()
|
|
104
|
|
105 def is_valid(self, line):
|
|
106 if len(line) not in [4, 5, 9]:
|
|
107 return False
|
|
108 try:
|
|
109 [int(i) for i in line[1:]]
|
4
|
110 self.format = SCIDX_EXT
|
0
|
111 return True
|
|
112 except ValueError:
|
|
113 try:
|
|
114 if len(line) < 6:
|
|
115 return False
|
|
116 [int(line[4]), int(line[5])]
|
4
|
117 self.format = GFF_EXT
|
0
|
118 return True
|
|
119 except ValueError:
|
|
120 return False
|
|
121
|
|
122 def next_valid(self):
|
|
123 """
|
|
124 Advance to the next valid line in the reader
|
|
125 """
|
|
126 self.line = self.reader.next()
|
|
127 s = 0
|
|
128 while not self.is_valid(self.line):
|
|
129 self.line = self.reader.next()
|
|
130 s += 1
|
|
131 if s > 0:
|
|
132 # Skip initial line(s) of file
|
|
133 pass
|
|
134
|
|
135 def parse_line(self, line):
|
4
|
136 if self.format == SCIDX_EXT:
|
0
|
137 return [int(line[1]), int(line[2]), int(line[3])]
|
|
138 else:
|
|
139 return [int(line[3]), line[6], line[5]]
|
|
140
|
|
141 def chromosome_name(self):
|
|
142 """
|
|
143 Return the name of the chromosome about to be loaded
|
|
144 """
|
|
145 return self.line[0]
|
|
146
|
|
147 def load_chromosome(self, collect_data=True):
|
|
148 """
|
|
149 Load the current chromosome into an array and return it
|
|
150 """
|
|
151 cname = self.chromosome_name()
|
|
152 if cname in self.processed_chromosomes:
|
|
153 stop_err('File is not grouped by chromosome')
|
|
154 self.data = []
|
|
155 while self.line[0] == cname:
|
|
156 if collect_data:
|
|
157 read = self.parse_line(self.line)
|
|
158 if read[0] < self.current_index:
|
|
159 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index)
|
|
160 stop_err(msg)
|
|
161 self.current_index = read[0]
|
|
162 self.add_read(read)
|
|
163 try:
|
|
164 self.next()
|
|
165 except StopIteration:
|
|
166 self.done = True
|
|
167 break
|
|
168 self.processed_chromosomes.append(cname)
|
|
169 self.current_index = 0
|
|
170 data = self.data
|
|
171 # Don't retain reference anymore to save memory
|
|
172 del self.data
|
|
173 return data
|
|
174
|
|
175 def add_read(self, read):
|
4
|
176 if self.format == SCIDX_EXT:
|
0
|
177 self.data.append(read)
|
|
178 else:
|
|
179 index, strand, value = read
|
|
180 if value == '' or value == '.':
|
|
181 value = 1
|
|
182 else:
|
|
183 value = int(value)
|
|
184 if not self.data:
|
|
185 self.data.append([index, 0, 0])
|
|
186 current_read = self.data[-1]
|
|
187 if self.data[-1][0] == index:
|
|
188 current_read = self.data[-1]
|
|
189 elif self.data[-1][0] < index:
|
|
190 self.data.append([index, 0, 0])
|
|
191 current_read = self.data[-1]
|
|
192 else:
|
|
193 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index)
|
|
194 stop_err(msg)
|
|
195 if strand == '+':
|
|
196 current_read[1] += value
|
|
197 elif strand == '-':
|
|
198 current_read[2] += value
|
|
199 else:
|
|
200 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index)
|
|
201 stop_err(msg)
|
|
202
|
|
203 def skip_chromosome(self):
|
|
204 """
|
|
205 Skip the current chromosome, discarding data
|
|
206 """
|
|
207 self.load_chromosome(collect_data=False)
|
|
208
|
|
209
|
|
210 class Peak(object):
|
|
211 def __init__(self, index, pos_width, neg_width):
|
|
212 self.index = index
|
|
213 self.start = index - neg_width
|
|
214 self.end = index + pos_width
|
|
215 self.value = 0
|
|
216 self.deleted = False
|
|
217 self.safe = False
|
|
218
|
|
219 def __repr__(self):
|
|
220 return '[%d] %d' % (self.index, self.value)
|
|
221
|
|
222
|
|
223 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
|
|
224 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
|
|
225
|
|
226
|
|
227 def gff_attrs(d):
|
|
228 if not d:
|
|
229 return '.'
|
|
230 return ';'.join('%s=%s' % item for item in d.items())
|
|
231
|
|
232
|
|
233 def stop_err(msg):
|
|
234 sys.stderr.write(msg)
|
|
235 sys.exit(1)
|
|
236
|
|
237
|
|
238 def is_int(i):
|
|
239 try:
|
|
240 int(i)
|
|
241 return True
|
|
242 except ValueError:
|
|
243 return False
|
|
244
|
|
245
|
|
246 def make_keys(data):
|
|
247 return [read[0] for read in data]
|
|
248
|
|
249
|
|
250 def make_peak_keys(peaks):
|
|
251 return [peak.index for peak in peaks]
|
|
252
|
|
253
|
|
254 def get_window(data, start, end, keys):
|
|
255 """
|
|
256 Returns all reads from the data set with index between the two indexes
|
|
257 """
|
|
258 start_index = bisect.bisect_left(keys, start)
|
|
259 end_index = bisect.bisect_right(keys, end)
|
|
260 return data[start_index:end_index]
|
|
261
|
|
262
|
|
263 def get_index(value, keys):
|
|
264 """
|
|
265 Returns the index of the value in the keys using bisect
|
|
266 """
|
|
267 return bisect.bisect_left(keys, value)
|
|
268
|
|
269
|
|
270 def get_range(data):
|
|
271 lo = min([item[0] for item in data])
|
|
272 hi = max([item[0] for item in data])
|
|
273 return lo, hi
|
|
274
|
|
275
|
|
276 def get_chunks(lo, hi, size, overlap=500):
|
|
277 """
|
|
278 Divides a range into chunks of maximum size size. Returns a list of
|
|
279 2-tuples (slice_range, process_range), each a 2-tuple (start, end).
|
|
280 process_range has zero overlap and should be given to process_chromosome
|
|
281 as-is, and slice_range is overlapped and should be used to slice the
|
|
282 data (using get_window) to be given to process_chromosome.
|
|
283 """
|
|
284 chunks = []
|
|
285 for start_index in range(lo, hi, size):
|
|
286 process_start = start_index
|
|
287 # Don't go over upper bound
|
|
288 process_end = min(start_index + size, hi)
|
|
289 # Don't go under lower bound
|
|
290 slice_start = max(process_start - overlap, lo)
|
|
291 # Don't go over upper bound
|
|
292 slice_end = min(process_end + overlap, hi)
|
|
293 chunks.append(((slice_start, slice_end), (process_start, process_end)))
|
|
294 return chunks
|
|
295
|
|
296
|
|
297 def allocate_array(data, width):
|
|
298 """
|
|
299 Allocates a new array with the dimensions required to fit all reads in
|
|
300 the argument. The new array is totally empty. Returns the array and the
|
|
301 shift (number to add to a read index to get the position in the array it
|
|
302 should be at).
|
|
303 """
|
|
304 lo, hi = get_range(data)
|
|
305 rng = hi - lo
|
|
306 shift = width - lo
|
|
307 return numpy.zeros(rng+width*2, numpy.float), shift
|
|
308
|
|
309
|
|
310 def normal_array(width, sigma, normalize=True):
|
|
311 """
|
|
312 Returns an array of the normal distribution of the specified width
|
|
313 """
|
|
314 sigma2 = float(sigma)**2
|
|
315
|
|
316 def normal_func(x):
|
|
317 return math.exp(-x * x / (2 * sigma2))
|
|
318
|
|
319 # width is the half of the distribution
|
|
320 values = map(normal_func, range(-width, width))
|
|
321 values = numpy.array(values, numpy.float)
|
|
322 # normalization
|
|
323 if normalize:
|
|
324 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values
|
|
325 return values
|
|
326
|
|
327
|
|
328 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion):
|
|
329 peaks = []
|
|
330
|
|
331 def find_peaks():
|
|
332 # Go through the array and call each peak
|
|
333 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1))
|
|
334 indexes = numpy.where(results)
|
|
335 for index in indexes[0]:
|
|
336 pos = down_width or exclusion // 2
|
|
337 neg = up_width or exclusion // 2
|
|
338 # Reverse strand
|
|
339 if direction == 2:
|
|
340 # Swap positive and negative widths
|
|
341 pos, neg = neg, pos
|
|
342 peaks.append(Peak(int(index)-shift, pos, neg))
|
|
343 find_peaks()
|
|
344
|
|
345 def calculate_reads():
|
|
346 # Calculate the number of reads in each peak
|
|
347 for peak in peaks:
|
|
348 reads = get_window(data, peak.start, peak.end, keys)
|
|
349 peak.value = sum([read[direction] for read in reads])
|
|
350 # Flat list of indexes with frequency
|
|
351 indexes = [r for read in reads for r in [read[0]] * read[direction]]
|
|
352 peak.stddev = numpy.std(indexes)
|
|
353 calculate_reads()
|
|
354
|
|
355 def perform_exclusion():
|
|
356 # Process the exclusion zone
|
|
357 peak_keys = make_peak_keys(peaks)
|
|
358 peaks_by_value = peaks[:]
|
|
359 peaks_by_value.sort(key=lambda peak: -peak.value)
|
|
360 for peak in peaks_by_value:
|
|
361 peak.safe = True
|
|
362 window = get_window(peaks,
|
|
363 peak.index-exclusion//2,
|
|
364 peak.index+exclusion//2,
|
|
365 peak_keys)
|
|
366 for excluded in window:
|
|
367 if excluded.safe:
|
|
368 continue
|
|
369 i = get_index(excluded.index, peak_keys)
|
|
370 del peak_keys[i]
|
|
371 del peaks[i]
|
|
372 perform_exclusion()
|
|
373 return peaks
|
|
374
|
|
375
|
|
376 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter):
|
|
377 """
|
|
378 Process a chromosome. Takes the chromosome name, list of reads, a CSV
|
|
379 writer to write processes results to, the bounds (2-tuple) to write
|
|
380 results in, and options.
|
|
381 """
|
|
382 if not data:
|
|
383 return
|
|
384 keys = make_keys(data)
|
|
385 # Create the arrays that hold the sum of the normals
|
|
386 forward_array, forward_shift = allocate_array(data, width)
|
|
387 reverse_array, reverse_shift = allocate_array(data, width)
|
|
388 normal = normal_array(width, sigma)
|
|
389
|
|
390 def populate_array():
|
|
391 # Add each read's normal to the array
|
|
392 for read in data:
|
|
393 index, forward, reverse = read
|
|
394 # Add the normals to the appropriate regions
|
|
395 if forward:
|
|
396 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward
|
|
397 if reverse:
|
|
398 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse
|
|
399 populate_array()
|
|
400 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion)
|
|
401 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion)
|
|
402 # Convert chromosome name in preparation for writing output
|
|
403 cname = convert_data(cname, 'zeropad', 'numeric')
|
|
404
|
|
405 def write(cname, strand, peak):
|
|
406 start = max(peak.start, 1)
|
|
407 end = peak.end
|
|
408 value = peak.value
|
|
409 stddev = peak.stddev
|
|
410 if value > filter:
|
|
411 # This version of genetrack outputs only gff files.
|
|
412 writer.writerow(gff_row(cname=cname,
|
|
413 source='genetrack',
|
|
414 start=start,
|
|
415 end=end,
|
|
416 score=value,
|
|
417 strand=strand,
|
|
418 attrs={'stddev': stddev}))
|
|
419
|
|
420 for peak in forward_peaks:
|
|
421 if process_bounds[0] < peak.index < process_bounds[1]:
|
|
422 write(cname, '+', peak)
|
|
423 for peak in reverse_peaks:
|
|
424 if process_bounds[0] < peak.index < process_bounds[1]:
|
|
425 write(cname, '-', peak)
|
|
426
|
|
427
|
|
428 def sort_chromosome_reads_by_index( input_path ):
|
|
429 """
|
|
430 Return a gff file with chromosome reads sorted by index.
|
|
431 """
|
|
432 # Will this sort produce different results across platforms?
|
|
433 output_path = tempfile.NamedTemporaryFile( delete=False ).name
|
|
434 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path)
|
|
435 p = subprocess.Popen(command, shell=True)
|
|
436 p.wait()
|
|
437 return output_path
|