annotate genetrack_util.py @ 0:0368815ae4d5 draft

Uploaded
author greg
date Tue, 17 Nov 2015 14:19:45 -0500
parents
children a952b6740fb9
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
1 import bisect
0368815ae4d5 Uploaded
greg
parents:
diff changeset
2 import math
0368815ae4d5 Uploaded
greg
parents:
diff changeset
3 import numpy
0368815ae4d5 Uploaded
greg
parents:
diff changeset
4 import re
0368815ae4d5 Uploaded
greg
parents:
diff changeset
5 import subprocess
0368815ae4d5 Uploaded
greg
parents:
diff changeset
6 import sys
0368815ae4d5 Uploaded
greg
parents:
diff changeset
7 import tempfile
0368815ae4d5 Uploaded
greg
parents:
diff changeset
8
0368815ae4d5 Uploaded
greg
parents:
diff changeset
9 ROMAN = ['0', 'I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X',
0368815ae4d5 Uploaded
greg
parents:
diff changeset
10 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI', 'XVII', 'XVIII', 'XIX', 'XX',
0368815ae4d5 Uploaded
greg
parents:
diff changeset
11 'XXI', 'XXII', 'XXIII', 'XXIV', 'XXV', 'XXVI', 'XXVII', 'XXVIII', 'XXIX',
0368815ae4d5 Uploaded
greg
parents:
diff changeset
12 'XXX']
0368815ae4d5 Uploaded
greg
parents:
diff changeset
13
0368815ae4d5 Uploaded
greg
parents:
diff changeset
14
0368815ae4d5 Uploaded
greg
parents:
diff changeset
15 def noop(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
16 return data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
17
0368815ae4d5 Uploaded
greg
parents:
diff changeset
18
0368815ae4d5 Uploaded
greg
parents:
diff changeset
19 def zeropad_to_numeric(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
20 return re.sub(r'chr0(\d)', r'chr\1', data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
21
0368815ae4d5 Uploaded
greg
parents:
diff changeset
22
0368815ae4d5 Uploaded
greg
parents:
diff changeset
23 def numeric_to_zeropad(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
24 return re.sub(r'chr(\d([^\d]|$))', r'chr0\1', data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
25
0368815ae4d5 Uploaded
greg
parents:
diff changeset
26
0368815ae4d5 Uploaded
greg
parents:
diff changeset
27 def roman_to_numeric(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
28 def convert(match):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
29 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
30 Converts a single roman numeral to a number
0368815ae4d5 Uploaded
greg
parents:
diff changeset
31 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
32 numeral = match.group(1)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
33 numeral = numeral.upper()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
34 if numeral not in ROMAN:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
35 # Unable to convert detected Roman numeral
0368815ae4d5 Uploaded
greg
parents:
diff changeset
36 return match.group(0)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
37 return 'chr'+str(ROMAN.index(numeral))+(match.group(2) or '')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
38 r = re.compile('chr([IVX]+)([^IVX]|$)', flags=re.IGNORECASE)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
39 data = r.sub(convert, data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
40 return data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
41
0368815ae4d5 Uploaded
greg
parents:
diff changeset
42
0368815ae4d5 Uploaded
greg
parents:
diff changeset
43 def numeric_to_roman(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
44 def convert(match):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
45 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
46 Converts a number to a roman numeral
0368815ae4d5 Uploaded
greg
parents:
diff changeset
47 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
48 number = int(match.group(1))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
49 if number >= len(ROMAN):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
50 # Number is out of range to convert to a Roman numeral
0368815ae4d5 Uploaded
greg
parents:
diff changeset
51 return match.group(0)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
52 return 'chr'+ROMAN[number]+(match.group(2) or '')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
53 r = re.compile('chr(\d+)([^\d]|$)')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
54 data = r.sub(convert, data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
55 return data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
56
0368815ae4d5 Uploaded
greg
parents:
diff changeset
57 FORMATS = ['zeropad', 'numeric', 'roman']
0368815ae4d5 Uploaded
greg
parents:
diff changeset
58 IN_CONVERT = {'zeropad': zeropad_to_numeric, 'roman': roman_to_numeric, 'numeric': noop}
0368815ae4d5 Uploaded
greg
parents:
diff changeset
59 OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'roman': numeric_to_roman, 'numeric': noop}
0368815ae4d5 Uploaded
greg
parents:
diff changeset
60
0368815ae4d5 Uploaded
greg
parents:
diff changeset
61
0368815ae4d5 Uploaded
greg
parents:
diff changeset
62 def conversion_functions(in_fmt, out_fmt):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
63 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
64 Returns the proper list of functions to apply to perform a conversion
0368815ae4d5 Uploaded
greg
parents:
diff changeset
65 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
66 return [IN_CONVERT[in_fmt], OUT_CONVERT[out_fmt]]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
67
0368815ae4d5 Uploaded
greg
parents:
diff changeset
68
0368815ae4d5 Uploaded
greg
parents:
diff changeset
69 def autodetect_format(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
70 if re.search('chr0\d', data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
71 fmt = 'zeropad'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
72 elif re.search('chr[IVXivx]', data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
73 fmt = 'roman'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
74 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
75 fmt = 'numeric'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
76 return fmt
0368815ae4d5 Uploaded
greg
parents:
diff changeset
77
0368815ae4d5 Uploaded
greg
parents:
diff changeset
78
0368815ae4d5 Uploaded
greg
parents:
diff changeset
79 def convert_data(data, in_fmt, out_fmt):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
80 if in_fmt == 'autodetect':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
81 in_fmt = autodetect_format(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
82 for fn in conversion_functions(in_fmt, out_fmt):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
83 data = fn(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
84 return data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
85
0368815ae4d5 Uploaded
greg
parents:
diff changeset
86
0368815ae4d5 Uploaded
greg
parents:
diff changeset
87 class ChromosomeManager(object):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
88 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
89 Manages a CSV reader of an index file to only load one chrom at a time
0368815ae4d5 Uploaded
greg
parents:
diff changeset
90 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
91
0368815ae4d5 Uploaded
greg
parents:
diff changeset
92 def __init__(self, reader):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
93 self.done = False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
94 self.reader = reader
0368815ae4d5 Uploaded
greg
parents:
diff changeset
95 self.processed_chromosomes = []
0368815ae4d5 Uploaded
greg
parents:
diff changeset
96 self.current_index = 0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
97 self.next_valid()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
98
0368815ae4d5 Uploaded
greg
parents:
diff changeset
99 def next(self):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
100 self.line = self.reader.next()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
101
0368815ae4d5 Uploaded
greg
parents:
diff changeset
102 def is_valid(self, line):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
103 if len(line) not in [4, 5, 9]:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
104 return False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
105 try:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
106 [int(i) for i in line[1:]]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
107 self.format = 'ssccidx'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
108 return True
0368815ae4d5 Uploaded
greg
parents:
diff changeset
109 except ValueError:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
110 try:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
111 if len(line) < 6:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
112 return False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
113 [int(line[4]), int(line[5])]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
114 self.format = 'gff'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
115 return True
0368815ae4d5 Uploaded
greg
parents:
diff changeset
116 except ValueError:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
117 return False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
118
0368815ae4d5 Uploaded
greg
parents:
diff changeset
119 def next_valid(self):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
120 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
121 Advance to the next valid line in the reader
0368815ae4d5 Uploaded
greg
parents:
diff changeset
122 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
123 self.line = self.reader.next()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
124 s = 0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
125 while not self.is_valid(self.line):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
126 self.line = self.reader.next()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
127 s += 1
0368815ae4d5 Uploaded
greg
parents:
diff changeset
128 if s > 0:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
129 # Skip initial line(s) of file
0368815ae4d5 Uploaded
greg
parents:
diff changeset
130 pass
0368815ae4d5 Uploaded
greg
parents:
diff changeset
131
0368815ae4d5 Uploaded
greg
parents:
diff changeset
132 def parse_line(self, line):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
133 if self.format == 'ssccidx':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
134 return [int(line[1]), int(line[2]), int(line[3])]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
135 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
136 return [int(line[3]), line[6], line[5]]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
137
0368815ae4d5 Uploaded
greg
parents:
diff changeset
138 def chromosome_name(self):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
139 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
140 Return the name of the chromosome about to be loaded
0368815ae4d5 Uploaded
greg
parents:
diff changeset
141 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
142 return self.line[0]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
143
0368815ae4d5 Uploaded
greg
parents:
diff changeset
144 def load_chromosome(self, collect_data=True):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
145 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
146 Load the current chromosome into an array and return it
0368815ae4d5 Uploaded
greg
parents:
diff changeset
147 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
148 cname = self.chromosome_name()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
149 if cname in self.processed_chromosomes:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
150 stop_err('File is not grouped by chromosome')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
151 self.data = []
0368815ae4d5 Uploaded
greg
parents:
diff changeset
152 while self.line[0] == cname:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
153 if collect_data:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
154 read = self.parse_line(self.line)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
155 if read[0] < self.current_index:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
156 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
157 stop_err(msg)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
158 self.current_index = read[0]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
159 self.add_read(read)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
160 try:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
161 self.next()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
162 except StopIteration:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
163 self.done = True
0368815ae4d5 Uploaded
greg
parents:
diff changeset
164 break
0368815ae4d5 Uploaded
greg
parents:
diff changeset
165 self.processed_chromosomes.append(cname)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
166 self.current_index = 0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
167 data = self.data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
168 # Don't retain reference anymore to save memory
0368815ae4d5 Uploaded
greg
parents:
diff changeset
169 del self.data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
170 return data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
171
0368815ae4d5 Uploaded
greg
parents:
diff changeset
172 def add_read(self, read):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
173 if self.format == 'ssccidx':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
174 self.data.append(read)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
175 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
176 index, strand, value = read
0368815ae4d5 Uploaded
greg
parents:
diff changeset
177 if value == '' or value == '.':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
178 value = 1
0368815ae4d5 Uploaded
greg
parents:
diff changeset
179 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
180 value = int(value)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
181 if not self.data:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
182 self.data.append([index, 0, 0])
0368815ae4d5 Uploaded
greg
parents:
diff changeset
183 current_read = self.data[-1]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
184 if self.data[-1][0] == index:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
185 current_read = self.data[-1]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
186 elif self.data[-1][0] < index:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
187 self.data.append([index, 0, 0])
0368815ae4d5 Uploaded
greg
parents:
diff changeset
188 current_read = self.data[-1]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
189 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
190 msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
191 stop_err(msg)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
192 if strand == '+':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
193 current_read[1] += value
0368815ae4d5 Uploaded
greg
parents:
diff changeset
194 elif strand == '-':
0368815ae4d5 Uploaded
greg
parents:
diff changeset
195 current_read[2] += value
0368815ae4d5 Uploaded
greg
parents:
diff changeset
196 else:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
197 msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
198 stop_err(msg)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
199
0368815ae4d5 Uploaded
greg
parents:
diff changeset
200 def skip_chromosome(self):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
201 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
202 Skip the current chromosome, discarding data
0368815ae4d5 Uploaded
greg
parents:
diff changeset
203 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
204 self.load_chromosome(collect_data=False)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
205
0368815ae4d5 Uploaded
greg
parents:
diff changeset
206
0368815ae4d5 Uploaded
greg
parents:
diff changeset
207 class Peak(object):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
208 def __init__(self, index, pos_width, neg_width):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
209 self.index = index
0368815ae4d5 Uploaded
greg
parents:
diff changeset
210 self.start = index - neg_width
0368815ae4d5 Uploaded
greg
parents:
diff changeset
211 self.end = index + pos_width
0368815ae4d5 Uploaded
greg
parents:
diff changeset
212 self.value = 0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
213 self.deleted = False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
214 self.safe = False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
215
0368815ae4d5 Uploaded
greg
parents:
diff changeset
216 def __repr__(self):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
217 return '[%d] %d' % (self.index, self.value)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
218
0368815ae4d5 Uploaded
greg
parents:
diff changeset
219
0368815ae4d5 Uploaded
greg
parents:
diff changeset
220 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
221 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
222
0368815ae4d5 Uploaded
greg
parents:
diff changeset
223
0368815ae4d5 Uploaded
greg
parents:
diff changeset
224 def gff_attrs(d):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
225 if not d:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
226 return '.'
0368815ae4d5 Uploaded
greg
parents:
diff changeset
227 return ';'.join('%s=%s' % item for item in d.items())
0368815ae4d5 Uploaded
greg
parents:
diff changeset
228
0368815ae4d5 Uploaded
greg
parents:
diff changeset
229
0368815ae4d5 Uploaded
greg
parents:
diff changeset
230 def stop_err(msg):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
231 sys.stderr.write(msg)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
232 sys.exit(1)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
233
0368815ae4d5 Uploaded
greg
parents:
diff changeset
234
0368815ae4d5 Uploaded
greg
parents:
diff changeset
235 def is_int(i):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
236 try:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
237 int(i)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
238 return True
0368815ae4d5 Uploaded
greg
parents:
diff changeset
239 except ValueError:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
240 return False
0368815ae4d5 Uploaded
greg
parents:
diff changeset
241
0368815ae4d5 Uploaded
greg
parents:
diff changeset
242
0368815ae4d5 Uploaded
greg
parents:
diff changeset
243 def make_keys(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
244 return [read[0] for read in data]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
245
0368815ae4d5 Uploaded
greg
parents:
diff changeset
246
0368815ae4d5 Uploaded
greg
parents:
diff changeset
247 def make_peak_keys(peaks):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
248 return [peak.index for peak in peaks]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
249
0368815ae4d5 Uploaded
greg
parents:
diff changeset
250
0368815ae4d5 Uploaded
greg
parents:
diff changeset
251 def get_window(data, start, end, keys):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
252 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
253 Returns all reads from the data set with index between the two indexes
0368815ae4d5 Uploaded
greg
parents:
diff changeset
254 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
255 start_index = bisect.bisect_left(keys, start)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
256 end_index = bisect.bisect_right(keys, end)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
257 return data[start_index:end_index]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
258
0368815ae4d5 Uploaded
greg
parents:
diff changeset
259
0368815ae4d5 Uploaded
greg
parents:
diff changeset
260 def get_index(value, keys):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
261 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
262 Returns the index of the value in the keys using bisect
0368815ae4d5 Uploaded
greg
parents:
diff changeset
263 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
264 return bisect.bisect_left(keys, value)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
265
0368815ae4d5 Uploaded
greg
parents:
diff changeset
266
0368815ae4d5 Uploaded
greg
parents:
diff changeset
267 def get_range(data):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
268 lo = min([item[0] for item in data])
0368815ae4d5 Uploaded
greg
parents:
diff changeset
269 hi = max([item[0] for item in data])
0368815ae4d5 Uploaded
greg
parents:
diff changeset
270 return lo, hi
0368815ae4d5 Uploaded
greg
parents:
diff changeset
271
0368815ae4d5 Uploaded
greg
parents:
diff changeset
272
0368815ae4d5 Uploaded
greg
parents:
diff changeset
273 def get_chunks(lo, hi, size, overlap=500):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
274 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
275 Divides a range into chunks of maximum size size. Returns a list of
0368815ae4d5 Uploaded
greg
parents:
diff changeset
276 2-tuples (slice_range, process_range), each a 2-tuple (start, end).
0368815ae4d5 Uploaded
greg
parents:
diff changeset
277 process_range has zero overlap and should be given to process_chromosome
0368815ae4d5 Uploaded
greg
parents:
diff changeset
278 as-is, and slice_range is overlapped and should be used to slice the
0368815ae4d5 Uploaded
greg
parents:
diff changeset
279 data (using get_window) to be given to process_chromosome.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
280 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
281 chunks = []
0368815ae4d5 Uploaded
greg
parents:
diff changeset
282 for start_index in range(lo, hi, size):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
283 process_start = start_index
0368815ae4d5 Uploaded
greg
parents:
diff changeset
284 # Don't go over upper bound
0368815ae4d5 Uploaded
greg
parents:
diff changeset
285 process_end = min(start_index + size, hi)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
286 # Don't go under lower bound
0368815ae4d5 Uploaded
greg
parents:
diff changeset
287 slice_start = max(process_start - overlap, lo)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
288 # Don't go over upper bound
0368815ae4d5 Uploaded
greg
parents:
diff changeset
289 slice_end = min(process_end + overlap, hi)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
290 chunks.append(((slice_start, slice_end), (process_start, process_end)))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
291 return chunks
0368815ae4d5 Uploaded
greg
parents:
diff changeset
292
0368815ae4d5 Uploaded
greg
parents:
diff changeset
293
0368815ae4d5 Uploaded
greg
parents:
diff changeset
294 def allocate_array(data, width):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
295 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
296 Allocates a new array with the dimensions required to fit all reads in
0368815ae4d5 Uploaded
greg
parents:
diff changeset
297 the argument. The new array is totally empty. Returns the array and the
0368815ae4d5 Uploaded
greg
parents:
diff changeset
298 shift (number to add to a read index to get the position in the array it
0368815ae4d5 Uploaded
greg
parents:
diff changeset
299 should be at).
0368815ae4d5 Uploaded
greg
parents:
diff changeset
300 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
301 lo, hi = get_range(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
302 rng = hi - lo
0368815ae4d5 Uploaded
greg
parents:
diff changeset
303 shift = width - lo
0368815ae4d5 Uploaded
greg
parents:
diff changeset
304 return numpy.zeros(rng+width*2, numpy.float), shift
0368815ae4d5 Uploaded
greg
parents:
diff changeset
305
0368815ae4d5 Uploaded
greg
parents:
diff changeset
306
0368815ae4d5 Uploaded
greg
parents:
diff changeset
307 def normal_array(width, sigma, normalize=True):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
308 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
309 Returns an array of the normal distribution of the specified width
0368815ae4d5 Uploaded
greg
parents:
diff changeset
310 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
311 sigma2 = float(sigma)**2
0368815ae4d5 Uploaded
greg
parents:
diff changeset
312
0368815ae4d5 Uploaded
greg
parents:
diff changeset
313 def normal_func(x):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
314 return math.exp(-x * x / (2 * sigma2))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
315
0368815ae4d5 Uploaded
greg
parents:
diff changeset
316 # width is the half of the distribution
0368815ae4d5 Uploaded
greg
parents:
diff changeset
317 values = map(normal_func, range(-width, width))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
318 values = numpy.array(values, numpy.float)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
319 # normalization
0368815ae4d5 Uploaded
greg
parents:
diff changeset
320 if normalize:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
321 values = 1.0/math.sqrt(2 * numpy.pi * sigma2) * values
0368815ae4d5 Uploaded
greg
parents:
diff changeset
322 return values
0368815ae4d5 Uploaded
greg
parents:
diff changeset
323
0368815ae4d5 Uploaded
greg
parents:
diff changeset
324
0368815ae4d5 Uploaded
greg
parents:
diff changeset
325 def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
326 peaks = []
0368815ae4d5 Uploaded
greg
parents:
diff changeset
327
0368815ae4d5 Uploaded
greg
parents:
diff changeset
328 def find_peaks():
0368815ae4d5 Uploaded
greg
parents:
diff changeset
329 # Go through the array and call each peak
0368815ae4d5 Uploaded
greg
parents:
diff changeset
330 results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
331 indexes = numpy.where(results)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
332 for index in indexes[0]:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
333 pos = down_width or exclusion // 2
0368815ae4d5 Uploaded
greg
parents:
diff changeset
334 neg = up_width or exclusion // 2
0368815ae4d5 Uploaded
greg
parents:
diff changeset
335 # Reverse strand
0368815ae4d5 Uploaded
greg
parents:
diff changeset
336 if direction == 2:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
337 # Swap positive and negative widths
0368815ae4d5 Uploaded
greg
parents:
diff changeset
338 pos, neg = neg, pos
0368815ae4d5 Uploaded
greg
parents:
diff changeset
339 peaks.append(Peak(int(index)-shift, pos, neg))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
340 find_peaks()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
341
0368815ae4d5 Uploaded
greg
parents:
diff changeset
342 def calculate_reads():
0368815ae4d5 Uploaded
greg
parents:
diff changeset
343 # Calculate the number of reads in each peak
0368815ae4d5 Uploaded
greg
parents:
diff changeset
344 for peak in peaks:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
345 reads = get_window(data, peak.start, peak.end, keys)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
346 peak.value = sum([read[direction] for read in reads])
0368815ae4d5 Uploaded
greg
parents:
diff changeset
347 # Flat list of indexes with frequency
0368815ae4d5 Uploaded
greg
parents:
diff changeset
348 indexes = [r for read in reads for r in [read[0]] * read[direction]]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
349 peak.stddev = numpy.std(indexes)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
350 calculate_reads()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
351
0368815ae4d5 Uploaded
greg
parents:
diff changeset
352 def perform_exclusion():
0368815ae4d5 Uploaded
greg
parents:
diff changeset
353 # Process the exclusion zone
0368815ae4d5 Uploaded
greg
parents:
diff changeset
354 peak_keys = make_peak_keys(peaks)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
355 peaks_by_value = peaks[:]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
356 peaks_by_value.sort(key=lambda peak: -peak.value)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
357 for peak in peaks_by_value:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
358 peak.safe = True
0368815ae4d5 Uploaded
greg
parents:
diff changeset
359 window = get_window(peaks,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
360 peak.index-exclusion//2,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
361 peak.index+exclusion//2,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
362 peak_keys)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
363 for excluded in window:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
364 if excluded.safe:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
365 continue
0368815ae4d5 Uploaded
greg
parents:
diff changeset
366 i = get_index(excluded.index, peak_keys)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
367 del peak_keys[i]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
368 del peaks[i]
0368815ae4d5 Uploaded
greg
parents:
diff changeset
369 perform_exclusion()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
370 return peaks
0368815ae4d5 Uploaded
greg
parents:
diff changeset
371
0368815ae4d5 Uploaded
greg
parents:
diff changeset
372
0368815ae4d5 Uploaded
greg
parents:
diff changeset
373 def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
374 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
375 Process a chromosome. Takes the chromosome name, list of reads, a CSV
0368815ae4d5 Uploaded
greg
parents:
diff changeset
376 writer to write processes results to, the bounds (2-tuple) to write
0368815ae4d5 Uploaded
greg
parents:
diff changeset
377 results in, and options.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
378 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
379 if not data:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
380 return
0368815ae4d5 Uploaded
greg
parents:
diff changeset
381 keys = make_keys(data)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
382 # Create the arrays that hold the sum of the normals
0368815ae4d5 Uploaded
greg
parents:
diff changeset
383 forward_array, forward_shift = allocate_array(data, width)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
384 reverse_array, reverse_shift = allocate_array(data, width)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
385 normal = normal_array(width, sigma)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
386
0368815ae4d5 Uploaded
greg
parents:
diff changeset
387 def populate_array():
0368815ae4d5 Uploaded
greg
parents:
diff changeset
388 # Add each read's normal to the array
0368815ae4d5 Uploaded
greg
parents:
diff changeset
389 for read in data:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
390 index, forward, reverse = read
0368815ae4d5 Uploaded
greg
parents:
diff changeset
391 # Add the normals to the appropriate regions
0368815ae4d5 Uploaded
greg
parents:
diff changeset
392 if forward:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
393 forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward
0368815ae4d5 Uploaded
greg
parents:
diff changeset
394 if reverse:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
395 reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse
0368815ae4d5 Uploaded
greg
parents:
diff changeset
396 populate_array()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
397 forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
398 reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
399 # Convert chromosome name in preparation for writing output
0368815ae4d5 Uploaded
greg
parents:
diff changeset
400 cname = convert_data(cname, 'zeropad', 'numeric')
0368815ae4d5 Uploaded
greg
parents:
diff changeset
401
0368815ae4d5 Uploaded
greg
parents:
diff changeset
402 def write(cname, strand, peak):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
403 start = max(peak.start, 1)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
404 end = peak.end
0368815ae4d5 Uploaded
greg
parents:
diff changeset
405 value = peak.value
0368815ae4d5 Uploaded
greg
parents:
diff changeset
406 stddev = peak.stddev
0368815ae4d5 Uploaded
greg
parents:
diff changeset
407 if value > filter:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
408 # This version of genetrack outputs only gff files.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
409 writer.writerow(gff_row(cname=cname,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
410 source='genetrack',
0368815ae4d5 Uploaded
greg
parents:
diff changeset
411 start=start,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
412 end=end,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
413 score=value,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
414 strand=strand,
0368815ae4d5 Uploaded
greg
parents:
diff changeset
415 attrs={'stddev': stddev}))
0368815ae4d5 Uploaded
greg
parents:
diff changeset
416
0368815ae4d5 Uploaded
greg
parents:
diff changeset
417 for peak in forward_peaks:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
418 if process_bounds[0] < peak.index < process_bounds[1]:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
419 write(cname, '+', peak)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
420 for peak in reverse_peaks:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
421 if process_bounds[0] < peak.index < process_bounds[1]:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
422 write(cname, '-', peak)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
423
0368815ae4d5 Uploaded
greg
parents:
diff changeset
424
0368815ae4d5 Uploaded
greg
parents:
diff changeset
425 def sort_chromosome_reads_by_index( input_path ):
0368815ae4d5 Uploaded
greg
parents:
diff changeset
426 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
427 Return a gff file with chromosome reads sorted by index.
0368815ae4d5 Uploaded
greg
parents:
diff changeset
428 """
0368815ae4d5 Uploaded
greg
parents:
diff changeset
429 # Will this sort produce different results across platforms?
0368815ae4d5 Uploaded
greg
parents:
diff changeset
430 output_path = tempfile.NamedTemporaryFile( delete=False ).name
0368815ae4d5 Uploaded
greg
parents:
diff changeset
431 command = 'sort -k 1,1 -k 4,4n "%s" > "%s"' % (input_path, output_path)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
432 p = subprocess.Popen(command, shell=True)
0368815ae4d5 Uploaded
greg
parents:
diff changeset
433 p.wait()
0368815ae4d5 Uploaded
greg
parents:
diff changeset
434 return output_path