Mercurial > repos > iuc > genetrack
comparison genetrack_util.py @ 0:a387a9e49dd6 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/genetrack commit e96df94dba60050fa28aaf55b5bb095717a5f260
| author | iuc |
|---|---|
| date | Tue, 22 Dec 2015 17:00:05 -0500 |
| parents | |
| children | 155f754c2863 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a387a9e49dd6 |
|---|---|
| 1 import bisect | |
| 2 import math | |
| 3 import numpy | |
| 4 import re | |
| 5 import subprocess | |
| 6 import sys | |
| 7 import tempfile | |
| 8 | |
# File-format tags stored on ChromosomeManager.format by is_valid():
# 'gff' for 9-column GFF rows, 'scidx' for chrom + all-integer rows.
GFF_EXT = 'gff'
SCIDX_EXT = 'scidx'
| 11 | |
| 12 | |
def noop(data):
    """Identity conversion: hand back data unchanged."""
    return data
| 15 | |
| 16 | |
def zeropad_to_numeric(data):
    """Strip the zero-padding from chromosome names: chr01 -> chr1."""
    pattern = re.compile(r'chr0(\d)')
    return pattern.sub(r'chr\1', data)
| 19 | |
| 20 | |
def numeric_to_zeropad(data):
    """Zero-pad single-digit chromosome names: chr1 -> chr01."""
    # The group only matches a lone digit (followed by a non-digit or
    # end of string), so chr12 is left alone.
    pattern = re.compile(r'chr(\d([^\d]|$))')
    return pattern.sub(r'chr0\1', data)
| 23 | |
| 24 | |
# Supported chromosome-naming conventions. IN_CONVERT maps a format to
# the function that normalizes incoming names; OUT_CONVERT maps a format
# to the function applied before writing names back out.
FORMATS = ['zeropad', 'numeric']
IN_CONVERT = {'zeropad': zeropad_to_numeric, 'numeric': noop}
OUT_CONVERT = {'zeropad': numeric_to_zeropad, 'numeric': noop}
| 28 | |
| 29 | |
def conversion_functions(in_fmt, out_fmt):
    """
    Returns the proper list of functions to apply to perform a conversion:
    input normalization first, then output formatting.
    """
    return [
        IN_CONVERT[in_fmt],
        OUT_CONVERT[out_fmt],
    ]
| 35 | |
| 36 | |
def convert_data(data, in_fmt, out_fmt):
    """Pipe data through the conversion chain for the given formats."""
    result = data
    for convert in conversion_functions(in_fmt, out_fmt):
        result = convert(result)
    return result
| 41 | |
| 42 | |
class ChromosomeManager(object):
    """
    Manages a CSV reader of an index file to only load one chrom at a time.

    Rows must be grouped by chromosome and sorted by index within each
    chromosome; violations abort via stop_err(). The detected row layout
    (SCIDX_EXT or GFF_EXT) is recorded on self.format by is_valid().
    """

    def __init__(self, reader):
        # reader: any iterator yielding split rows (e.g. a csv.reader).
        self.done = False
        self.reader = reader
        self.processed_chromosomes = []
        self.current_index = 0
        # Position on the first valid data row; this also detects format.
        self.next_valid()

    def next(self):
        # Use the next() builtin rather than reader.next() so the
        # manager works on both Python 2 and Python 3 iterators.
        self.line = next(self.reader)

    def is_valid(self, line):
        """
        Return True if line looks like a data row, recording the file
        format (scidx or gff) on self.format as a side effect.
        """
        if len(line) not in [4, 5, 9]:
            return False
        try:
            # scidx rows: every field after the chromosome is an integer.
            [int(i) for i in line[1:]]
            self.format = SCIDX_EXT
            return True
        except ValueError:
            try:
                if len(line) < 6:
                    return False
                # gff rows: integer end/score columns.
                # NOTE(review): a gff row whose score column is '.' fails
                # this check even though add_read() handles '.' — confirm
                # that is intended.
                [int(line[4]), int(line[5])]
                self.format = GFF_EXT
                return True
            except ValueError:
                return False

    def next_valid(self):
        """
        Advance to the next valid line in the reader
        """
        # Skip any initial header/comment line(s) of the file.
        self.line = next(self.reader)
        while not self.is_valid(self.line):
            self.line = next(self.reader)

    def parse_line(self, line):
        """
        Extract [index, forward, reverse] from a scidx row, or
        [start, strand, score] from a gff row.
        """
        if self.format == SCIDX_EXT:
            return [int(line[1]), int(line[2]), int(line[3])]
        else:
            return [int(line[3]), line[6], line[5]]

    def chromosome_name(self):
        """
        Return the name of the chromosome about to be loaded
        """
        return self.line[0]

    def load_chromosome(self, collect_data=True):
        """
        Load the current chromosome into an array and return it
        """
        cname = self.chromosome_name()
        if cname in self.processed_chromosomes:
            stop_err('File is not grouped by chromosome')
        self.data = []
        while self.line[0] == cname:
            if collect_data:
                read = self.parse_line(self.line)
                if read[0] < self.current_index:
                    msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (cname, self.current_index)
                    stop_err(msg)
                self.current_index = read[0]
                self.add_read(read)
            try:
                self.next()
            except StopIteration:
                self.done = True
                break
        self.processed_chromosomes.append(cname)
        self.current_index = 0
        data = self.data
        # Don't retain reference anymore to save memory
        del self.data
        return data

    def add_read(self, read):
        """
        Merge one parsed read into self.data. scidx reads are appended
        as-is; gff reads sharing an index are aggregated into
        [index, forward_total, reverse_total].
        """
        if self.format == SCIDX_EXT:
            self.data.append(read)
        else:
            index, strand, value = read
            # A missing score ('' or '.') counts as a single read.
            if value == '' or value == '.':
                value = 1
            else:
                value = int(value)
            if not self.data or self.data[-1][0] < index:
                # First read at this index: start a fresh accumulator.
                self.data.append([index, 0, 0])
            elif self.data[-1][0] > index:
                msg = 'Reads in chromosome %s are not sorted by index. (At index %d)' % (self.chromosome_name(), index)
                stop_err(msg)
            current_read = self.data[-1]
            if strand == '+':
                current_read[1] += value
            elif strand == '-':
                current_read[2] += value
            else:
                msg = 'Strand "%s" at chromosome "%s" index %d is not valid.' % (strand, self.chromosome_name(), index)
                stop_err(msg)

    def skip_chromosome(self):
        """
        Skip the current chromosome, discarding data
        """
        self.load_chromosome(collect_data=False)
| 161 | |
| 162 | |
class Peak(object):
    """A called peak: a summit index plus an asymmetric window around it."""

    def __init__(self, index, pos_width, neg_width):
        self.index = index
        # Window extends neg_width before and pos_width after the summit.
        self.start = index - neg_width
        self.end = index + pos_width
        self.value = 0
        self.deleted = False
        self.safe = False

    def __repr__(self):
        return '[%d] %d' % (self.index, self.value)
| 174 | |
| 175 | |
def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs=None):
    """
    Assemble one 9-column GFF row as a tuple ready for a csv writer.
    attrs is an optional mapping serialized by gff_attrs(); None (the
    default) yields the '.' placeholder.
    """
    # attrs defaults to None instead of {} to avoid the shared
    # mutable-default-argument pitfall; gff_attrs treats both as empty.
    return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
| 178 | |
| 179 | |
def gff_attrs(d):
    """Serialize mapping d as a GFF attribute string; '.' when empty."""
    if not d:
        return '.'
    pairs = ['%s=%s' % (key, val) for key, val in d.items()]
    return ';'.join(pairs)
| 184 | |
| 185 | |
def stop_err(msg):
    """Abort the program: report msg on stderr and exit with status 1."""
    sys.stderr.write(msg)
    raise SystemExit(1)
| 189 | |
| 190 | |
def is_int(i):
    """
    Return True if i can be converted to an int, False otherwise.

    Also catches TypeError so non-string, non-numeric values (e.g.
    None or a list) are reported as False instead of raising.
    """
    try:
        int(i)
        return True
    except (ValueError, TypeError):
        return False
| 197 | |
| 198 | |
def make_keys(data):
    """Return the index (first element) of every read, for bisect lookups."""
    return [item[0] for item in data]
| 201 | |
| 202 | |
def make_peak_keys(peaks):
    """Return each peak's summit index, for bisect lookups."""
    return [p.index for p in peaks]
| 205 | |
| 206 | |
def get_window(data, start, end, keys):
    """
    Returns all reads from the data set with index between the two
    indexes (inclusive), located via binary search on the sorted keys.
    """
    lo = bisect.bisect_left(keys, start)
    hi = bisect.bisect_right(keys, end)
    return data[lo:hi]
| 214 | |
| 215 | |
def get_index(value, keys):
    """
    Binary-search the sorted keys and return the leftmost position at
    which value occurs (or would be inserted).
    """
    return bisect.bisect_left(keys, value)
| 221 | |
| 222 | |
def get_range(data):
    """Return (lowest, highest) read index present in data."""
    indexes = [item[0] for item in data]
    return min(indexes), max(indexes)
| 227 | |
| 228 | |
def get_chunks(lo, hi, size, overlap=500):
    """
    Divides a range into chunks of maximum size size. Returns a list of
    2-tuples (slice_range, process_range), each a 2-tuple (start, end).
    process_range has zero overlap and should be given to process_chromosome
    as-is, and slice_range is overlapped and should be used to slice the
    data (using get_window) to be given to process_chromosome.
    """
    chunks = []
    for chunk_start in range(lo, hi, size):
        # Clamp the processing span to the upper bound.
        chunk_end = min(chunk_start + size, hi)
        # Widen by `overlap` on each side, clamped to [lo, hi].
        padded = (max(chunk_start - overlap, lo), min(chunk_end + overlap, hi))
        chunks.append((padded, (chunk_start, chunk_end)))
    return chunks
| 248 | |
| 249 | |
def allocate_array(data, width):
    """
    Allocates a new array with the dimensions required to fit all reads in
    the argument. The new array is totally empty. Returns the array and the
    shift (number to add to a read index to get the position in the array it
    should be at).
    """
    lo, hi = get_range(data)
    rng = hi - lo
    shift = width - lo
    # Use the builtin float dtype: the numpy.float alias was deprecated
    # in NumPy 1.20 and removed in 1.24.
    return numpy.zeros(rng + width * 2, float), shift
| 261 | |
| 262 | |
def normal_array(width, sigma, normalize=True):
    """
    Returns an array of the normal distribution of the specified width.

    The array holds 2*width samples of the Gaussian evaluated at integer
    x in [-width, width), so x = 0 falls at index `width`. When
    normalize is True the curve is scaled by the Gaussian normalization
    constant 1/sqrt(2*pi*sigma^2).
    """
    sigma2 = float(sigma) ** 2
    # Evaluate with NumPy directly: the original map() call returns a
    # lazy iterator on Python 3, which numpy.array cannot turn into a
    # numeric array, and the numpy.float alias was removed in NumPy 1.24.
    x = numpy.arange(-width, width, dtype=float)
    values = numpy.exp(-x * x / (2 * sigma2))
    # normalization
    if normalize:
        values = values / math.sqrt(2 * numpy.pi * sigma2)
    return values
| 279 | |
| 280 | |
def call_peaks(array, shift, data, keys, direction, down_width, up_width, exclusion):
    """
    Find peaks in the smoothed coverage array and score them from the
    raw reads.

    array/shift come from allocate_array(); data is the list of reads
    with keys = make_keys(data). direction selects the strand column of
    each read (1 = forward, 2 = reverse). down_width/up_width give each
    peak's extent around its summit, falling back to exclusion // 2 when
    falsy, and exclusion is the minimum spacing enforced between
    retained peaks. Returns the surviving Peak objects.
    """
    peaks = []

    def find_peaks():
        # Go through the array and call each peak: a position whose
        # value is strictly greater than both of its neighbours.
        results = (array > numpy.roll(array, 1)) & (array > numpy.roll(array, -1))
        indexes = numpy.where(results)
        for index in indexes[0]:
            # Falsy widths fall back to half the exclusion zone.
            pos = down_width or exclusion // 2
            neg = up_width or exclusion // 2
            # Reverse strand
            if direction == 2:
                # Swap positive and negative widths
                pos, neg = neg, pos
            # Subtract shift to map array position back to a read index.
            peaks.append(Peak(int(index)-shift, pos, neg))
    find_peaks()

    def calculate_reads():
        # Calculate the number of reads in each peak
        for peak in peaks:
            reads = get_window(data, peak.start, peak.end, keys)
            peak.value = sum([read[direction] for read in reads])
            # Flat list of indexes with frequency
            indexes = [r for read in reads for r in [read[0]] * read[direction]]
            peak.stddev = numpy.std(indexes)
    calculate_reads()

    def perform_exclusion():
        # Process the exclusion zone: visit peaks from highest to lowest
        # value; each visited peak marks itself safe, then removes every
        # not-yet-safe peak within exclusion // 2 of its summit.
        # NOTE: peaks and peak_keys are mutated in place during the sweep.
        peak_keys = make_peak_keys(peaks)
        peaks_by_value = peaks[:]
        peaks_by_value.sort(key=lambda peak: -peak.value)
        for peak in peaks_by_value:
            peak.safe = True
            window = get_window(peaks,
                                peak.index-exclusion//2,
                                peak.index+exclusion//2,
                                peak_keys)
            for excluded in window:
                if excluded.safe:
                    continue
                i = get_index(excluded.index, peak_keys)
                del peak_keys[i]
                del peaks[i]
    perform_exclusion()
    return peaks
| 327 | |
| 328 | |
def process_chromosome(cname, data, writer, process_bounds, width, sigma, down_width, up_width, exclusion, filter):
    """
    Process a chromosome. Takes the chromosome name, list of reads, a CSV
    writer to write processes results to, the bounds (2-tuple) to write
    results in, and options.

    width and sigma shape the smoothing kernel; down_width, up_width and
    exclusion are forwarded to call_peaks(); peaks whose value is not
    greater than `filter` are discarded. NOTE: the `filter` parameter
    shadows the builtin of the same name.
    """
    if not data:
        # Nothing to do for an empty chromosome.
        return
    keys = make_keys(data)
    # Create the arrays that hold the sum of the normals
    forward_array, forward_shift = allocate_array(data, width)
    reverse_array, reverse_shift = allocate_array(data, width)
    normal = normal_array(width, sigma)

    def populate_array():
        # Add each read's normal to the array
        for read in data:
            index, forward, reverse = read
            # Add the normals to the appropriate regions
            # (the kernel spans 2*width centred on the read's index).
            if forward:
                forward_array[index+forward_shift-width:index+forward_shift+width] += normal * forward
            if reverse:
                reverse_array[index+reverse_shift-width:index+reverse_shift+width] += normal * reverse
    populate_array()
    forward_peaks = call_peaks(forward_array, forward_shift, data, keys, 1, down_width, up_width, exclusion)
    reverse_peaks = call_peaks(reverse_array, reverse_shift, data, keys, 2, down_width, up_width, exclusion)
    # Convert chromosome name in preparation for writing output
    cname = convert_data(cname, 'zeropad', 'numeric')

    def write(cname, strand, peak):
        # Clamp the reported start to at least 1.
        start = max(peak.start, 1)
        end = peak.end
        value = peak.value
        stddev = peak.stddev
        if value > filter:
            # This version of genetrack outputs only gff files.
            writer.writerow(gff_row(cname=cname,
                                    source='genetrack',
                                    start=start,
                                    end=end,
                                    score=value,
                                    strand=strand,
                                    attrs={'stddev': stddev}))

    # Only report peaks whose summit falls strictly inside the process
    # bounds — per get_chunks(), the overlap regions belong to the
    # neighbouring chunks.
    for peak in forward_peaks:
        if process_bounds[0] < peak.index < process_bounds[1]:
            write(cname, '+', peak)
    for peak in reverse_peaks:
        if process_bounds[0] < peak.index < process_bounds[1]:
            write(cname, '-', peak)
| 379 | |
| 380 | |
def sort_chromosome_reads_by_index(input_path):
    """
    Return a gff file with chromosome reads sorted by index.

    Sorts by chromosome name (column 1) then numerically by start
    coordinate (column 4) into a new temporary file and returns that
    file's path; the caller is responsible for deleting it.
    Raises subprocess.CalledProcessError if sort fails.
    """
    # Will this sort produce different results across platforms?
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    output_path = tmp.name
    # Pass an argument list with the default shell=False so unusual
    # characters in input_path cannot be interpreted by a shell, and
    # redirect stdout straight to the temporary file. check_call also
    # surfaces a non-zero exit status instead of silently ignoring it.
    with open(output_path, 'w') as out:
        subprocess.check_call(
            ['sort', '-k', '1,1', '-k', '4,4n', input_path], stdout=out)
    return output_path
