Mercurial > repos > greg > repmatch_gff3
comparison repmatch_gff3_util.py @ 0:d33030c8e2cc draft
Uploaded
| author | greg |
|---|---|
| date | Tue, 17 Nov 2015 14:26:08 -0500 |
| parents | |
| children | 8159aaa7da4b |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d33030c8e2cc |
|---|---|
| 1 import bisect | |
| 2 import csv | |
| 3 import os | |
| 4 import shutil | |
| 5 import sys | |
| 6 import tempfile | |
| 7 | |
| 8 from matplotlib import pyplot | |
| 9 | |
| 10 # Graph settings | |
| 11 Y_LABEL = 'Counts' | |
| 12 X_LABEL = 'Number of matched replicates' | |
| 13 TICK_WIDTH = 3 | |
| 14 # Amount to shift the graph to make labels fit, [left, right, top, bottom] | |
| 15 ADJUST = [0.180, 0.9, 0.9, 0.1] | |
| 16 # Length of tick marks, use TICK_WIDTH for width | |
| 17 pyplot.rc('xtick.major', size=10.00) | |
| 18 pyplot.rc('ytick.major', size=10.00) | |
| 19 pyplot.rc('lines', linewidth=4.00) | |
| 20 pyplot.rc('axes', linewidth=3.00) | |
| 21 pyplot.rc('font', family='Arial', size=32.0) | |
| 22 | |
| 23 PLOT_FORMATS = ['png', 'pdf', 'svg'] | |
| 24 COLORS = 'krb' | |
| 25 | |
| 26 | |
| 27 class Replicate(object): | |
| 28 | |
| 29 def __init__(self, id, dataset_path): | |
| 30 self.id = id | |
| 31 self.dataset_path = dataset_path | |
| 32 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) | |
| 33 | |
| 34 def parse(self, reader): | |
| 35 self.chromosomes = {} | |
| 36 for line in reader: | |
| 37 if line[0].startswith("#") or line[0].startswith('"'): | |
| 38 continue | |
| 39 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line | |
| 40 attrs = parse_gff_attrs(attrs) | |
| 41 distance = attrs['cw_distance'] | |
| 42 mid = int(mid) | |
| 43 midplus = int(midplus) | |
| 44 value = float(value) | |
| 45 distance = int(distance) | |
| 46 if cname not in self.chromosomes: | |
| 47 self.chromosomes[cname] = Chromosome(cname) | |
| 48 chrom = self.chromosomes[cname] | |
| 49 chrom.add_peak(Peak(cname, mid, value, distance, self)) | |
| 50 for chrom in self.chromosomes.values(): | |
| 51 chrom.sort_by_index() | |
| 52 | |
| 53 def filter(self, up_limit, low_limit): | |
| 54 for chrom in self.chromosomes.values(): | |
| 55 chrom.filter(up_limit, low_limit) | |
| 56 | |
| 57 def size(self): | |
| 58 return sum([len(c.peaks) for c in self.chromosomes.values()]) | |
| 59 | |
| 60 | |
| 61 class Chromosome(object): | |
| 62 | |
| 63 def __init__(self, name): | |
| 64 self.name = name | |
| 65 self.peaks = [] | |
| 66 | |
| 67 def add_peak(self, peak): | |
| 68 self.peaks.append(peak) | |
| 69 | |
| 70 def sort_by_index(self): | |
| 71 self.peaks.sort(key=lambda peak: peak.midpoint) | |
| 72 self.keys = make_keys(self.peaks) | |
| 73 | |
| 74 def remove_peak(self, peak): | |
| 75 i = bisect.bisect_left(self.keys, peak.midpoint) | |
| 76 # If the peak was actually found | |
| 77 if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint: | |
| 78 del self.keys[i] | |
| 79 del self.peaks[i] | |
| 80 | |
| 81 def filter(self, up_limit, low_limit): | |
| 82 self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit] | |
| 83 self.keys = make_keys(self.peaks) | |
| 84 | |
| 85 | |
| 86 class Peak(object): | |
| 87 | |
| 88 def __init__(self, chrom, midpoint, value, distance, replicate): | |
| 89 self.chrom = chrom | |
| 90 self.value = value | |
| 91 self.midpoint = midpoint | |
| 92 self.distance = distance | |
| 93 self.replicate = replicate | |
| 94 | |
| 95 def normalized_value(self, med): | |
| 96 return self.value * med / self.replicate.median | |
| 97 | |
| 98 | |
| 99 class PeakGroup(object): | |
| 100 | |
| 101 def __init__(self): | |
| 102 self.peaks = {} | |
| 103 | |
| 104 def add_peak(self, repid, peak): | |
| 105 self.peaks[repid] = peak | |
| 106 | |
| 107 @property | |
| 108 def chrom(self): | |
| 109 return self.peaks.values()[0].chrom | |
| 110 | |
| 111 @property | |
| 112 def midpoint(self): | |
| 113 return median([peak.midpoint for peak in self.peaks.values()]) | |
| 114 | |
| 115 @property | |
| 116 def num_replicates(self): | |
| 117 return len(self.peaks) | |
| 118 | |
| 119 @property | |
| 120 def median_distance(self): | |
| 121 return median([peak.distance for peak in self.peaks.values()]) | |
| 122 | |
| 123 @property | |
| 124 def value_sum(self): | |
| 125 return sum([peak.value for peak in self.peaks.values()]) | |
| 126 | |
| 127 def normalized_value(self, med): | |
| 128 values = [] | |
| 129 for peak in self.peaks.values(): | |
| 130 values.append(peak.normalized_value(med)) | |
| 131 return median(values) | |
| 132 | |
| 133 @property | |
| 134 def peakpeak_distance(self): | |
| 135 keys = self.peaks.keys() | |
| 136 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) | |
| 137 | |
| 138 | |
| 139 class FrequencyDistribution(object): | |
| 140 | |
| 141 def __init__(self, d=None): | |
| 142 self.dist = d or {} | |
| 143 | |
| 144 def add(self, x): | |
| 145 self.dist[x] = self.dist.get(x, 0) + 1 | |
| 146 | |
| 147 def graph_series(self): | |
| 148 x = [] | |
| 149 y = [] | |
| 150 for key, val in self.dist.items(): | |
| 151 x.append(key) | |
| 152 y.append(val) | |
| 153 return x, y | |
| 154 | |
| 155 def mode(self): | |
| 156 return max(self.dist.items(), key=lambda data: data[1])[0] | |
| 157 | |
| 158 def size(self): | |
| 159 return sum(self.dist.values()) | |
| 160 | |
| 161 | |
| 162 def stop_err(msg): | |
| 163 sys.stderr.write(msg) | |
| 164 sys.exit(1) | |
| 165 | |
| 166 | |
| 167 def median(data): | |
| 168 """ | |
| 169 Find the integer median of the data set. | |
| 170 """ | |
| 171 if not data: | |
| 172 return 0 | |
| 173 sdata = sorted(data) | |
| 174 if len(data) % 2 == 0: | |
| 175 return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2 | |
| 176 else: | |
| 177 return sdata[len(data)//2] | |
| 178 | |
| 179 | |
| 180 def make_keys(peaks): | |
| 181 return [data.midpoint for data in peaks] | |
| 182 | |
| 183 | |
| 184 def get_window(chromosome, target_peaks, distance): | |
| 185 """ | |
| 186 Returns a window of all peaks from a replicate within a certain distance of | |
| 187 a peak from another replicate. | |
| 188 """ | |
| 189 lower = target_peaks[0].midpoint | |
| 190 upper = target_peaks[0].midpoint | |
| 191 for peak in target_peaks: | |
| 192 lower = min(lower, peak.midpoint - distance) | |
| 193 upper = max(upper, peak.midpoint + distance) | |
| 194 start_index = bisect.bisect_left(chromosome.keys, lower) | |
| 195 end_index = bisect.bisect_right(chromosome.keys, upper) | |
| 196 return (chromosome.peaks[start_index: end_index], chromosome.name) | |
| 197 | |
| 198 | |
| 199 def match_largest(window, peak, chrum): | |
| 200 if not window: | |
| 201 return None | |
| 202 if peak.chrom != chrum: | |
| 203 return None | |
| 204 return max(window, key=lambda cpeak: cpeak.value) | |
| 205 | |
| 206 | |
| 207 def match_closest(window, peak, chrum): | |
| 208 if not window: | |
| 209 return None | |
| 210 if peak.chrom != chrum: | |
| 211 return None | |
| 212 return min(window, key=lambda match: abs(match.midpoint - peak.midpoint)) | |
| 213 | |
| 214 | |
| 215 def frequency_histogram(freqs, dataset_path, labels=[], title=''): | |
| 216 pyplot.clf() | |
| 217 pyplot.figure(figsize=(10, 10)) | |
| 218 for i, freq in enumerate(freqs): | |
| 219 xvals, yvals = freq.graph_series() | |
| 220 # Go from high to low | |
| 221 xvals.reverse() | |
| 222 pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i]) | |
| 223 pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1)))) | |
| 224 pyplot.xlabel(X_LABEL) | |
| 225 pyplot.ylabel(Y_LABEL) | |
| 226 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) | |
| 227 ax = pyplot.gca() | |
| 228 for l in ax.get_xticklines() + ax.get_yticklines(): | |
| 229 l.set_markeredgewidth(TICK_WIDTH) | |
| 230 pyplot.savefig(dataset_path) | |
| 231 | |
| 232 | |
| 233 METHODS = {'closest': match_closest, 'largest': match_largest} | |
| 234 | |
| 235 | |
| 236 def gff_attrs(d): | |
| 237 if not d: | |
| 238 return '.' | |
| 239 return ';'.join('%s=%s' % item for item in d.items()) | |
| 240 | |
| 241 | |
| 242 def parse_gff_attrs(s): | |
| 243 d = {} | |
| 244 if s == '.': | |
| 245 return d | |
| 246 for item in s.split(';'): | |
| 247 key, val = item.split('=') | |
| 248 d[key] = val | |
| 249 return d | |
| 250 | |
| 251 | |
| 252 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
| 253 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
| 254 | |
| 255 | |
| 256 def get_temporary_plot_path(plot_format): | |
| 257 """ | |
| 258 Return the path to a temporary file with a valid image format | |
| 259 file extension that can be used with bioformats. | |
| 260 """ | |
| 261 tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-') | |
| 262 fd, name = tempfile.mkstemp(suffix='.%s' % plot_format, dir=tmp_dir) | |
| 263 os.close(fd) | |
| 264 return name | |
| 265 | |
| 266 | |
| 267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, | |
| 268 plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram): | |
| 269 output_histogram_file = output_files in ["all"] and method in ["all"] | |
| 270 if len(dataset_paths) < 2: | |
| 271 return | |
| 272 if method == 'all': | |
| 273 match_methods = METHODS.keys() | |
| 274 else: | |
| 275 match_methods = [method] | |
| 276 for match_method in match_methods: | |
| 277 statistics = perform_process(dataset_paths, | |
| 278 galaxy_hids, | |
| 279 match_method, | |
| 280 distance, | |
| 281 step, | |
| 282 replicates, | |
| 283 up_limit, | |
| 284 low_limit, | |
| 285 output_files, | |
| 286 plot_format, | |
| 287 output_summary, | |
| 288 output_orphan, | |
| 289 output_detail, | |
| 290 output_key, | |
| 291 output_histogram) | |
| 292 if output_histogram_file: | |
| 293 tmp_histogram_path = get_temporary_plot_path(plot_format) | |
| 294 frequency_histogram([stat['distribution'] for stat in [statistics]], | |
| 295 tmp_histogram_path, | |
| 296 METHODS.keys()) | |
| 297 shutil.move(tmp_histogram_path, output_histogram) | |
| 298 | |
| 299 | |
| 300 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, | |
| 301 plot_format, output_summary, output_orphan, output_detail, output_key, output_histogram): | |
| 302 output_detail_file = output_files in ["all"] and output_detail is not None | |
| 303 output_key_file = output_files in ["all"] and output_key is not None | |
| 304 output_orphan_file = output_files in ["all", "simple_orphan"] and output_orphan is not None | |
| 305 output_histogram_file = output_files in ["all"] and output_histogram is not None | |
| 306 replicates = [] | |
| 307 for i, dataset_path in enumerate(dataset_paths): | |
| 308 try: | |
| 309 galaxy_hid = galaxy_hids[i] | |
| 310 r = Replicate(galaxy_hid, dataset_path) | |
| 311 replicates.append(r) | |
| 312 except Exception, e: | |
| 313 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e))) | |
| 314 attrs = 'd%sr%s' % (distance, num_required) | |
| 315 if up_limit != 1000: | |
| 316 attrs += 'u%d' % up_limit | |
| 317 if low_limit != -1000: | |
| 318 attrs += 'l%d' % low_limit | |
| 319 if step != 0: | |
| 320 attrs += 's%d' % step | |
| 321 | |
| 322 def td_writer(file_path): | |
| 323 # Returns a tab-delimited writer for a certain output | |
| 324 return csv.writer(open(file_path, 'wt'), delimiter='\t') | |
| 325 | |
| 326 labels = ('chrom', | |
| 327 'median midpoint', | |
| 328 'median midpoint+1', | |
| 329 'median normalized reads', | |
| 330 'replicates', | |
| 331 'median c-w distance', | |
| 332 'reads sum') | |
| 333 for replicate in replicates: | |
| 334 labels += ('chrom', | |
| 335 'median midpoint', | |
| 336 'median midpoint+1', | |
| 337 'c-w sum', | |
| 338 'c-w distance', | |
| 339 'replicate id') | |
| 340 summary_output = td_writer(output_summary) | |
| 341 if output_key_file: | |
| 342 key_output = td_writer(output_key) | |
| 343 key_output.writerow(('data', 'median read count')) | |
| 344 if output_detail_file: | |
| 345 detail_output = td_writer(output_detail) | |
| 346 detail_output.writerow(labels) | |
| 347 if output_orphan_file: | |
| 348 orphan_output = td_writer(output_orphan) | |
| 349 orphan_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) | |
| 350 # Perform filtering | |
| 351 if up_limit < 1000 or low_limit > -1000: | |
| 352 for replicate in replicates: | |
| 353 replicate.filter(up_limit, low_limit) | |
| 354 # Actually merge the peaks | |
| 355 peak_groups = [] | |
| 356 orphans = [] | |
| 357 freq = FrequencyDistribution() | |
| 358 | |
| 359 def do_match(reps, distance): | |
| 360 # Copy list because we will mutate it, but keep replicate references. | |
| 361 reps = reps[:] | |
| 362 while len(reps) > 1: | |
| 363 # Iterate over each replicate as "main" | |
| 364 main = reps[0] | |
| 365 reps.remove(main) | |
| 366 for chromosome in main.chromosomes.values(): | |
| 367 peaks_by_value = chromosome.peaks[:] | |
| 368 # Sort main replicate by value | |
| 369 peaks_by_value.sort(key=lambda peak: -peak.value) | |
| 370 | |
| 371 def search_for_matches(group): | |
| 372 # Here we use multiple passes, expanding the window to be | |
| 373 # +- distance from any previously matched peak. | |
| 374 while True: | |
| 375 new_match = False | |
| 376 for replicate in reps: | |
| 377 if replicate.id in group.peaks: | |
| 378 # Stop if match already found for this replicate | |
| 379 continue | |
| 380 try: | |
| 381 # Lines changed to remove a major bug by Rohit Reja. | |
| 382 window, chrum = get_window(replicate.chromosomes[chromosome.name], | |
| 383 group.peaks.values(), | |
| 384 distance) | |
| 385 match = METHODS[method](window, peak, chrum) | |
| 386 except KeyError: | |
| 387 continue | |
| 388 if match: | |
| 389 group.add_peak(replicate.id, match) | |
| 390 new_match = True | |
| 391 if not new_match: | |
| 392 break | |
| 393 # Attempt to enlarge existing peak groups | |
| 394 for group in peak_groups: | |
| 395 old_peaks = group.peaks.values()[:] | |
| 396 search_for_matches(group) | |
| 397 for peak in group.peaks.values(): | |
| 398 if peak not in old_peaks: | |
| 399 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
| 400 # Attempt to find new peaks groups. For each peak in the | |
| 401 # main replicate, search for matches in the other replicates | |
| 402 for peak in peaks_by_value: | |
| 403 matches = PeakGroup() | |
| 404 matches.add_peak(main.id, peak) | |
| 405 search_for_matches(matches) | |
| 406 # Were enough replicates matched? | |
| 407 if matches.num_replicates >= num_required: | |
| 408 for peak in matches.peaks.values(): | |
| 409 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
| 410 peak_groups.append(matches) | |
| 411 # Zero or less = no stepping | |
| 412 if step <= 0: | |
| 413 do_match(replicates, distance) | |
| 414 else: | |
| 415 for d in range(0, distance, step): | |
| 416 do_match(replicates, d) | |
| 417 for group in peak_groups: | |
| 418 freq.add(group.num_replicates) | |
| 419 # Collect together the remaining orphans | |
| 420 for replicate in replicates: | |
| 421 for chromosome in replicate.chromosomes.values(): | |
| 422 for peak in chromosome.peaks: | |
| 423 freq.add(1) | |
| 424 orphans.append(peak) | |
| 425 # Average the orphan count in the graph by # replicates | |
| 426 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) | |
| 427 for replicate in replicates: | |
| 428 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) | |
| 429 key_output.writerow((replicate.id, replicate.median)) | |
| 430 for group in peak_groups: | |
| 431 # Output summary (matched pairs). | |
| 432 summary_output.writerow(gff_row(cname=group.chrom, | |
| 433 start=group.midpoint, | |
| 434 end=group.midpoint+1, | |
| 435 source='repmatch', | |
| 436 score=group.normalized_value(med), | |
| 437 attrs={'median_distance': group.median_distance, | |
| 438 'replicates': group.num_replicates, | |
| 439 'value_sum': group.value_sum})) | |
| 440 if output_detail_file: | |
| 441 summary = (group.chrom, | |
| 442 group.midpoint, | |
| 443 group.midpoint+1, | |
| 444 group.normalized_value(med), | |
| 445 group.num_replicates, | |
| 446 group.median_distance, | |
| 447 group.value_sum) | |
| 448 for peak in group.peaks.values(): | |
| 449 summary += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) | |
| 450 detail_output.writerow(summary) | |
| 451 if output_orphan_file: | |
| 452 for orphan in orphans: | |
| 453 orphan_output.writerow((orphan.chrom, | |
| 454 orphan.midpoint, | |
| 455 orphan.midpoint+1, | |
| 456 orphan.value, | |
| 457 orphan.distance, | |
| 458 orphan.replicate.id)) | |
| 459 if output_histogram_file: | |
| 460 tmp_histogram_path = get_temporary_plot_path(plot_format) | |
| 461 frequency_histogram([freq], tmp_histogram_path) | |
| 462 shutil.move(tmp_histogram_path, output_histogram) | |
| 463 return {'distribution': freq} |
