comparison bedutil.py @ 1:c3d600729b6f draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author galaxyp
date Mon, 22 Jan 2018 13:13:26 -0500
parents 887e111c0919
children
comparison
equal deleted inserted replaced
0:887e111c0919 1:c3d600729b6f
9 # 9 #
10 # James E Johnson 10 # James E Johnson
11 # 11 #
12 #------------------------------------------------------------------------------ 12 #------------------------------------------------------------------------------
13 """ 13 """
14
15 from __future__ import print_function
14 16
15 import sys 17 import sys
16 18
17 from Bio.Seq import reverse_complement, translate 19 from Bio.Seq import reverse_complement, translate
18 20
43 bed_entry.second_gene_name = fields[18] 45 bed_entry.second_gene_name = fields[18]
44 bed_entry.gene_type = fields[19] 46 bed_entry.gene_type = fields[19]
45 return bed_entry 47 return bed_entry
46 48
47 49
50 def as_int_list(obj):
51 if obj is None:
52 return None
53 if isinstance(obj, list):
54 return [int(x) for x in obj]
55 elif isinstance(obj, str):
56 return [int(x) for x in obj.split(',')]
57 else: # python2 unicode?
58 return [int(x) for x in str(obj).split(',')]
59
60
48 class BedEntry(object): 61 class BedEntry(object):
49 def __init__(self, chrom=None, chromStart=None, chromEnd=None, 62 def __init__(self, chrom=None, chromStart=None, chromEnd=None,
50 name=None, score=None, strand=None, 63 name=None, score=None, strand=None,
51 thickStart=None, thickEnd=None, itemRgb=None, 64 thickStart=None, thickEnd=None, itemRgb=None,
52 blockCount=None, blockSizes=None, blockStarts=None): 65 blockCount=None, blockSizes=None, blockStarts=None):
58 self.strand = '-' if str(strand).startswith('-') else '+' 71 self.strand = '-' if str(strand).startswith('-') else '+'
59 self.thickStart = int(thickStart) if thickStart else self.chromStart 72 self.thickStart = int(thickStart) if thickStart else self.chromStart
60 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd 73 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
61 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' 74 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
62 self.blockCount = int(blockCount) 75 self.blockCount = int(blockCount)
63 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode): 76 self.blockSizes = as_int_list(blockSizes)
64 self.blockSizes = [int(x) for x in blockSizes.split(',')] 77 self.blockStarts = as_int_list(blockStarts)
65 elif isinstance(blockSizes, list):
66 self.blockSizes = [int(x) for x in blockSizes]
67 else:
68 self.blockSizes = blockSizes
69 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode):
70 self.blockStarts = [int(x) for x in blockStarts.split(',')]
71 elif isinstance(blockStarts, list):
72 self.blockStarts = [int(x) for x in blockStarts]
73 else:
74 self.blockStarts = blockStarts
75 self.second_name = None 78 self.second_name = None
76 self.cds_start_status = None 79 self.cds_start_status = None
77 self.cds_end_status = None 80 self.cds_end_status = None
78 self.exon_frames = None 81 self.exon_frames = None
79 self.biotype = None 82 self.biotype = None
163 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos)) 166 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos))
164 else: 167 else:
165 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) 168 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs)
166 return None 169 return None
167 170
171 def get_cds_bed(self):
172 cds_pos = [self.cdna_offset_of_pos(self.thickStart),
173 self.cdna_offset_of_pos(self.thickEnd)]
174 return self.trim(min(cds_pos), max(cds_pos))
175
168 def get_cigar(self): 176 def get_cigar(self):
169 cigar = '' 177 cigar = ''
170 r = range(self.blockCount) 178 r = range(self.blockCount)
171 xl = None 179 xl = None
172 for x in r: 180 for x in r:
212 translations.append(translation) 220 translations.append(translation)
213 return translations 221 return translations
214 222
215 def pos_of_cdna_offet(self, offset): 223 def pos_of_cdna_offet(self, offset):
216 if offset is not None and 0 <= offset < sum(self.blockSizes): 224 if offset is not None and 0 <= offset < sum(self.blockSizes):
217 r = range(self.blockCount) 225 r = list(range(self.blockCount))
218 rev = self.strand == '-' 226 rev = self.strand == '-'
219 if rev: 227 if rev:
220 r.reverse() 228 r.reverse()
221 nlen = 0 229 nlen = 0
222 for x in r: 230 for x in r:
231 return None 239 return None
232 240
233 def cdna_offset_of_pos(self, pos): 241 def cdna_offset_of_pos(self, pos):
234 if not self.chromStart <= pos < self.chromEnd: 242 if not self.chromStart <= pos < self.chromEnd:
235 return -1 243 return -1
236 r = range(self.blockCount) 244 r = list(range(self.blockCount))
237 rev = self.strand == '-' 245 rev = self.strand == '-'
238 if rev: 246 if rev:
239 r.reverse() 247 r.reverse()
240 nlen = 0 248 nlen = 0
241 for x in r: 249 for x in r:
246 nlen += self.blockSizes[x] 254 nlen += self.blockSizes[x]
247 255
248 def apply_variant(self, pos, ref, alt): 256 def apply_variant(self, pos, ref, alt):
249 pos = int(pos) 257 pos = int(pos)
250 if not ref or not alt: 258 if not ref or not alt:
251 print >> sys.stderr, "variant requires ref and alt sequences" 259 print("variant requires ref and alt sequences", file=sys.stderr)
252 return 260 return
253 if not self.chromStart <= pos <= self.chromEnd: 261 if not self.chromStart <= pos <= self.chromEnd:
254 print >> sys.stderr, "variant not in entry %s: %s %d < %d < %d"\ 262 print("variant not in entry %s: %s %d < %d < %d" %
255 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) 263 (self.name, self.strand,
256 print >> sys.stderr, "%s" % str(self) 264 self.chromStart, pos, self.chromEnd),
265 file=sys.stderr)
266 print("%s" % str(self), file=sys.stderr)
257 return 267 return
258 if len(ref) != len(alt): 268 if len(ref) != len(alt):
259 print >> sys.stderr, "variant only works for snp: %s %s"\ 269 print("variant only works for snp: %s %s" % (ref, alt),
260 % (ref, alt) 270 file=sys.stderr)
261 return 271 return
262 if not self.seq: 272 if not self.seq:
263 print >> sys.stderr, "variant entry %s has no seq" % self.name 273 print("variant entry %s has no seq" % self.name, file=sys.stderr)
264 return 274 return
265 """ 275 """
266 if self.strand == '-': 276 if self.strand == '-':
267 ref = reverse_complement(ref) 277 ref = reverse_complement(ref)
268 alt = reverse_complement(alt) 278 alt = reverse_complement(alt)
272 for i in range(len(ref)): 282 for i in range(len(ref)):
273 # offset = self.cdna_offset_of_pos(pos+i) 283 # offset = self.cdna_offset_of_pos(pos+i)
274 if offset is not None: 284 if offset is not None:
275 bases[offset+i] = alt[i] 285 bases[offset+i] = alt[i]
276 else: 286 else:
277 print >> sys.stderr,\ 287 print("variant offset %s: %s %d < %d < %d" %
278 "variant offset %s: %s %d < %d < %d"\ 288 (self.name, self.strand, self.chromStart,
279 % (self.name, self.strand, self.chromStart, 289 pos+1, self.chromEnd), file=sys.stderr)
280 pos+1, self.chromEnd) 290 print("%s" % str(self), file=sys.stderr)
281 print >> sys.stderr, "%s" % str(self)
282 self.seq = ''.join(bases) 291 self.seq = ''.join(bases)
283 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) 292 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt))
284 293
285 def get_variant_bed(self, pos, ref, alt): 294 def get_variant_bed(self, pos, ref, alt):
286 pos = int(pos) 295 pos = int(pos)
287 if not ref or not alt: 296 if not ref or not alt:
288 print >> sys.stderr, "variant requires ref and alt sequences" 297 print("variant requires ref and alt sequences", file=sys.stderr)
289 return None 298 return None
290 if not self.chromStart <= pos <= self.chromEnd: 299 if not self.chromStart <= pos <= self.chromEnd:
291 print >> sys.stderr,\ 300 print("variant not in entry %s: %s %d < %d < %d" %
292 "variant not in entry %s: %s %d < %d < %d"\ 301 (self.name, self.strand,
293 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) 302 self.chromStart, pos, self.chromEnd),
294 print >> sys.stderr, "%s" % str(self) 303 file=sys.stderr)
304 print("%s" % str(self), file=sys.stderr)
295 return None 305 return None
296 if not self.seq: 306 if not self.seq:
297 print >> sys.stderr, "variant entry %s has no seq" % self.name 307 print("variant entry %s has no seq" % self.name, file=sys.stderr)
298 return None 308 return None
299 tbed = BedEntry(chrom=self.chrom, 309 tbed = BedEntry(chrom=self.chrom,
300 chromStart=self.chromStart, chromEnd=self.chromEnd, 310 chromStart=self.chromStart, chromEnd=self.chromEnd,
301 name=self.name, score=self.score, strand=self.strand, 311 name=self.name, score=self.score, strand=self.strand,
302 thickStart=self.chromStart, thickEnd=self.chromEnd, 312 thickStart=self.chromStart, thickEnd=self.chromEnd,
328 # (start, end) 338 # (start, end)
329 def get_subrange(self, tstart, tstop, debug=False): 339 def get_subrange(self, tstart, tstop, debug=False):
330 chromStart = self.chromStart 340 chromStart = self.chromStart
331 chromEnd = self.chromEnd 341 chromEnd = self.chromEnd
332 if debug: 342 if debug:
333 print >> sys.stderr, "%s" % (str(self)) 343 print("%s" % (str(self)), file=sys.stderr)
334 r = range(self.blockCount) 344 r = list(range(self.blockCount))
335 if self.strand == '-': 345 if self.strand == '-':
336 r.reverse() 346 r.reverse()
337 bStart = 0 347 bStart = 0
338 bEnd = 0 348 bEnd = 0
339 for x in r: 349 for x in r:
351 (tstop - bStart) 361 (tstop - bStart)
352 else: 362 else:
353 chromStart = self.chromStart + self.blockStarts[x] +\ 363 chromStart = self.chromStart + self.blockStarts[x] +\
354 self.blockSizes[x] - (tstop - bStart) 364 self.blockSizes[x] - (tstop - bStart)
355 if debug: 365 if debug:
356 print >> sys.stderr,\ 366 print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" %
357 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\ 367 (x, self.strand, bStart, bEnd,
358 % (x, self.strand, bStart, bEnd, 368 tstart, tstop, chromStart, chromEnd), file=sys.stderr)
359 tstart, tstop, chromStart, chromEnd)
360 bStart += self.blockSizes[x] 369 bStart += self.blockSizes[x]
361 return(chromStart, chromEnd) 370 return(chromStart, chromEnd)
362 371
363 # get the blocks for sub range 372 # get the blocks for sub range
364 def get_blocks(self, chromStart, chromEnd): 373 def get_blocks(self, chromStart, chromEnd):
407 if self.strand == '-': 416 if self.strand == '-':
408 exon_sizes.reverse() 417 exon_sizes.reverse()
409 splice_sites = [sum(exon_sizes[:x]) / 3 418 splice_sites = [sum(exon_sizes[:x]) / 3
410 for x in range(1, len(exon_sizes))] 419 for x in range(1, len(exon_sizes))]
411 if debug: 420 if debug:
412 print >> sys.stderr, "splice_sites: %s" % splice_sites 421 print("splice_sites: %s" % splice_sites, file=sys.stderr)
413 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] 422 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
414 if seq: 423 if seq:
415 for i in range(3): 424 for i in range(3):
416 translation = self.get_translation(sequence=seq[i:]) 425 translation = self.get_translation(sequence=seq[i:])
417 if translation: 426 if translation:
418 tstart = 0 427 tstart = 0
419 tstop = len(translation) 428 tstop = len(translation)
420 offset = (block_sum - i) % 3 429 offset = (block_sum - i) % 3
421 if debug: 430 if debug:
422 print >> sys.stderr,\ 431 print("frame: %d\ttstart: %d tstop: %d " +
423 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ 432 "offset: %d\t%s" %
424 % (i, tstart, tstop, offset, translation) 433 (i, tstart, tstop, offset, translation),
434 file=sys.stderr)
425 if not untrimmed: 435 if not untrimmed:
426 tstart = translation.rfind('*', 0, junc) + 1 436 tstart = translation.rfind('*', 0, junc) + 1
427 stop = translation.find('*', junc) 437 stop = translation.find('*', junc)
428 tstop = stop if stop >= 0 else len(translation) 438 tstop = stop if stop >= 0 else len(translation)
429 offset = (block_sum - i) % 3 439 offset = (block_sum - i) % 3
430 trimmed = translation[tstart:tstop] 440 trimmed = translation[tstart:tstop]
431 if debug: 441 if debug:
432 print >> sys.stderr,\ 442 print("frame: %d\ttstart: %d tstop: %d " +
433 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ 443 "offset: %d\t%s" %
434 % (i, tstart, tstop, offset, trimmed) 444 (i, tstart, tstop, offset, trimmed),
445 file=sys.stderr)
435 if filtering and tstart > ignore: 446 if filtering and tstart > ignore:
436 continue 447 continue
437 # get genomic locations for start and end 448 # get genomic locations for start and end
438 if self.strand == '+': 449 if self.strand == '+':
439 chromStart = self.chromStart + i + (tstart * 3) 450 chromStart = self.chromStart + i + (tstart * 3)
447 (tblockCount, tblockSizes, tblockStarts) =\ 458 (tblockCount, tblockSizes, tblockStarts) =\
448 self.get_blocks(chromStart, chromEnd) 459 self.get_blocks(chromStart, chromEnd)
449 translations[i] = (chromStart, chromEnd, trimmed, 460 translations[i] = (chromStart, chromEnd, trimmed,
450 tblockCount, tblockSizes, tblockStarts) 461 tblockCount, tblockSizes, tblockStarts)
451 if debug: 462 if debug:
452 print >> sys.stderr,\ 463 print("tblockCount: %d tblockStarts: %s " +
453 "tblockCount: %d tblockStarts: %s tblockSizes: %s"\ 464 "tblockSizes: %s" %
454 % (tblockCount, tblockStarts, tblockSizes) 465 (tblockCount, tblockStarts, tblockSizes),
466 file=sys.stderr)
455 return translations 467 return translations
456 468
457 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): 469 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
458 # Ensembl fasta ID format 470 # Ensembl fasta ID format
459 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT 471 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT