Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison bedutil.py @ 1:c3d600729b6f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author | galaxyp |
---|---|
date | Mon, 22 Jan 2018 13:13:26 -0500 |
parents | 887e111c0919 |
children |
comparison
equal
deleted
inserted
replaced
0:887e111c0919 | 1:c3d600729b6f |
---|---|
9 # | 9 # |
10 # James E Johnson | 10 # James E Johnson |
11 # | 11 # |
12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
13 """ | 13 """ |
14 | |
15 from __future__ import print_function | |
14 | 16 |
15 import sys | 17 import sys |
16 | 18 |
17 from Bio.Seq import reverse_complement, translate | 19 from Bio.Seq import reverse_complement, translate |
18 | 20 |
43 bed_entry.second_gene_name = fields[18] | 45 bed_entry.second_gene_name = fields[18] |
44 bed_entry.gene_type = fields[19] | 46 bed_entry.gene_type = fields[19] |
45 return bed_entry | 47 return bed_entry |
46 | 48 |
47 | 49 |
50 def as_int_list(obj): | |
51 if obj is None: | |
52 return None | |
53 if isinstance(obj, list): | |
54 return [int(x) for x in obj] | |
55 elif isinstance(obj, str): | |
56 return [int(x) for x in obj.split(',')] | |
57 else: # python2 unicode? | |
58 return [int(x) for x in str(obj).split(',')] | |
59 | |
60 | |
48 class BedEntry(object): | 61 class BedEntry(object): |
49 def __init__(self, chrom=None, chromStart=None, chromEnd=None, | 62 def __init__(self, chrom=None, chromStart=None, chromEnd=None, |
50 name=None, score=None, strand=None, | 63 name=None, score=None, strand=None, |
51 thickStart=None, thickEnd=None, itemRgb=None, | 64 thickStart=None, thickEnd=None, itemRgb=None, |
52 blockCount=None, blockSizes=None, blockStarts=None): | 65 blockCount=None, blockSizes=None, blockStarts=None): |
58 self.strand = '-' if str(strand).startswith('-') else '+' | 71 self.strand = '-' if str(strand).startswith('-') else '+' |
59 self.thickStart = int(thickStart) if thickStart else self.chromStart | 72 self.thickStart = int(thickStart) if thickStart else self.chromStart |
60 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd | 73 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd |
61 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' | 74 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' |
62 self.blockCount = int(blockCount) | 75 self.blockCount = int(blockCount) |
63 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode): | 76 self.blockSizes = as_int_list(blockSizes) |
64 self.blockSizes = [int(x) for x in blockSizes.split(',')] | 77 self.blockStarts = as_int_list(blockStarts) |
65 elif isinstance(blockSizes, list): | |
66 self.blockSizes = [int(x) for x in blockSizes] | |
67 else: | |
68 self.blockSizes = blockSizes | |
69 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode): | |
70 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
71 elif isinstance(blockStarts, list): | |
72 self.blockStarts = [int(x) for x in blockStarts] | |
73 else: | |
74 self.blockStarts = blockStarts | |
75 self.second_name = None | 78 self.second_name = None |
76 self.cds_start_status = None | 79 self.cds_start_status = None |
77 self.cds_end_status = None | 80 self.cds_end_status = None |
78 self.exon_frames = None | 81 self.exon_frames = None |
79 self.biotype = None | 82 self.biotype = None |
163 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos)) | 166 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos)) |
164 else: | 167 else: |
165 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) | 168 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) |
166 return None | 169 return None |
167 | 170 |
171 def get_cds_bed(self): | |
172 cds_pos = [self.cdna_offset_of_pos(self.thickStart), | |
173 self.cdna_offset_of_pos(self.thickEnd)] | |
174 return self.trim(min(cds_pos), max(cds_pos)) | |
175 | |
168 def get_cigar(self): | 176 def get_cigar(self): |
169 cigar = '' | 177 cigar = '' |
170 r = range(self.blockCount) | 178 r = range(self.blockCount) |
171 xl = None | 179 xl = None |
172 for x in r: | 180 for x in r: |
212 translations.append(translation) | 220 translations.append(translation) |
213 return translations | 221 return translations |
214 | 222 |
215 def pos_of_cdna_offet(self, offset): | 223 def pos_of_cdna_offet(self, offset): |
216 if offset is not None and 0 <= offset < sum(self.blockSizes): | 224 if offset is not None and 0 <= offset < sum(self.blockSizes): |
217 r = range(self.blockCount) | 225 r = list(range(self.blockCount)) |
218 rev = self.strand == '-' | 226 rev = self.strand == '-' |
219 if rev: | 227 if rev: |
220 r.reverse() | 228 r.reverse() |
221 nlen = 0 | 229 nlen = 0 |
222 for x in r: | 230 for x in r: |
231 return None | 239 return None |
232 | 240 |
233 def cdna_offset_of_pos(self, pos): | 241 def cdna_offset_of_pos(self, pos): |
234 if not self.chromStart <= pos < self.chromEnd: | 242 if not self.chromStart <= pos < self.chromEnd: |
235 return -1 | 243 return -1 |
236 r = range(self.blockCount) | 244 r = list(range(self.blockCount)) |
237 rev = self.strand == '-' | 245 rev = self.strand == '-' |
238 if rev: | 246 if rev: |
239 r.reverse() | 247 r.reverse() |
240 nlen = 0 | 248 nlen = 0 |
241 for x in r: | 249 for x in r: |
246 nlen += self.blockSizes[x] | 254 nlen += self.blockSizes[x] |
247 | 255 |
248 def apply_variant(self, pos, ref, alt): | 256 def apply_variant(self, pos, ref, alt): |
249 pos = int(pos) | 257 pos = int(pos) |
250 if not ref or not alt: | 258 if not ref or not alt: |
251 print >> sys.stderr, "variant requires ref and alt sequences" | 259 print("variant requires ref and alt sequences", file=sys.stderr) |
252 return | 260 return |
253 if not self.chromStart <= pos <= self.chromEnd: | 261 if not self.chromStart <= pos <= self.chromEnd: |
254 print >> sys.stderr, "variant not in entry %s: %s %d < %d < %d"\ | 262 print("variant not in entry %s: %s %d < %d < %d" % |
255 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) | 263 (self.name, self.strand, |
256 print >> sys.stderr, "%s" % str(self) | 264 self.chromStart, pos, self.chromEnd), |
265 file=sys.stderr) | |
266 print("%s" % str(self), file=sys.stderr) | |
257 return | 267 return |
258 if len(ref) != len(alt): | 268 if len(ref) != len(alt): |
259 print >> sys.stderr, "variant only works for snp: %s %s"\ | 269 print("variant only works for snp: %s %s" % (ref, alt), |
260 % (ref, alt) | 270 file=sys.stderr) |
261 return | 271 return |
262 if not self.seq: | 272 if not self.seq: |
263 print >> sys.stderr, "variant entry %s has no seq" % self.name | 273 print("variant entry %s has no seq" % self.name, file=sys.stderr) |
264 return | 274 return |
265 """ | 275 """ |
266 if self.strand == '-': | 276 if self.strand == '-': |
267 ref = reverse_complement(ref) | 277 ref = reverse_complement(ref) |
268 alt = reverse_complement(alt) | 278 alt = reverse_complement(alt) |
272 for i in range(len(ref)): | 282 for i in range(len(ref)): |
273 # offset = self.cdna_offset_of_pos(pos+i) | 283 # offset = self.cdna_offset_of_pos(pos+i) |
274 if offset is not None: | 284 if offset is not None: |
275 bases[offset+i] = alt[i] | 285 bases[offset+i] = alt[i] |
276 else: | 286 else: |
277 print >> sys.stderr,\ | 287 print("variant offset %s: %s %d < %d < %d" % |
278 "variant offset %s: %s %d < %d < %d"\ | 288 (self.name, self.strand, self.chromStart, |
279 % (self.name, self.strand, self.chromStart, | 289 pos+1, self.chromEnd), file=sys.stderr) |
280 pos+1, self.chromEnd) | 290 print("%s" % str(self), file=sys.stderr) |
281 print >> sys.stderr, "%s" % str(self) | |
282 self.seq = ''.join(bases) | 291 self.seq = ''.join(bases) |
283 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) | 292 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) |
284 | 293 |
285 def get_variant_bed(self, pos, ref, alt): | 294 def get_variant_bed(self, pos, ref, alt): |
286 pos = int(pos) | 295 pos = int(pos) |
287 if not ref or not alt: | 296 if not ref or not alt: |
288 print >> sys.stderr, "variant requires ref and alt sequences" | 297 print("variant requires ref and alt sequences", file=sys.stderr) |
289 return None | 298 return None |
290 if not self.chromStart <= pos <= self.chromEnd: | 299 if not self.chromStart <= pos <= self.chromEnd: |
291 print >> sys.stderr,\ | 300 print("variant not in entry %s: %s %d < %d < %d" % |
292 "variant not in entry %s: %s %d < %d < %d"\ | 301 (self.name, self.strand, |
293 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) | 302 self.chromStart, pos, self.chromEnd), |
294 print >> sys.stderr, "%s" % str(self) | 303 file=sys.stderr) |
304 print("%s" % str(self), file=sys.stderr) | |
295 return None | 305 return None |
296 if not self.seq: | 306 if not self.seq: |
297 print >> sys.stderr, "variant entry %s has no seq" % self.name | 307 print("variant entry %s has no seq" % self.name, file=sys.stderr) |
298 return None | 308 return None |
299 tbed = BedEntry(chrom=self.chrom, | 309 tbed = BedEntry(chrom=self.chrom, |
300 chromStart=self.chromStart, chromEnd=self.chromEnd, | 310 chromStart=self.chromStart, chromEnd=self.chromEnd, |
301 name=self.name, score=self.score, strand=self.strand, | 311 name=self.name, score=self.score, strand=self.strand, |
302 thickStart=self.chromStart, thickEnd=self.chromEnd, | 312 thickStart=self.chromStart, thickEnd=self.chromEnd, |
328 # (start, end) | 338 # (start, end) |
329 def get_subrange(self, tstart, tstop, debug=False): | 339 def get_subrange(self, tstart, tstop, debug=False): |
330 chromStart = self.chromStart | 340 chromStart = self.chromStart |
331 chromEnd = self.chromEnd | 341 chromEnd = self.chromEnd |
332 if debug: | 342 if debug: |
333 print >> sys.stderr, "%s" % (str(self)) | 343 print("%s" % (str(self)), file=sys.stderr) |
334 r = range(self.blockCount) | 344 r = list(range(self.blockCount)) |
335 if self.strand == '-': | 345 if self.strand == '-': |
336 r.reverse() | 346 r.reverse() |
337 bStart = 0 | 347 bStart = 0 |
338 bEnd = 0 | 348 bEnd = 0 |
339 for x in r: | 349 for x in r: |
351 (tstop - bStart) | 361 (tstop - bStart) |
352 else: | 362 else: |
353 chromStart = self.chromStart + self.blockStarts[x] +\ | 363 chromStart = self.chromStart + self.blockStarts[x] +\ |
354 self.blockSizes[x] - (tstop - bStart) | 364 self.blockSizes[x] - (tstop - bStart) |
355 if debug: | 365 if debug: |
356 print >> sys.stderr,\ | 366 print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" % |
357 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\ | 367 (x, self.strand, bStart, bEnd, |
358 % (x, self.strand, bStart, bEnd, | 368 tstart, tstop, chromStart, chromEnd), file=sys.stderr) |
359 tstart, tstop, chromStart, chromEnd) | |
360 bStart += self.blockSizes[x] | 369 bStart += self.blockSizes[x] |
361 return(chromStart, chromEnd) | 370 return(chromStart, chromEnd) |
362 | 371 |
363 # get the blocks for sub range | 372 # get the blocks for sub range |
364 def get_blocks(self, chromStart, chromEnd): | 373 def get_blocks(self, chromStart, chromEnd): |
407 if self.strand == '-': | 416 if self.strand == '-': |
408 exon_sizes.reverse() | 417 exon_sizes.reverse() |
409 splice_sites = [sum(exon_sizes[:x]) / 3 | 418 splice_sites = [sum(exon_sizes[:x]) / 3 |
410 for x in range(1, len(exon_sizes))] | 419 for x in range(1, len(exon_sizes))] |
411 if debug: | 420 if debug: |
412 print >> sys.stderr, "splice_sites: %s" % splice_sites | 421 print("splice_sites: %s" % splice_sites, file=sys.stderr) |
413 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | 422 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] |
414 if seq: | 423 if seq: |
415 for i in range(3): | 424 for i in range(3): |
416 translation = self.get_translation(sequence=seq[i:]) | 425 translation = self.get_translation(sequence=seq[i:]) |
417 if translation: | 426 if translation: |
418 tstart = 0 | 427 tstart = 0 |
419 tstop = len(translation) | 428 tstop = len(translation) |
420 offset = (block_sum - i) % 3 | 429 offset = (block_sum - i) % 3 |
421 if debug: | 430 if debug: |
422 print >> sys.stderr,\ | 431 print("frame: %d\ttstart: %d tstop: %d " + |
423 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ | 432 "offset: %d\t%s" % |
424 % (i, tstart, tstop, offset, translation) | 433 (i, tstart, tstop, offset, translation), |
434 file=sys.stderr) | |
425 if not untrimmed: | 435 if not untrimmed: |
426 tstart = translation.rfind('*', 0, junc) + 1 | 436 tstart = translation.rfind('*', 0, junc) + 1 |
427 stop = translation.find('*', junc) | 437 stop = translation.find('*', junc) |
428 tstop = stop if stop >= 0 else len(translation) | 438 tstop = stop if stop >= 0 else len(translation) |
429 offset = (block_sum - i) % 3 | 439 offset = (block_sum - i) % 3 |
430 trimmed = translation[tstart:tstop] | 440 trimmed = translation[tstart:tstop] |
431 if debug: | 441 if debug: |
432 print >> sys.stderr,\ | 442 print("frame: %d\ttstart: %d tstop: %d " + |
433 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ | 443 "offset: %d\t%s" % |
434 % (i, tstart, tstop, offset, trimmed) | 444 (i, tstart, tstop, offset, trimmed), |
445 file=sys.stderr) | |
435 if filtering and tstart > ignore: | 446 if filtering and tstart > ignore: |
436 continue | 447 continue |
437 # get genomic locations for start and end | 448 # get genomic locations for start and end |
438 if self.strand == '+': | 449 if self.strand == '+': |
439 chromStart = self.chromStart + i + (tstart * 3) | 450 chromStart = self.chromStart + i + (tstart * 3) |
447 (tblockCount, tblockSizes, tblockStarts) =\ | 458 (tblockCount, tblockSizes, tblockStarts) =\ |
448 self.get_blocks(chromStart, chromEnd) | 459 self.get_blocks(chromStart, chromEnd) |
449 translations[i] = (chromStart, chromEnd, trimmed, | 460 translations[i] = (chromStart, chromEnd, trimmed, |
450 tblockCount, tblockSizes, tblockStarts) | 461 tblockCount, tblockSizes, tblockStarts) |
451 if debug: | 462 if debug: |
452 print >> sys.stderr,\ | 463 print("tblockCount: %d tblockStarts: %s " + |
453 "tblockCount: %d tblockStarts: %s tblockSizes: %s"\ | 464 "tblockSizes: %s" % |
454 % (tblockCount, tblockStarts, tblockSizes) | 465 (tblockCount, tblockStarts, tblockSizes), |
466 file=sys.stderr) | |
455 return translations | 467 return translations |
456 | 468 |
457 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): | 469 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): |
458 # Ensembl fasta ID format | 470 # Ensembl fasta ID format |
459 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | 471 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT |