Mercurial > repos > galaxyp > retrieve_ensembl_bed
comparison bedutil.py @ 1:c3d600729b6f draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
| author | galaxyp |
|---|---|
| date | Mon, 22 Jan 2018 13:13:26 -0500 |
| parents | 887e111c0919 |
| children |
comparison
equal
deleted
inserted
replaced
| 0:887e111c0919 | 1:c3d600729b6f |
|---|---|
| 9 # | 9 # |
| 10 # James E Johnson | 10 # James E Johnson |
| 11 # | 11 # |
| 12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
| 13 """ | 13 """ |
| 14 | |
| 15 from __future__ import print_function | |
| 14 | 16 |
| 15 import sys | 17 import sys |
| 16 | 18 |
| 17 from Bio.Seq import reverse_complement, translate | 19 from Bio.Seq import reverse_complement, translate |
| 18 | 20 |
| 43 bed_entry.second_gene_name = fields[18] | 45 bed_entry.second_gene_name = fields[18] |
| 44 bed_entry.gene_type = fields[19] | 46 bed_entry.gene_type = fields[19] |
| 45 return bed_entry | 47 return bed_entry |
| 46 | 48 |
| 47 | 49 |
| 50 def as_int_list(obj): | |
| 51 if obj is None: | |
| 52 return None | |
| 53 if isinstance(obj, list): | |
| 54 return [int(x) for x in obj] | |
| 55 elif isinstance(obj, str): | |
| 56 return [int(x) for x in obj.split(',')] | |
| 57 else: # python2 unicode? | |
| 58 return [int(x) for x in str(obj).split(',')] | |
| 59 | |
| 60 | |
| 48 class BedEntry(object): | 61 class BedEntry(object): |
| 49 def __init__(self, chrom=None, chromStart=None, chromEnd=None, | 62 def __init__(self, chrom=None, chromStart=None, chromEnd=None, |
| 50 name=None, score=None, strand=None, | 63 name=None, score=None, strand=None, |
| 51 thickStart=None, thickEnd=None, itemRgb=None, | 64 thickStart=None, thickEnd=None, itemRgb=None, |
| 52 blockCount=None, blockSizes=None, blockStarts=None): | 65 blockCount=None, blockSizes=None, blockStarts=None): |
| 58 self.strand = '-' if str(strand).startswith('-') else '+' | 71 self.strand = '-' if str(strand).startswith('-') else '+' |
| 59 self.thickStart = int(thickStart) if thickStart else self.chromStart | 72 self.thickStart = int(thickStart) if thickStart else self.chromStart |
| 60 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd | 73 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd |
| 61 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' | 74 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100' |
| 62 self.blockCount = int(blockCount) | 75 self.blockCount = int(blockCount) |
| 63 if isinstance(blockSizes, str) or isinstance(blockSizes, unicode): | 76 self.blockSizes = as_int_list(blockSizes) |
| 64 self.blockSizes = [int(x) for x in blockSizes.split(',')] | 77 self.blockStarts = as_int_list(blockStarts) |
| 65 elif isinstance(blockSizes, list): | |
| 66 self.blockSizes = [int(x) for x in blockSizes] | |
| 67 else: | |
| 68 self.blockSizes = blockSizes | |
| 69 if isinstance(blockStarts, str) or isinstance(blockSizes, unicode): | |
| 70 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
| 71 elif isinstance(blockStarts, list): | |
| 72 self.blockStarts = [int(x) for x in blockStarts] | |
| 73 else: | |
| 74 self.blockStarts = blockStarts | |
| 75 self.second_name = None | 78 self.second_name = None |
| 76 self.cds_start_status = None | 79 self.cds_start_status = None |
| 77 self.cds_end_status = None | 80 self.cds_end_status = None |
| 78 self.exon_frames = None | 81 self.exon_frames = None |
| 79 self.biotype = None | 82 self.biotype = None |
| 163 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos)) | 166 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos)) |
| 164 else: | 167 else: |
| 165 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) | 168 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs) |
| 166 return None | 169 return None |
| 167 | 170 |
| 171 def get_cds_bed(self): | |
| 172 cds_pos = [self.cdna_offset_of_pos(self.thickStart), | |
| 173 self.cdna_offset_of_pos(self.thickEnd)] | |
| 174 return self.trim(min(cds_pos), max(cds_pos)) | |
| 175 | |
| 168 def get_cigar(self): | 176 def get_cigar(self): |
| 169 cigar = '' | 177 cigar = '' |
| 170 r = range(self.blockCount) | 178 r = range(self.blockCount) |
| 171 xl = None | 179 xl = None |
| 172 for x in r: | 180 for x in r: |
| 212 translations.append(translation) | 220 translations.append(translation) |
| 213 return translations | 221 return translations |
| 214 | 222 |
| 215 def pos_of_cdna_offet(self, offset): | 223 def pos_of_cdna_offet(self, offset): |
| 216 if offset is not None and 0 <= offset < sum(self.blockSizes): | 224 if offset is not None and 0 <= offset < sum(self.blockSizes): |
| 217 r = range(self.blockCount) | 225 r = list(range(self.blockCount)) |
| 218 rev = self.strand == '-' | 226 rev = self.strand == '-' |
| 219 if rev: | 227 if rev: |
| 220 r.reverse() | 228 r.reverse() |
| 221 nlen = 0 | 229 nlen = 0 |
| 222 for x in r: | 230 for x in r: |
| 231 return None | 239 return None |
| 232 | 240 |
| 233 def cdna_offset_of_pos(self, pos): | 241 def cdna_offset_of_pos(self, pos): |
| 234 if not self.chromStart <= pos < self.chromEnd: | 242 if not self.chromStart <= pos < self.chromEnd: |
| 235 return -1 | 243 return -1 |
| 236 r = range(self.blockCount) | 244 r = list(range(self.blockCount)) |
| 237 rev = self.strand == '-' | 245 rev = self.strand == '-' |
| 238 if rev: | 246 if rev: |
| 239 r.reverse() | 247 r.reverse() |
| 240 nlen = 0 | 248 nlen = 0 |
| 241 for x in r: | 249 for x in r: |
| 246 nlen += self.blockSizes[x] | 254 nlen += self.blockSizes[x] |
| 247 | 255 |
| 248 def apply_variant(self, pos, ref, alt): | 256 def apply_variant(self, pos, ref, alt): |
| 249 pos = int(pos) | 257 pos = int(pos) |
| 250 if not ref or not alt: | 258 if not ref or not alt: |
| 251 print >> sys.stderr, "variant requires ref and alt sequences" | 259 print("variant requires ref and alt sequences", file=sys.stderr) |
| 252 return | 260 return |
| 253 if not self.chromStart <= pos <= self.chromEnd: | 261 if not self.chromStart <= pos <= self.chromEnd: |
| 254 print >> sys.stderr, "variant not in entry %s: %s %d < %d < %d"\ | 262 print("variant not in entry %s: %s %d < %d < %d" % |
| 255 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) | 263 (self.name, self.strand, |
| 256 print >> sys.stderr, "%s" % str(self) | 264 self.chromStart, pos, self.chromEnd), |
| 265 file=sys.stderr) | |
| 266 print("%s" % str(self), file=sys.stderr) | |
| 257 return | 267 return |
| 258 if len(ref) != len(alt): | 268 if len(ref) != len(alt): |
| 259 print >> sys.stderr, "variant only works for snp: %s %s"\ | 269 print("variant only works for snp: %s %s" % (ref, alt), |
| 260 % (ref, alt) | 270 file=sys.stderr) |
| 261 return | 271 return |
| 262 if not self.seq: | 272 if not self.seq: |
| 263 print >> sys.stderr, "variant entry %s has no seq" % self.name | 273 print("variant entry %s has no seq" % self.name, file=sys.stderr) |
| 264 return | 274 return |
| 265 """ | 275 """ |
| 266 if self.strand == '-': | 276 if self.strand == '-': |
| 267 ref = reverse_complement(ref) | 277 ref = reverse_complement(ref) |
| 268 alt = reverse_complement(alt) | 278 alt = reverse_complement(alt) |
| 272 for i in range(len(ref)): | 282 for i in range(len(ref)): |
| 273 # offset = self.cdna_offset_of_pos(pos+i) | 283 # offset = self.cdna_offset_of_pos(pos+i) |
| 274 if offset is not None: | 284 if offset is not None: |
| 275 bases[offset+i] = alt[i] | 285 bases[offset+i] = alt[i] |
| 276 else: | 286 else: |
| 277 print >> sys.stderr,\ | 287 print("variant offset %s: %s %d < %d < %d" % |
| 278 "variant offset %s: %s %d < %d < %d"\ | 288 (self.name, self.strand, self.chromStart, |
| 279 % (self.name, self.strand, self.chromStart, | 289 pos+1, self.chromEnd), file=sys.stderr) |
| 280 pos+1, self.chromEnd) | 290 print("%s" % str(self), file=sys.stderr) |
| 281 print >> sys.stderr, "%s" % str(self) | |
| 282 self.seq = ''.join(bases) | 291 self.seq = ''.join(bases) |
| 283 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) | 292 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt)) |
| 284 | 293 |
| 285 def get_variant_bed(self, pos, ref, alt): | 294 def get_variant_bed(self, pos, ref, alt): |
| 286 pos = int(pos) | 295 pos = int(pos) |
| 287 if not ref or not alt: | 296 if not ref or not alt: |
| 288 print >> sys.stderr, "variant requires ref and alt sequences" | 297 print("variant requires ref and alt sequences", file=sys.stderr) |
| 289 return None | 298 return None |
| 290 if not self.chromStart <= pos <= self.chromEnd: | 299 if not self.chromStart <= pos <= self.chromEnd: |
| 291 print >> sys.stderr,\ | 300 print("variant not in entry %s: %s %d < %d < %d" % |
| 292 "variant not in entry %s: %s %d < %d < %d"\ | 301 (self.name, self.strand, |
| 293 % (self.name, self.strand, self.chromStart, pos, self.chromEnd) | 302 self.chromStart, pos, self.chromEnd), |
| 294 print >> sys.stderr, "%s" % str(self) | 303 file=sys.stderr) |
| 304 print("%s" % str(self), file=sys.stderr) | |
| 295 return None | 305 return None |
| 296 if not self.seq: | 306 if not self.seq: |
| 297 print >> sys.stderr, "variant entry %s has no seq" % self.name | 307 print("variant entry %s has no seq" % self.name, file=sys.stderr) |
| 298 return None | 308 return None |
| 299 tbed = BedEntry(chrom=self.chrom, | 309 tbed = BedEntry(chrom=self.chrom, |
| 300 chromStart=self.chromStart, chromEnd=self.chromEnd, | 310 chromStart=self.chromStart, chromEnd=self.chromEnd, |
| 301 name=self.name, score=self.score, strand=self.strand, | 311 name=self.name, score=self.score, strand=self.strand, |
| 302 thickStart=self.chromStart, thickEnd=self.chromEnd, | 312 thickStart=self.chromStart, thickEnd=self.chromEnd, |
| 328 # (start, end) | 338 # (start, end) |
| 329 def get_subrange(self, tstart, tstop, debug=False): | 339 def get_subrange(self, tstart, tstop, debug=False): |
| 330 chromStart = self.chromStart | 340 chromStart = self.chromStart |
| 331 chromEnd = self.chromEnd | 341 chromEnd = self.chromEnd |
| 332 if debug: | 342 if debug: |
| 333 print >> sys.stderr, "%s" % (str(self)) | 343 print("%s" % (str(self)), file=sys.stderr) |
| 334 r = range(self.blockCount) | 344 r = list(range(self.blockCount)) |
| 335 if self.strand == '-': | 345 if self.strand == '-': |
| 336 r.reverse() | 346 r.reverse() |
| 337 bStart = 0 | 347 bStart = 0 |
| 338 bEnd = 0 | 348 bEnd = 0 |
| 339 for x in r: | 349 for x in r: |
| 351 (tstop - bStart) | 361 (tstop - bStart) |
| 352 else: | 362 else: |
| 353 chromStart = self.chromStart + self.blockStarts[x] +\ | 363 chromStart = self.chromStart + self.blockStarts[x] +\ |
| 354 self.blockSizes[x] - (tstop - bStart) | 364 self.blockSizes[x] - (tstop - bStart) |
| 355 if debug: | 365 if debug: |
| 356 print >> sys.stderr,\ | 366 print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" % |
| 357 "%3d %s\t%d\t%d\t%d\t%d\t%d\t%d"\ | 367 (x, self.strand, bStart, bEnd, |
| 358 % (x, self.strand, bStart, bEnd, | 368 tstart, tstop, chromStart, chromEnd), file=sys.stderr) |
| 359 tstart, tstop, chromStart, chromEnd) | |
| 360 bStart += self.blockSizes[x] | 369 bStart += self.blockSizes[x] |
| 361 return(chromStart, chromEnd) | 370 return(chromStart, chromEnd) |
| 362 | 371 |
| 363 # get the blocks for sub range | 372 # get the blocks for sub range |
| 364 def get_blocks(self, chromStart, chromEnd): | 373 def get_blocks(self, chromStart, chromEnd): |
| 407 if self.strand == '-': | 416 if self.strand == '-': |
| 408 exon_sizes.reverse() | 417 exon_sizes.reverse() |
| 409 splice_sites = [sum(exon_sizes[:x]) / 3 | 418 splice_sites = [sum(exon_sizes[:x]) / 3 |
| 410 for x in range(1, len(exon_sizes))] | 419 for x in range(1, len(exon_sizes))] |
| 411 if debug: | 420 if debug: |
| 412 print >> sys.stderr, "splice_sites: %s" % splice_sites | 421 print("splice_sites: %s" % splice_sites, file=sys.stderr) |
| 413 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | 422 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] |
| 414 if seq: | 423 if seq: |
| 415 for i in range(3): | 424 for i in range(3): |
| 416 translation = self.get_translation(sequence=seq[i:]) | 425 translation = self.get_translation(sequence=seq[i:]) |
| 417 if translation: | 426 if translation: |
| 418 tstart = 0 | 427 tstart = 0 |
| 419 tstop = len(translation) | 428 tstop = len(translation) |
| 420 offset = (block_sum - i) % 3 | 429 offset = (block_sum - i) % 3 |
| 421 if debug: | 430 if debug: |
| 422 print >> sys.stderr,\ | 431 print("frame: %d\ttstart: %d tstop: %d " + |
| 423 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ | 432 "offset: %d\t%s" % |
| 424 % (i, tstart, tstop, offset, translation) | 433 (i, tstart, tstop, offset, translation), |
| 434 file=sys.stderr) | |
| 425 if not untrimmed: | 435 if not untrimmed: |
| 426 tstart = translation.rfind('*', 0, junc) + 1 | 436 tstart = translation.rfind('*', 0, junc) + 1 |
| 427 stop = translation.find('*', junc) | 437 stop = translation.find('*', junc) |
| 428 tstop = stop if stop >= 0 else len(translation) | 438 tstop = stop if stop >= 0 else len(translation) |
| 429 offset = (block_sum - i) % 3 | 439 offset = (block_sum - i) % 3 |
| 430 trimmed = translation[tstart:tstop] | 440 trimmed = translation[tstart:tstop] |
| 431 if debug: | 441 if debug: |
| 432 print >> sys.stderr,\ | 442 print("frame: %d\ttstart: %d tstop: %d " + |
| 433 "frame: %d\ttstart: %d tstop: %d offset: %d\t%s"\ | 443 "offset: %d\t%s" % |
| 434 % (i, tstart, tstop, offset, trimmed) | 444 (i, tstart, tstop, offset, trimmed), |
| 445 file=sys.stderr) | |
| 435 if filtering and tstart > ignore: | 446 if filtering and tstart > ignore: |
| 436 continue | 447 continue |
| 437 # get genomic locations for start and end | 448 # get genomic locations for start and end |
| 438 if self.strand == '+': | 449 if self.strand == '+': |
| 439 chromStart = self.chromStart + i + (tstart * 3) | 450 chromStart = self.chromStart + i + (tstart * 3) |
| 447 (tblockCount, tblockSizes, tblockStarts) =\ | 458 (tblockCount, tblockSizes, tblockStarts) =\ |
| 448 self.get_blocks(chromStart, chromEnd) | 459 self.get_blocks(chromStart, chromEnd) |
| 449 translations[i] = (chromStart, chromEnd, trimmed, | 460 translations[i] = (chromStart, chromEnd, trimmed, |
| 450 tblockCount, tblockSizes, tblockStarts) | 461 tblockCount, tblockSizes, tblockStarts) |
| 451 if debug: | 462 if debug: |
| 452 print >> sys.stderr,\ | 463 print("tblockCount: %d tblockStarts: %s " + |
| 453 "tblockCount: %d tblockStarts: %s tblockSizes: %s"\ | 464 "tblockSizes: %s" % |
| 454 % (tblockCount, tblockStarts, tblockSizes) | 465 (tblockCount, tblockStarts, tblockSizes), |
| 466 file=sys.stderr) | |
| 455 return translations | 467 return translations |
| 456 | 468 |
| 457 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): | 469 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): |
| 458 # Ensembl fasta ID format | 470 # Ensembl fasta ID format |
| 459 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | 471 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT |
