Mercurial > repos > jjohnson > ensembl_cdna_translate
comparison ensembl_cdna_translate.py @ 7:d59e3ce10e74 draft
Uploaded
| author | jjohnson |
|---|---|
| date | Wed, 13 Dec 2017 11:15:34 -0500 |
| parents | b7f2f5e3390c |
| children | 5c92d0be6514 |
comparison
equal
deleted
inserted
replaced
| 6:f8e0e5e237a5 | 7:d59e3ce10e74 |
|---|---|
| 11 # | 11 # |
| 12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
| 13 """ | 13 """ |
| 14 | 14 |
| 15 import argparse | 15 import argparse |
| 16 import re | |
| 16 import sys | 17 import sys |
| 17 from time import sleep | 18 from time import sleep |
| 18 | 19 |
| 19 from Bio.Seq import translate | 20 from Bio.Seq import translate |
| 20 | 21 |
| 23 import digest | 24 import digest |
| 24 | 25 |
| 25 | 26 |
| 26 server = "https://rest.ensembl.org" | 27 server = "https://rest.ensembl.org" |
| 27 ext = "/info/assembly/homo_sapiens?" | 28 ext = "/info/assembly/homo_sapiens?" |
| 28 max_region = 5000000 | 29 max_region = 4000000 |
| 29 | 30 |
| 30 | 31 |
| 31 def ensembl_rest(ext, headers): | 32 def ensembl_rest(ext, headers): |
| 32 # if True: print >> sys.stderr, "%s" % ext | 33 if True: print >> sys.stderr, "%s" % ext |
| 33 r = requests.get(server+ext, headers=headers) | 34 r = requests.get(server+ext, headers=headers) |
| 34 if r.status_code == 429: | 35 if r.status_code == 429: |
| 35 print >> sys.stderr, "response headers: %s\n" % r.headers | 36 print >> sys.stderr, "response headers: %s\n" % r.headers |
| 36 if 'Retry-After' in r.headers: | 37 if 'Retry-After' in r.headers: |
| 37 sleep(r.headers['Retry-After']) | 38 sleep(r.headers['Retry-After']) |
| 80 coord_system = coord_systems[seq['coord_system']] | 81 coord_system = coord_systems[seq['coord_system']] |
| 81 coord_system[seq['name']] = int(seq['length']) | 82 coord_system[seq['name']] = int(seq['length']) |
| 82 return coord_systems | 83 return coord_systems |
| 83 | 84 |
| 84 | 85 |
| 85 def get_transcripts_bed(species, refseq, start, length,params=None): | 86 def get_transcripts_bed(species, refseq, start, length, strand='', params=None): |
| 86 bed = [] | 87 bed = [] |
| 87 param = params if params else '' | 88 param = params if params else '' |
| 88 req_header = {"Content-Type": "text/x-bed"} | 89 req_header = {"Content-Type": "text/x-bed"} |
| 89 regions = range(start, length, max_region) | 90 regions = range(start, length, max_region) |
| 90 if not regions or regions[-1] < length: | 91 if not regions or regions[-1] < length: |
| 91 regions.append(length) | 92 regions.append(length) |
| 92 for end in regions[1:]: | 93 for end in regions[1:]: |
| 93 ext = "/overlap/region/%s/%s:%d-%d?feature=transcript;%s"\ | 94 ext = "/overlap/region/%s/%s:%d-%d%s?feature=transcript;%s"\ |
| 94 % (species, refseq, start, end, param) | 95 % (species, refseq, start, end, strand, param) |
| 95 start = end + 1 | 96 start = end + 1 |
| 96 r = ensembl_rest(ext, req_header) | 97 r = ensembl_rest(ext, req_header) |
| 97 if r.text: | 98 if r.text: |
| 98 bed += r.text.splitlines() | 99 bed += r.text.splitlines() |
| 99 return bed | 100 return bed |
| 113 | 114 |
| 114 def get_cds(id,params=None): | 115 def get_cds(id,params=None): |
| 115 return get_seq(id, 'cds',params=params) | 116 return get_seq(id, 'cds',params=params) |
| 116 | 117 |
| 117 | 118 |
| 119 def get_transcript_haplotypes(species,transcript): | |
| 120 ext = "/transcript_haplotypes/%s/%s?aligned_sequences=1" % (species,transcript) | |
| 121 req_header = {"Content-Type" : "application/json"} | |
| 122 r = ensembl_rest(ext, req_header) | |
| 123 decoded = r.json() | |
| 124 | |
| 125 | |
| 118 def bed_from_line(line): | 126 def bed_from_line(line): |
| 119 fields = line.rstrip('\r\n').split('\t') | 127 fields = line.rstrip('\r\n').split('\t') |
| 128 if len(fields) < 12: | |
| 129 return None | |
| 120 (chrom, chromStart, chromEnd, name, score, strand, | 130 (chrom, chromStart, chromEnd, name, score, strand, |
| 121 thickStart, thickEnd, itemRgb, | 131 thickStart, thickEnd, itemRgb, |
| 122 blockCount, blockSizes, blockStarts) = fields[0:12] | 132 blockCount, blockSizes, blockStarts) = fields[0:12] |
| 123 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, | 133 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd, |
| 124 name=name, score=score, strand=strand, | 134 name=name, score=score, strand=strand, |
| 125 thickStart=thickStart, thickEnd=thickEnd, | 135 thickStart=thickStart, thickEnd=thickEnd, |
| 126 itemRgb=itemRgb, | 136 itemRgb=itemRgb, |
| 127 blockCount=blockCount, | 137 blockCount=blockCount, |
| 128 blockSizes=blockSizes.rstrip(','), | 138 blockSizes=blockSizes.rstrip(','), |
| 129 blockStarts=blockStarts.rstrip(',')) | 139 blockStarts=blockStarts.rstrip(',')) |
| 140 if len(fields) == 20: | |
| 141 bed_entry.second_name = fields[12] | |
| 142 bed_entry.cds_start_status = fields[13] | |
| 143 bed_entry.cds_end_status = fields[14] | |
| 144 bed_entry.exon_frames = fields[15] | |
| 145 bed_entry.biotype = fields[16] | |
| 146 bed_entry.gene_name = fields[17] | |
| 147 bed_entry.second_gene_name = fields[18] | |
| 148 bed_entry.gene_type = fields[19] | |
| 130 return bed_entry | 149 return bed_entry |
| 131 | 150 |
| 132 | 151 |
| 133 class BedEntry(object): | 152 class BedEntry(object): |
| 134 def __init__(self, chrom=None, chromStart=None, chromEnd=None, | 153 def __init__(self, chrom=None, chromStart=None, chromEnd=None, |
| 155 self.blockStarts = [int(x) for x in blockStarts.split(',')] | 174 self.blockStarts = [int(x) for x in blockStarts.split(',')] |
| 156 elif isinstance(blockStarts, list): | 175 elif isinstance(blockStarts, list): |
| 157 self.blockStarts = [int(x) for x in blockStarts] | 176 self.blockStarts = [int(x) for x in blockStarts] |
| 158 else: | 177 else: |
| 159 self.blockStarts = blockStarts | 178 self.blockStarts = blockStarts |
| 179 self.second_name = None | |
| 180 self.cds_start_status = None | |
| 181 self.cds_end_status = None | |
| 182 self.exon_frames = None | |
| 183 self.biotype = None | |
| 184 self.gene_name = None | |
| 185 self.second_gene_name = None | |
| 186 self.gene_type = None | |
| 160 self.seq = None | 187 self.seq = None |
| 161 self.pep = None | 188 self.pep = None |
| 162 | 189 |
| 163 def __str__(self): | 190 def __str__(self): |
| 164 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( | 191 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % ( |
| 244 description='Retrieve Ensembl cDNAs and three frame translate') | 271 description='Retrieve Ensembl cDNAs and three frame translate') |
| 245 parser.add_argument( | 272 parser.add_argument( |
| 246 '-s', '--species', default='human', | 273 '-s', '--species', default='human', |
| 247 help='Ensembl Species to retrieve') | 274 help='Ensembl Species to retrieve') |
| 248 parser.add_argument( | 275 parser.add_argument( |
| 276 '-R', '--regions', action='append', default=[], | |
| 277 help='Restrict Ensembl retrieval to regions e.g.: X,2:20000-25000,3:100-500+') | |
| 278 parser.add_argument( | |
| 249 '-B', '--biotypes', action='append', default=[], | 279 '-B', '--biotypes', action='append', default=[], |
| 250 help='Restrict Ensembl biotypes to retrieve') | 280 help='Restrict Ensembl biotypes to retrieve') |
| 251 parser.add_argument( | 281 parser.add_argument( |
| 252 '-i', '--input', default=None, | 282 '-i', '--input', default=None, |
| 253 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') | 283 help='Use BED instead of retrieving cDNA from ensembl (-) for stdin') |
| 292 # print >> sys.stderr, "args biotypes: %s" % args.biotypes | 322 # print >> sys.stderr, "args biotypes: %s" % args.biotypes |
| 293 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] | 323 biotypea = ['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',')] |
| 294 # print >> sys.stderr, "args biotypes: %s" % biotypea | 324 # print >> sys.stderr, "args biotypes: %s" % biotypea |
| 295 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) | 325 biotypes = ';'.join(['biotype=%s' % bt.strip() for biotype in args.biotypes for bt in biotype.split(',') if bt.strip()]) |
| 296 # print >> sys.stderr, "biotypes: %s" % biotypes | 326 # print >> sys.stderr, "biotypes: %s" % biotypes |
| 327 | |
| 328 selected_regions = dict() # chrom:(start,end) | |
| 329 region_pat = '^([^:]+)(?::(\d*)(?:-(\d+)([+-])?)?)?' | |
| 330 if args.regions: | |
| 331 for entry in args.regions: | |
| 332 if not entry: | |
| 333 continue | |
| 334 regs = [x.strip() for x in entry.split(',') if x.strip()] | |
| 335 for reg in regs: | |
| 336 m = re.match(region_pat,reg) | |
| 337 if m: | |
| 338 (chrom,start,end,strand) = m.groups() | |
| 339 if chrom: | |
| 340 if chrom not in selected_regions: | |
| 341 selected_regions[chrom] = [] | |
| 342 selected_regions[chrom].append([start,end,strand]) | |
| 297 | 343 |
| 298 translations = dict() # start : end : seq | 344 translations = dict() # start : end : seq |
| 299 | 345 |
| 300 def unique_prot(tbed, seq): | 346 def unique_prot(tbed, seq): |
| 301 if tbed.chromStart not in translations: | 347 if tbed.chromStart not in translations: |
| 362 fa_wtr.write("\n") | 408 fa_wtr.write("\n") |
| 363 fa_wtr.flush() | 409 fa_wtr.flush() |
| 364 aa_start = aa_end + 1 | 410 aa_start = aa_end + 1 |
| 365 return translate_count | 411 return translate_count |
| 366 | 412 |
| 413 def translate_region(species,ref,start,stop,strand): | |
| 414 translation_count = 0 | |
| 415 regions = range(start, stop, max_region) | |
| 416 if not regions or regions[-1] < stop: | |
| 417 regions.append(stop) | |
| 418 for end in regions[1:]: | |
| 419 bedlines = get_transcripts_bed(species, ref, start, end, strand=strand, params=biotypes) | |
| 420 if args.verbose or args.debug: | |
| 421 print >> sys.stderr,\ | |
| 422 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ | |
| 423 % (species, ref, start, end, len(bedlines)) | |
| 424 # start, end, seq | |
| 425 for i, bedline in enumerate(bedlines): | |
| 426 try: | |
| 427 bed = bed_from_line(bedline)\ | |
| 428 if any([not args.raw, fa_wtr, bed_wtr])\ | |
| 429 else None | |
| 430 if tx_wtr: | |
| 431 tx_wtr.write(bedline if args.raw else str(bed)) | |
| 432 tx_wtr.write("\n") | |
| 433 tx_wtr.flush() | |
| 434 if bed: | |
| 435 translation_count += translate_bed(bed) | |
| 436 except Exception as e: | |
| 437 print >> sys.stderr,\ | |
| 438 "BED error (%s) : %s\n" % (e, bedline) | |
| 439 start = end + 1 | |
| 440 return translation_count | |
| 441 | |
| 367 if input_rdr: | 442 if input_rdr: |
| 368 translation_count = 0 | 443 translation_count = 0 |
| 369 for i, bedline in enumerate(input_rdr): | 444 for i, bedline in enumerate(input_rdr): |
| 370 try: | 445 try: |
| 371 bed = bed_from_line(bedline) | 446 bed = bed_from_line(bedline) |
| 447 if bed is None: | |
| 448 continue | |
| 449 if bed.biotype and biotypea and bed.biotype not in biotypea: | |
| 450 continue | |
| 372 translation_count += translate_bed(bed) | 451 translation_count += translate_bed(bed) |
| 373 except: | 452 except: |
| 374 print >> sys.stderr, "BED format error: %s\n" % bedline | 453 print >> sys.stderr, "BED format error: %s\n" % bedline |
| 375 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): | 454 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): |
| 376 print >> sys.stderr,\ | 455 print >> sys.stderr,\ |
| 377 "%s\tcDNA translations:%d" % (species, translation_count) | 456 "%s\tcDNA translations:%d" % (species, translation_count) |
| 378 else: | 457 else: |
| 379 coord_systems = get_toplevel(species) | 458 coord_systems = get_toplevel(species) |
| 380 if 'chromosome' in coord_systems: | 459 if 'chromosome' in coord_systems: |
| 460 ref_lengths = dict() | |
| 381 for ref in sorted(coord_systems['chromosome'].keys()): | 461 for ref in sorted(coord_systems['chromosome'].keys()): |
| 382 length = coord_systems['chromosome'][ref] | 462 length = coord_systems['chromosome'][ref] |
| 463 ref_lengths[ref] = length | |
| 383 if not any([tx_wtr, fa_wtr, bed_wtr]): | 464 if not any([tx_wtr, fa_wtr, bed_wtr]): |
| 384 print >> sys.stderr,\ | 465 print >> sys.stderr,\ |
| 385 "%s\t%s\tlength: %d" % (species, ref, length) | 466 "%s\t%s\tlength: %d" % (species, ref, length) |
| 386 continue | 467 if selected_regions: |
| 387 if args.debug: | |
| 388 print >> sys.stderr,\ | |
| 389 "Retrieving transcripts: %s\t%s\tlength: %d"\ | |
| 390 % (species, ref, length) | |
| 391 translation_count = 0 | 468 translation_count = 0 |
| 469 for ref in sorted(selected_regions.keys()): | |
| 470 if ref in ref_lengths: | |
| 471 for reg in selected_regions[ref]: | |
| 472 (_start,_stop,_strand) = reg | |
| 473 start = int(_start) if _start else 0 | |
| 474 stop = int(_stop) if _stop else ref_lengths[ref] | |
| 475 strand = '' if not _strand else ':1' if _strand == '+' else ':-1' | |
| 476 translation_count += translate_region(species,ref,start,stop,strand) | |
| 477 else: | |
| 478 strand = '' | |
| 392 start = 0 | 479 start = 0 |
| 393 regions = range(start, length, max_region) | 480 for ref in sorted(ref_lengths.keys()): |
| 394 if not regions or regions[-1] < length: | 481 length = ref_lengths[ref] |
| 395 regions.append(length) | 482 translation_count = 0 |
| 396 for end in regions[1:]: | 483 if args.debug: |
| 397 bedlines = get_transcripts_bed(species, ref, start, end, params=biotypes) | |
| 398 if args.verbose or args.debug: | |
| 399 print >> sys.stderr,\ | 484 print >> sys.stderr,\ |
| 400 "%s\t%s\tstart: %d\tend: %d\tcDNA transcripts:%d"\ | 485 "Retrieving transcripts: %s\t%s\tlength: %d"\ |
| 401 % (species, ref, start, end, len(bedlines)) | 486 % (species, ref, length) |
| 402 # start, end, seq | 487 translation_count += translate_region(species,ref,start,length,strand) |
| 403 for i, bedline in enumerate(bedlines): | 488 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): |
| 404 try: | 489 print >> sys.stderr,\ |
| 405 bed = bed_from_line(bedline)\ | 490 "%s\t%s\tlength: %d\tcDNA translations:%d"\ |
| 406 if any([not args.raw, fa_wtr, bed_wtr])\ | 491 % (species, ref, length, translation_count) |
| 407 else None | |
| 408 if tx_wtr: | |
| 409 tx_wtr.write(bedline if args.raw else str(bed)) | |
| 410 tx_wtr.write("\n") | |
| 411 tx_wtr.flush() | |
| 412 if bed: | |
| 413 translation_count += translate_bed(bed) | |
| 414 except Exception as e: | |
| 415 print >> sys.stderr,\ | |
| 416 "BED error (%s) : %s\n" % (e, bedline) | |
| 417 start = end + 1 | |
| 418 | |
| 419 if args.debug or (args.verbose and any([fa_wtr, bed_wtr])): | |
| 420 print >> sys.stderr,\ | |
| 421 "%s\t%s\tlength: %d\tcDNA translations:%d"\ | |
| 422 % (species, ref, length, translation_count) | |
| 423 | 492 |
| 424 | 493 |
| 425 if __name__ == "__main__": | 494 if __name__ == "__main__": |
| 426 __main__() | 495 __main__() |
