Mercurial > repos > jjohnson > translate_bed_sequences
comparison translate_bed_sequences.py @ 5:c626a939eef7 draft default tip
Uploaded
| author | jjohnson |
|---|---|
| date | Tue, 12 Jan 2016 14:38:03 -0500 |
| parents | 3b526a780849 |
| children |
comparison
equal
deleted
inserted
replaced
| 4:aa93f7910259 | 5:c626a939eef7 |
|---|---|
| 17 Output: Fasta of 3-frame translations of the spliced sequence | 17 Output: Fasta of 3-frame translations of the spliced sequence |
| 18 | 18 |
| 19 """ | 19 """ |
| 20 | 20 |
| 21 import sys,re,os.path | 21 import sys,re,os.path |
| 22 import tempfile | |
| 22 import optparse | 23 import optparse |
| 23 from optparse import OptionParser | 24 from optparse import OptionParser |
| 24 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | 25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate |
| 25 | 26 |
| 26 class BedEntry( object ): | 27 class BedEntry( object ): |
| 27 def __init__(self, line): | 28 def __init__(self, line): |
| 28 self.line = line | 29 self.line = line |
| 29 try: | 30 try: |
| 30 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts,seq) = line.split('\t')[0:13] | 31 fields = line.rstrip('\r\n').split('\t') |
| 32 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12] | |
| 33 seq = fields[12] if len(fields) > 12 else None | |
| 31 self.chrom = chrom | 34 self.chrom = chrom |
| 32 self.chromStart = int(chromStart) | 35 self.chromStart = int(chromStart) |
| 33 self.chromEnd = int(chromEnd) | 36 self.chromEnd = int(chromEnd) |
| 34 self.name = name | 37 self.name = name |
| 35 self.score = int(score) | 38 self.score = int(score) |
| 42 self.blockStarts = [int(x) for x in blockStarts.split(',')] | 45 self.blockStarts = [int(x) for x in blockStarts.split(',')] |
| 43 self.seq = seq | 46 self.seq = seq |
| 44 except Exception, e: | 47 except Exception, e: |
| 45 print >> sys.stderr, "Unable to read Bed entry" % e | 48 print >> sys.stderr, "Unable to read Bed entry" % e |
| 46 exit(1) | 49 exit(1) |
| 50 def __str__(self): | |
| 51 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % ( | |
| 52 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, | |
| 53 ','.join([str(x) for x in self.blockSizes]), | |
| 54 ','.join([str(x) for x in self.blockStarts]), | |
| 55 '\t%s' % self.seq if self.seq else '') | |
| 47 def get_splice_junctions(self): | 56 def get_splice_junctions(self): |
| 48 splice_juncs = [] | 57 splice_juncs = [] |
| 49 for i in range(self.blockCount - 1): | 58 for i in range(self.blockCount - 1): |
| 50 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | 59 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) |
| 51 splice_juncs.append(splice_junc) | 60 splice_juncs.append(splice_junc) |
| 78 for i in range(3): | 87 for i in range(3): |
| 79 translation = self.get_translation(sequence=seq[i:]) | 88 translation = self.get_translation(sequence=seq[i:]) |
| 80 if translation: | 89 if translation: |
| 81 translations.append(translation) | 90 translations.append(translation) |
| 82 return translations | 91 return translations |
| 83 ## [[start,end,seq],[start,end,seq],[start,end,seq]] | 92 ## (start,end) |
| 93 def get_subrange(self,tstart,tstop): | |
| 94 chromStart = self.chromStart | |
| 95 chromEnd = self.chromEnd | |
| 96 r = range(self.blockCount) | |
| 97 if self.strand == '-': | |
| 98 r.reverse() | |
| 99 bStart = 0 | |
| 100 for x in r: | |
| 101 bEnd = bStart + self.blockSizes[x] | |
| 102 if bStart <= tstart < bEnd: | |
| 103 if self.strand == '+': | |
| 104 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart) | |
| 105 else: | |
| 106 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart) | |
| 107 if bStart <= tstop < bEnd: | |
| 108 if self.strand == '+': | |
| 109 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart) | |
| 110 else: | |
| 111 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart) | |
| 112 bStart += self.blockSizes[x] | |
| 113 return(chromStart,chromEnd) | |
| 114 #get the blocks for sub range | |
| 115 def get_blocks(self,chromStart,chromEnd): | |
| 116 tblockCount = 0 | |
| 117 tblockSizes = [] | |
| 118 tblockStarts = [] | |
| 119 for x in range(self.blockCount): | |
| 120 bStart = self.chromStart + self.blockStarts[x] | |
| 121 bEnd = bStart + self.blockSizes[x] | |
| 122 if bStart > chromEnd: | |
| 123 break | |
| 124 if bEnd < chromStart: | |
| 125 continue | |
| 126 cStart = max(chromStart,bStart) | |
| 127 tblockStarts.append(cStart - chromStart) | |
| 128 tblockSizes.append(min(chromEnd,bEnd) - cStart) | |
| 129 tblockCount += 1 | |
| 130 ## print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) | |
| 131 return (tblockCount,tblockSizes,tblockStarts) | |
| 132 ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)] | |
| 84 ## filter: ignore translation if stop codon in first exon after ignore_left_bp | 133 ## filter: ignore translation if stop codon in first exon after ignore_left_bp |
| 85 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0): | 134 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0,debug=False): |
| 86 translations = [None,None,None] | 135 translations = [None,None,None,None,None,None] |
| 87 seq = self.get_spliced_seq() | 136 seq = self.get_spliced_seq() |
| 88 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 | 137 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 |
| 89 block_sum = sum(self.blockSizes) | 138 block_sum = sum(self.blockSizes) |
| 90 exon_sizes = self.blockSizes | 139 exon_sizes = [x for x in self.blockSizes] |
| 91 if self.strand == '-': | 140 if self.strand == '-': |
| 92 exon_sizes.reverse() | 141 exon_sizes.reverse() |
| 93 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | 142 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] |
| 143 if debug: | |
| 144 print >> sys.stderr, "splice_sites: %s" % splice_sites | |
| 94 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | 145 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] |
| 95 if seq: | 146 if seq: |
| 96 for i in range(3): | 147 for i in range(3): |
| 97 translation = self.get_translation(sequence=seq[i:]) | 148 translation = self.get_translation(sequence=seq[i:]) |
| 98 if translation: | 149 if translation: |
| 99 tstart = 0 | 150 tstart = 0 |
| 100 tstop = len(translation) | 151 tstop = len(translation) |
| 152 offset = (block_sum - i) % 3 | |
| 153 if debug: | |
| 154 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,translation) | |
| 101 if not untrimmed: | 155 if not untrimmed: |
| 102 tstart = translation.rfind('*',0,junc) + 1 | 156 tstart = translation.rfind('*',0,junc) + 1 |
| 103 stop = translation.find('*',junc) | 157 stop = translation.find('*',junc) |
| 104 tstop = stop if stop >= 0 else len(translation) | 158 tstop = stop if stop >= 0 else len(translation) |
| 159 offset = (block_sum - i) % 3 | |
| 160 trimmed = translation[tstart:tstop] | |
| 161 if debug: | |
| 162 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,trimmed) | |
| 105 if filtering and tstart > ignore: | 163 if filtering and tstart > ignore: |
| 106 continue | 164 continue |
| 107 trimmed = translation[tstart:tstop] | |
| 108 #get genomic locations for start and end | 165 #get genomic locations for start and end |
| 109 offset = (block_sum - i) % 3 | |
| 110 if self.strand == '+': | 166 if self.strand == '+': |
| 111 chromStart = self.chromStart + i + (tstart * 3) | 167 chromStart = self.chromStart + i + (tstart * 3) |
| 112 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 | 168 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 |
| 113 else: | 169 else: |
| 114 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 | 170 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 |
| 115 chromEnd = self.chromEnd - i - (tstart * 3) | 171 chromEnd = self.chromEnd - i - (tstart * 3) |
| 116 translations[i] = [chromStart,chromEnd,trimmed] | 172 #get the blocks for this translation |
| 173 (tblockCount,tblockSizes,tblockStarts) = self.get_blocks(chromStart,chromEnd) | |
| 174 translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts) | |
| 175 if debug: | |
| 176 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) | |
| 177 # translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts) | |
| 117 return translations | 178 return translations |
| 118 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): | 179 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): |
| 119 ## Ensembl fasta ID format | 180 ## Ensembl fasta ID format |
| 120 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | 181 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT |
| 121 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding | 182 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding |
| 158 #Parse Command Line | 219 #Parse Command Line |
| 159 parser = optparse.OptionParser() | 220 parser = optparse.OptionParser() |
| 160 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) | 221 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) |
| 161 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') | 222 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') |
| 162 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) | 223 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) |
| 224 parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' ) | |
| 225 parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' ) | |
| 226 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' ) | |
| 227 parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' ) | |
| 163 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) | 228 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) |
| 229 parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' ) | |
| 164 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) | 230 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) |
| 231 parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' ) | |
| 165 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) | 232 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) |
| 166 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) | 233 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) |
| 167 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) | 234 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) |
| 168 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) | 235 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) |
| 169 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) | 236 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) |
| 180 print >> sys.stderr, "failed: %s" % e | 247 print >> sys.stderr, "failed: %s" % e |
| 181 exit(2) | 248 exit(2) |
| 182 else: | 249 else: |
| 183 inputFile = sys.stdin | 250 inputFile = sys.stdin |
| 184 # Output files | 251 # Output files |
| 252 bed_fh = None | |
| 253 gff_fh = None | |
| 254 gff_fa_file = None | |
| 255 gff_fa = None | |
| 185 outFile = None | 256 outFile = None |
| 186 if options.output == None: | 257 if options.output == None: |
| 187 #write to stdout | 258 #write to stdout |
| 188 outFile = sys.stdout | 259 outFile = sys.stdout |
| 260 if options.gff: | |
| 261 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_',suffix=".fa",dir=os.getcwd()).name | |
| 262 gff_fa = open(gff_fa_file,'w') | |
| 189 else: | 263 else: |
| 190 try: | 264 try: |
| 191 outPath = os.path.abspath(options.output) | 265 outPath = os.path.abspath(options.output) |
| 192 outFile = open(outPath, 'w') | 266 outFile = open(outPath, 'w') |
| 193 except Exception, e: | 267 except Exception, e: |
| 194 print >> sys.stderr, "failed: %s" % e | 268 print >> sys.stderr, "failed: %s" % e |
| 195 exit(3) | 269 exit(3) |
| 270 if options.gff: | |
| 271 gff_fa_file = outPath | |
| 272 if options.bed: | |
| 273 bed_fh = open(options.bed,'w') | |
| 274 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations','test')) | |
| 275 if options.gff: | |
| 276 gff_fh = open(options.gff,'w') | |
| 277 gff_fh.write("##gff-version 3.2.1\n") | |
| 278 if options.reference: | |
| 279 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference)) | |
| 196 leading_bp = 0 | 280 leading_bp = 0 |
| 197 trailing_bp = 0 | 281 trailing_bp = 0 |
| 198 if options.leading_bp: | 282 if options.leading_bp: |
| 199 if options.leading_bp >= 0: | 283 if options.leading_bp >= 0: |
| 200 leading_bp = options.leading_bp | 284 leading_bp = options.leading_bp |
| 231 print >> sys.stderr, "" | 315 print >> sys.stderr, "" |
| 232 if options.bed_format: | 316 if options.bed_format: |
| 233 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) | 317 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) |
| 234 outFile.write(tx_entry) | 318 outFile.write(tx_entry) |
| 235 else: | 319 else: |
| 236 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp) | 320 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp,debug=options.debug) |
| 237 for i,tx in enumerate(translations): | 321 for i,tx in enumerate(translations): |
| 238 if tx: | 322 if tx: |
| 239 (chromStart,chromEnd,translation) = tx | 323 (chromStart,chromEnd,translation,blockCount,blockSizes,blockStarts) = tx |
| 240 if options.min_length != None and len(translation) < options.min_length: | 324 if options.min_length != None and len(translation) < options.min_length: |
| 241 continue | 325 continue |
| 242 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: | 326 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: |
| 243 continue | 327 continue |
| 244 frame_name = '_%s' % (i + 1) | 328 frame_name = '_%s' % (i + 1) |
| 329 pep_id = "%s%s%s" % (options.id_prefix,entry.name,frame_name) | |
| 330 if bed_fh: | |
| 331 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom),chromStart,chromEnd,pep_id,entry.score,entry.strand,chromStart,chromEnd,entry.itemRgb,blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]),translation)) | |
| 245 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) | 332 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) |
| 333 if blockCount: | |
| 334 location += " blockCount:%d blockSizes:%s blockStarts:%s" % (blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts])) | |
| 246 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' | 335 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' |
| 247 seq_id = "%s%s %s %s%s" % (entry.name,frame_name,options.seqtype,location, score) | 336 seq_description = "%s %s%s" % (options.seqtype, location, score) |
| 248 outFile.write(">%s\n" % seq_id) | 337 seq_id = "%s " % pep_id |
| 249 outFile.write(translation) | 338 if options.fa_db: |
| 250 outFile.write('\n') | 339 seq_id = "%s%s%s%s" % (options.fa_db,options.fa_sep,pep_id,options.fa_sep) |
| 340 fa_id = "%s%s" % (seq_id,seq_description) | |
| 341 fa_entry = ">%s\n%s\n" % (fa_id,translation) | |
| 342 outFile.write(fa_entry) | |
| 343 if gff_fh: | |
| 344 if gff_fa: | |
| 345 gff_fa.write(fa_entry) | |
| 346 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom,chromStart + 1,chromEnd - 1)) | |
| 347 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom,'splice_junc','gene',chromStart + 1,chromEnd - 1,entry.score,entry.strand,0,pep_id)) | |
| 348 for x in range(blockCount): | |
| 349 start = chromStart+blockStarts[x] + 1 | |
| 350 end = start + blockSizes[x] - 1 | |
| 351 phase = (3 - sum(blockSizes[:x]) % 3) % 3 | |
| 352 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom,'splice_junc','CDS',start,end,entry.score,entry.strand,phase,pep_id,pep_id,x)) | |
| 353 """ | |
| 354 ##gff-version 3 | |
| 355 ##sequence-region 19 1 287484 | |
| 356 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ | |
| 357 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
| 358 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
| 359 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
| 360 """ | |
| 361 if bed_fh: | |
| 362 bed_fh.close() | |
| 363 if gff_fh: | |
| 364 if gff_fa: | |
| 365 gff_fa.close() | |
| 366 else: | |
| 367 outFile.close() | |
| 368 gff_fa = open(gff_fa_file,'r') | |
| 369 gff_fh.write("##FASTA\n") | |
| 370 for i, line in enumerate(gff_fa): | |
| 371 gff_fh.write(line) | |
| 372 gff_fh.close() | |
| 251 except Exception, e: | 373 except Exception, e: |
| 252 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) | 374 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) |
| 253 | 375 |
| 254 if __name__ == "__main__" : __main__() | 376 if __name__ == "__main__" : __main__() |
| 255 | 377 |
