Mercurial > repos > jjohnson > ensembl_variant_report
comparison ensembl_variant_report.py @ 7:d5cb252c68da draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit b62e927469f96a857e880c77530c50c885ad92fc-dirty
| author | jjohnson |
|---|---|
| date | Thu, 14 Jun 2018 18:07:10 -0400 |
| parents | d9fad18ffdb1 |
| children | fd612f8119a2 |
comparison
equal
deleted
inserted
replaced
| 6:d9fad18ffdb1 | 7:d5cb252c68da |
|---|---|
| 150 print >> sys.stderr, "Parsing gene model failed: %s" % e | 150 print >> sys.stderr, "Parsing gene model failed: %s" % e |
| 151 exit(2) | 151 exit(2) |
| 152 try: | 152 try: |
| 153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf | 153 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf |
| 154 for tid,pos1,ref,alt,dp,freq in parse_input(): | 154 for tid,pos1,ref,alt,dp,freq in parse_input(): |
| 155 if options.debug: print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f" % (tid,pos1,ref,alt,dp,freq) | |
| 155 if not tid: | 156 if not tid: |
| 156 continue | 157 continue |
| 157 if options.min_depth and dp is not None and dp < options.min_depth: | 158 if options.min_depth and dp is not None and dp < options.min_depth: |
| 158 continue | 159 continue |
| 159 if options.min_freq and freq is not None and freq < options.min_freq: | 160 if options.min_freq and freq is not None and freq < options.min_freq: |
| 160 continue | 161 continue |
| 161 ## transcript_id, pos, ref, alt, dp, dpr | 162 try: |
| 162 tx = ens_ref.get_gtf_transcript(tid) | 163 ## transcript_id, pos, ref, alt, dp, dpr |
| 163 if not tx: | 164 tx = ens_ref.get_gtf_transcript(tid) |
| 164 continue | 165 if not tx and tid.find('.') > 0: |
| 165 coding = ens_ref.transcript_is_coding(tid) | 166 tid = tid.split('.')[0] |
| 166 if not coding: | 167 tx = ens_ref.get_gtf_transcript(tid) |
| 167 continue | 168 if not tx: |
| 168 frame_shift = len(ref) != len(alt) | 169 continue |
| 169 cds = ens_ref.get_cds(tid) | 170 if options.debug: print >> sys.stderr, "%s" % (tx) |
| 170 pos0 = pos1 - 1 # zero based position | 171 coding = ens_ref.transcript_is_coding(tid) |
| 171 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 | 172 if not coding: |
| 172 alt_seq = alt if tx.gene.strand else reverse_complement(alt) | 173 continue |
| 173 ref_seq = ref if tx.gene.strand else reverse_complement(ref) | 174 frame_shift = len(ref) != len(alt) |
| 174 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) | 175 cds = ens_ref.get_cds(tid) |
| 175 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' | 176 if options.debug or not cds: print >> sys.stderr, "cds:\n%s" % (cds) |
| 176 offset = 0 | 177 pos0 = pos1 - 1 # zero based position |
| 177 if tx.gene.strand: | 178 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 |
| 178 for i in range(min(len(ref),len(alt))): | 179 alt_seq = alt if tx.gene.strand else reverse_complement(alt) |
| 179 if ref[i] == alt[i]: | 180 ref_seq = ref if tx.gene.strand else reverse_complement(ref) |
| 180 offset = i | 181 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) |
| 181 else: | 182 if options.debug: print >> sys.stderr, "cds_pos: %s" % (str(cds_pos)) |
| 182 break | 183 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' |
| 183 else: | 184 offset = 0 |
| 184 for i in range(-1,-min(len(ref),len(alt)) -1,-1): | 185 if tx.gene.strand: |
| 185 if ref[i] == alt[i]: | 186 for i in range(min(len(ref),len(alt))): |
| 186 offset = i | 187 if ref[i] == alt[i]: |
| 187 else: | 188 offset = i |
| 188 break | 189 else: |
| 189 refpep = translate(cds[:len(cds)/3*3]) | |
| 190 pep = translate(alt_cds[:len(alt_cds)/3*3]) | |
| 191 peplen = len(pep) | |
| 192 aa_pos = (cds_pos + offset) / 3 | |
| 193 if aa_pos >= len(pep): | |
| 194 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) | |
| 195 continue | |
| 196 if frame_shift: | |
| 197 #find stop_codons | |
| 198 nstops = 0 | |
| 199 stop_codons = [] | |
| 200 for i in range(aa_pos,peplen): | |
| 201 if refpep[i] != pep[i]: | |
| 202 aa_pos = i | |
| 203 break | |
| 204 for i in range(aa_pos,peplen): | |
| 205 if pep[i] == '*': | |
| 206 nstops += 1 | |
| 207 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else '')) | |
| 208 if nstops > options.readthrough: | |
| 209 reported_peptide = pep[aa_pos:i+1] | |
| 210 reported_stop_codon = ','.join(stop_codons) | |
| 211 break | 190 break |
| 212 else: | 191 else: |
| 213 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) | 192 for i in range(-1,-min(len(ref),len(alt)) -1,-1): |
| 214 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], | 193 if ref[i] == alt[i]: |
| 215 pep[aa_pos], | 194 offset = i |
| 216 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) | 195 else: |
| 217 cs_pos = aa_pos * 3 | 196 break |
| 218 aa_pos = (cds_pos + offset) / 3 | 197 refpep = translate(cds[:len(cds)/3*3]) |
| 219 ref_codon = cds[cs_pos:cs_pos+3] | 198 pep = translate(alt_cds[:len(alt_cds)/3*3]) |
| 220 ref_aa = translate(ref_codon) | 199 peplen = len(pep) |
| 221 alt_codon = alt_cds[cs_pos:cs_pos+3] | 200 aa_pos = (cds_pos + offset) / 3 |
| 222 alt_aa = translate(alt_codon) | 201 reported_stop_codon = '' |
| 223 gene = tx.gene.names[0] | 202 if aa_pos >= len(pep): |
| 224 report_fields = [tx.gene.names[0], | 203 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s" % (aa_pos,len(pep),tid,pos1,ref,alt) |
| 225 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | 204 continue |
| 226 ref_seq, | 205 if frame_shift: |
| 227 alt_seq, | 206 #find stop_codons |
| 228 "%1.2f" % freq if freq is not None else '', | 207 nstops = 0 |
| 229 str(dp), | 208 stop_codons = [] |
| 230 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | 209 for i in range(aa_pos,peplen): |
| 231 "%d" % (aa_pos + 1), | 210 if refpep[i] != pep[i]: |
| 232 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | 211 aa_pos = i |
| 233 "%d" % len(pep), | 212 break |
| 234 reported_stop_codon, | 213 reported_peptide = pep[aa_pos:] |
| 235 reported_peptide | 214 for i in range(aa_pos,peplen): |
| 236 ] | 215 if pep[i] == '*': |
| 237 if options.debug: | 216 nstops += 1 |
| 238 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) | 217 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else '')) |
| 239 outputFile.write('\t'.join(report_fields)) | 218 if nstops > options.readthrough: |
| 240 if options.debug: | 219 reported_peptide = pep[aa_pos:i+1] |
| 241 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( | 220 reported_stop_codon = ','.join(stop_codons) |
| 242 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], | 221 break |
| 243 translate(cds[cs_pos-6:cs_pos+15]), | 222 else: |
| 244 translate(alt_cds[cs_pos-6:cs_pos+15]), | 223 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) |
| 245 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) | 224 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], |
| 246 outputFile.write('\n') | 225 pep[aa_pos], |
| 226 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) | |
| 227 cs_pos = aa_pos * 3 | |
| 228 aa_pos = (cds_pos + offset) / 3 | |
| 229 ref_codon = cds[cs_pos:cs_pos+3] | |
| 230 ref_aa = translate(ref_codon) | |
| 231 alt_codon = alt_cds[cs_pos:cs_pos+3] | |
| 232 alt_aa = translate(alt_codon) | |
| 233 gene = tx.gene.names[0] | |
| 234 report_fields = [tx.gene.names[0], | |
| 235 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | |
| 236 ref_seq, | |
| 237 alt_seq, | |
| 238 "%1.2f" % freq if freq is not None else '', | |
| 239 str(dp), | |
| 240 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | |
| 241 "%d" % (aa_pos + 1), | |
| 242 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | |
| 243 "%d" % len(pep), | |
| 244 reported_stop_codon, | |
| 245 reported_peptide, | |
| 246 tx.get_transcript_type_name() | |
| 247 ] | |
| 248 if options.debug: | |
| 249 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) | |
| 250 outputFile.write('\t'.join(report_fields)) | |
| 251 if options.debug: | |
| 252 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( | |
| 253 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], | |
| 254 translate(cds[cs_pos-6:cs_pos+15]), | |
| 255 translate(alt_cds[cs_pos-6:cs_pos+15]), | |
| 256 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) | |
| 257 outputFile.write('\n') | |
| 258 except Exception, e: | |
| 259 print >> sys.stderr, "failed: %s" % e | |
| 260 print >> sys.stderr, "%s\t%d\t%s\t%s\t%d\t%f\t%s" % (tid,pos1,ref,alt,dp,freq,e) | |
| 247 except Exception, e: | 261 except Exception, e: |
| 248 print >> sys.stderr, "failed: %s" % e | 262 print >> sys.stderr, "failed: %s" % e |
| 249 exit(1) | 263 exit(1) |
| 250 | 264 |
| 251 | 265 |
