Mercurial > repos > jjohnson > ensembl_variant_report
comparison ensembl_variant_report.py @ 5:9e83cc05d384 draft
Uploaded
| author | jjohnson |
|---|---|
| date | Mon, 06 Feb 2017 09:25:43 -0500 |
| parents | c3a9e63e8c51 |
| children | d9fad18ffdb1 |
comparison
equal
deleted
inserted
replaced
| 4:908c68e3c73f | 5:9e83cc05d384 |
|---|---|
| 90 print >> sys.stderr, "%d: %s\n" % (linenum,line) | 90 print >> sys.stderr, "%d: %s\n" % (linenum,line) |
| 91 continue | 91 continue |
| 92 chrom = fields[ci] | 92 chrom = fields[ci] |
| 93 pos = int(fields[pi]) | 93 pos = int(fields[pi]) |
| 94 ref = fields[ri] | 94 ref = fields[ri] |
| 95 alt = fields[ai] | 95 alts = fields[ai] |
| 96 dp = int(fields[di]) | 96 dp = int(fields[di]) |
| 97 dpr = [int(x) for x in fields[fi].split(',')] | 97 dpr = [int(x) for x in fields[fi].split(',')] |
| 98 yield (transcript,pos,ref,alt,dp,dpr) | 98 for i,alt in enumerate(alts.split(',')): |
| 99 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None | |
| 100 yield (transcript,pos,ref,alt,dp,freq) | |
| 99 | 101 |
| 100 def parse_snpeff_vcf(): | 102 def parse_snpeff_vcf(): |
| 101 for linenum,line in enumerate(inputFile): | 103 for linenum,line in enumerate(inputFile): |
| 102 if line.startswith('##'): | 104 if line.startswith('##'): |
| 103 if line.find('SnpEffVersion=') > 0: | 105 if line.find('SnpEffVersion=') > 0: |
| 106 pass | 108 pass |
| 107 else: | 109 else: |
| 108 fields = line.strip('\r\n').split('\t') | 110 fields = line.strip('\r\n').split('\t') |
| 109 if options.debug: print >> sys.stderr, "\n%s" % (fields) | 111 if options.debug: print >> sys.stderr, "\n%s" % (fields) |
| 110 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] | 112 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] |
| 113 alt_list = alts.split(',') | |
| 111 pos = int(pos) | 114 pos = int(pos) |
| 112 qual = float(qual) | 115 qual = float(qual) |
| 113 for info_item in info.split(';'): | 116 for info_item in info.split(';'): |
| 114 if info_item.find('=') < 0: continue | 117 if info_item.find('=') < 0: continue |
| 115 (key, val) = info_item.split('=', 1) | 118 (key, val) = info_item.split('=', 1) |
| 123 if key == 'ANN': | 126 if key == 'ANN': |
| 124 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') | 127 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') |
| 125 elif key == 'EFF': | 128 elif key == 'EFF': |
| 126 (eff, effs) = effect.rstrip(')').split('(') | 129 (eff, effs) = effect.rstrip(')').split('(') |
| 127 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] | 130 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] |
| 128 yield (transcript,pos,ref,alt,dp,dpr) | 131 i = alt_list.index(alt) if alt in alt_list else 0 |
| 132 freq = float(dpr[i+1])/float(sum(dpr)) if dpr else None | |
| 133 yield (transcript,pos,ref,alt,dp,freq) | |
| 129 | 134 |
| 130 | 135 |
| 131 #Process gene model | 136 #Process gene model |
| 132 ens_ref = None | 137 ens_ref = None |
| 133 if options.gene_model != None: | 138 if options.gene_model != None: |
| 142 except Exception, e: | 147 except Exception, e: |
| 143 print >> sys.stderr, "Parsing gene model failed: %s" % e | 148 print >> sys.stderr, "Parsing gene model failed: %s" % e |
| 144 exit(2) | 149 exit(2) |
| 145 try: | 150 try: |
| 146 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf | 151 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf |
| 147 for tid,pos1,ref,alt,dp,dpr in parse_input(): | 152 for tid,pos1,ref,alt,dp,freq in parse_input(): |
| 148 freq = float(dpr[1])/float(sum(dpr)) if dpr else None | 153 if not tid: |
| 154 continue | |
| 149 if options.min_depth and dp is not None and dp < options.min_depth: | 155 if options.min_depth and dp is not None and dp < options.min_depth: |
| 150 continue | 156 continue |
| 151 if options.min_freq and freq is not None and freq < options.min_freq: | 157 if options.min_freq and freq is not None and freq < options.min_freq: |
| 152 continue | 158 continue |
| 153 ## transcript_id, pos, ref, alt, dp, dpr | 159 ## transcript_id, pos, ref, alt, dp, dpr |
| 154 tx = ens_ref.get_gtf_transcript(tid) | 160 tx = ens_ref.get_gtf_transcript(tid) |
| 161 if not tx: | |
| 162 continue | |
| 155 coding = ens_ref.transcript_is_coding(tid) | 163 coding = ens_ref.transcript_is_coding(tid) |
| 156 if not coding: | 164 if not coding: |
| 157 continue | 165 continue |
| 158 frame_shift = len(ref) != len(alt) | 166 frame_shift = len(ref) != len(alt) |
| 159 cds = ens_ref.get_cds(tid) | 167 cds = ens_ref.get_cds(tid) |
| 160 pos0 = pos1 - 1 # zero based position | 168 pos0 = pos1 - 1 # zero based position |
| 161 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 | 169 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 |
| 162 alt_seq = alt if tx.gene.strand else reverse_complement(alt) | 170 alt_seq = alt if tx.gene.strand else reverse_complement(alt) |
| 171 ref_seq = ref if tx.gene.strand else reverse_complement(ref) | |
| 163 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) | 172 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) |
| 164 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] | 173 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] if cds_pos+len(ref) < len(cds) else '' |
| 165 offset = 0 | 174 offset = 0 |
| 166 if tx.gene.strand: | 175 if tx.gene.strand: |
| 167 for i in range(min(len(ref),len(alt))): | 176 for i in range(min(len(ref),len(alt))): |
| 168 if ref[i] == alt[i]: | 177 if ref[i] == alt[i]: |
| 169 offset = i | 178 offset = i |
| 178 refpep = translate(cds[:len(cds)/3*3]) | 187 refpep = translate(cds[:len(cds)/3*3]) |
| 179 pep = translate(alt_cds[:len(alt_cds)/3*3]) | 188 pep = translate(alt_cds[:len(alt_cds)/3*3]) |
| 180 peplen = len(pep) | 189 peplen = len(pep) |
| 181 aa_pos = (cds_pos + offset) / 3 | 190 aa_pos = (cds_pos + offset) / 3 |
| 182 if aa_pos >= len(pep): | 191 if aa_pos >= len(pep): |
| 183 print >> sys.stderr, "%d: %s\n" % (linenum,line) | 192 print >> sys.stderr, "aa_pos %d >= peptide length %d : %s %d %s %s\n" % (aa_pos,len(pep),tid,pos1,ref,alt) |
| 184 continue | 193 continue |
| 185 if frame_shift: | 194 if frame_shift: |
| 186 #find stop_codons | 195 #find stop_codons |
| 187 nstops = 0 | 196 nstops = 0 |
| 188 stop_codons = [] | 197 stop_codons = [] |
| 210 alt_codon = alt_cds[cs_pos:cs_pos+3] | 219 alt_codon = alt_cds[cs_pos:cs_pos+3] |
| 211 alt_aa = translate(alt_codon) | 220 alt_aa = translate(alt_codon) |
| 212 gene = tx.gene.names[0] | 221 gene = tx.gene.names[0] |
| 213 report_fields = [tx.gene.names[0], | 222 report_fields = [tx.gene.names[0], |
| 214 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | 223 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), |
| 215 ref, | 224 ref_seq, |
| 216 alt, | 225 alt_seq, |
| 217 "%1.2f" % freq if freq is not None else '', | 226 "%1.2f" % freq if freq is not None else '', |
| 218 str(dp), | 227 str(dp), |
| 219 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | 228 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), |
| 220 "%d" % (aa_pos + 1), | 229 "%d" % (aa_pos + 1), |
| 221 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | 230 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), |
