Mercurial > repos > jjohnson > ensembl_variant_report
comparison ensembl_variant_report.py @ 0:c3a9e63e8c51 draft
planemo upload for repository https://github.com/jj-umn/galaxytools/tree/master/ensembl_variant_report commit 239c1ee096e5fc3e2e929f7bf2d4afba5c677d4b-dirty
| author | jjohnson |
|---|---|
| date | Fri, 06 Jan 2017 16:19:40 -0500 |
| parents | |
| children | 9e83cc05d384 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c3a9e63e8c51 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Report variants to Ensembl Transcripts | |
| 4 | |
| 5 FrameShift report line | |
| 6 # Gene Variant Position Reference Variation Prevalence Sequencing Depth Transcript AA Position AA change AA Length Stop Codon Stop Region AA Variation | |
| 7 TIGD6 chr5:149374879 AAG AG 1.00 13 ENSG00000164296|ENST00000296736 345 Q344 522 A-TGA-T KRWTSSRPST* | |
| 8 | |
| 9 MissSense report line | |
| 10 # Gene Variant Position Reference Variation Prevalence Sequencing Depth Transcript AA Position AA change AA Length Stop Codon Stop Region AA Variation | |
| 11 FN1 chr2:216235089 G A 1.00 7394 ENSG00000115414|ENST00000354785 2261 V2261I 2478 G-TAA TGLTRGATYN_I_IVEALKDQQR | |
| 12 """ | |
| 13 import sys | |
| 14 import os.path | |
| 15 import re | |
| 16 import time | |
| 17 import optparse | |
| 18 from ensemblref import EnsemblRef | |
| 19 from Bio.Seq import reverse_complement, translate | |
| 20 | |
| 21 | |
| 22 def __main__(): | |
| 23 #Parse Command Line | |
| 24 parser = optparse.OptionParser() | |
| 25 #I/O | |
| 26 parser.add_option( '-i', '--input', dest='input', default=None, help='Tabular file with peptide_sequence column' ) | |
| 27 parser.add_option( '-s', '--format', dest='format', default='tabular', choices=['tabular','snpeff'], help='Tabular file with peptide_sequence column' ) | |
| 28 #Columns for tabular input | |
| 29 parser.add_option( '-C', '--chrom_column', type='int', dest='chrom_column', default=1, help='column ordinal with Ensembl transctip ID' ) | |
| 30 parser.add_option( '-P', '--pos_column', type='int', dest='pos_column', default=2, help='column ordinal with Ensembl transctip ID' ) | |
| 31 parser.add_option( '-R', '--ref_column', type='int', dest='ref_column', default=3, help='column ordinal with Ensembl transctip ID' ) | |
| 32 parser.add_option( '-A', '--alt_column', type='int', dest='alt_column', default=4, help='column ordinal with Ensembl transctip ID' ) | |
| 33 parser.add_option( '-T', '--transcript_column', type='int', dest='transcript_column', default=1, help='column ordinal with Ensembl transctip ID' ) | |
| 34 parser.add_option( '-F', '--dpr_column', type='int', dest='dpr_column', default=1, help='column with VCF: DPR or AD' ) | |
| 35 parser.add_option( '-D', '--dp_column', type='int', dest='dp_column', default=1, help='column with VCF: DP' ) | |
| 36 parser.add_option( '-g', '--gene_model', dest='gene_model', default=None, help='GTF gene model file. Used to annotate NSJ peptide entries.') | |
| 37 parser.add_option( '-2', '--twobit', dest='twobit', default=None, help='Reference genome in UCSC twobit format') | |
| 38 #Output file | |
| 39 parser.add_option( '-o', '--output', dest='output', default=None, help='The output report (else write to stdout)' ) | |
| 40 #filters | |
| 41 parser.add_option( '-d', '--min_depth', type='int', dest='min_depth', default=None, help='Minimum read depth to report' ) | |
| 42 parser.add_option( '-f', '--min_freq', type='float', dest='min_freq', default=None, help='Minimum variant frequency to report' ) | |
| 43 #peptide options | |
| 44 parser.add_option( '-l', '--leading_aa', type='int', dest='leading_aa', default=10, help='Number AAs before missense variant' ) | |
| 45 parser.add_option( '-t', '--trailing_aa', type='int', dest='trailing_aa', default=10, help='Number AAs after missense variant' ) | |
| 46 parser.add_option( '-r', '--readthrough', type='int', dest='readthrough', default=0, help='' ) | |
| 47 # | |
| 48 parser.add_option('--debug', dest='debug', action='store_true', default=False, help='Print debugging messages') | |
| 49 (options, args) = parser.parse_args() | |
| 50 | |
| 51 ##INPUTS## | |
| 52 if options.input != None: | |
| 53 try: | |
| 54 inputPath = os.path.abspath(options.input) | |
| 55 inputFile = open(inputPath, 'r') | |
| 56 except Exception, e: | |
| 57 print >> sys.stderr, "failed: %s" % e | |
| 58 exit(2) | |
| 59 else: | |
| 60 inputFile = sys.stdin | |
| 61 | |
| 62 if options.output != None: | |
| 63 try: | |
| 64 outputPath = os.path.abspath(options.output) | |
| 65 outputFile = open(outputPath, 'w') | |
| 66 except Exception, e: | |
| 67 print >> sys.stderr, "failed: %s" % e | |
| 68 exit(3) | |
| 69 else: | |
| 70 outputFile = sys.stdout | |
| 71 | |
| 72 def parse_tabular(): | |
| 73 ci = options.chrom_column - 1 | |
| 74 pi = options.pos_column - 1 | |
| 75 ri = options.ref_column - 1 | |
| 76 ai = options.alt_column - 1 | |
| 77 ti = options.transcript_column - 1 | |
| 78 di = options.dp_column - 1 | |
| 79 fi = options.dpr_column - 1 | |
| 80 for linenum,line in enumerate(inputFile): | |
| 81 if options.debug: | |
| 82 print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
| 83 if line.startswith('#'): | |
| 84 continue | |
| 85 if line.strip() == '': | |
| 86 continue | |
| 87 fields = line.rstrip('\r\n').split('\t') | |
| 88 transcript = fields[ti] | |
| 89 if not transcript: | |
| 90 print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
| 91 continue | |
| 92 chrom = fields[ci] | |
| 93 pos = int(fields[pi]) | |
| 94 ref = fields[ri] | |
| 95 alt = fields[ai] | |
| 96 dp = int(fields[di]) | |
| 97 dpr = [int(x) for x in fields[fi].split(',')] | |
| 98 yield (transcript,pos,ref,alt,dp,dpr) | |
| 99 | |
| 100 def parse_snpeff_vcf(): | |
| 101 for linenum,line in enumerate(inputFile): | |
| 102 if line.startswith('##'): | |
| 103 if line.find('SnpEffVersion=') > 0: | |
| 104 SnpEffVersion = re.search('SnpEffVersion="?(\d+\.\d+)',line).groups()[0] | |
| 105 elif line.startswith('#CHROM'): | |
| 106 pass | |
| 107 else: | |
| 108 fields = line.strip('\r\n').split('\t') | |
| 109 if options.debug: print >> sys.stderr, "\n%s" % (fields) | |
| 110 (chrom, pos, id, ref, alts, qual, filter, info) = fields[0:8] | |
| 111 pos = int(pos) | |
| 112 qual = float(qual) | |
| 113 for info_item in info.split(';'): | |
| 114 if info_item.find('=') < 0: continue | |
| 115 (key, val) = info_item.split('=', 1) | |
| 116 if key == 'DP': | |
| 117 dp = int(val) | |
| 118 if key == 'DPR': | |
| 119 dpr = dpr = [int(x) for x in val.split(',')] | |
| 120 if key in ['EFF','ANN']: | |
| 121 for effect in val.split(','): | |
| 122 if options.debug: print >> sys.stderr, "\n%s" % (effect.split('|')) | |
| 123 if key == 'ANN': | |
| 124 (alt,eff,impact,gene_name,gene_id,feature_type,transcript,biotype,exon,c_hgvs,p_hgvs,cdna,cds,aa,distance,info) = effect.split('|') | |
| 125 elif key == 'EFF': | |
| 126 (eff, effs) = effect.rstrip(')').split('(') | |
| 127 (impact, functional_class, codon_change, aa_change, aa_len, gene_name, biotype, coding, transcript, exon, alt) = effs.split('|')[0:11] | |
| 128 yield (transcript,pos,ref,alt,dp,dpr) | |
| 129 | |
| 130 | |
| 131 #Process gene model | |
| 132 ens_ref = None | |
| 133 if options.gene_model != None: | |
| 134 try: | |
| 135 geneModelFile = os.path.abspath(options.gene_model) | |
| 136 twoBitFile = os.path.abspath(options.twobit) | |
| 137 print >> sys.stderr, "Parsing ensembl ref: %s %s" % (options.gene_model,options.twobit) | |
| 138 time1 = time.time() | |
| 139 ens_ref = EnsemblRef(geneModelFile,twoBitFile) | |
| 140 time2 = time.time() | |
| 141 print >> sys.stderr, "Parsing ensembl ref: %d seconds" % (int(time2-time1)) | |
| 142 except Exception, e: | |
| 143 print >> sys.stderr, "Parsing gene model failed: %s" % e | |
| 144 exit(2) | |
| 145 try: | |
| 146 parse_input = parse_tabular if options.format == 'tabular' else parse_snpeff_vcf | |
| 147 for tid,pos1,ref,alt,dp,dpr in parse_input(): | |
| 148 freq = float(dpr[1])/float(sum(dpr)) if dpr else None | |
| 149 if options.min_depth and dp is not None and dp < options.min_depth: | |
| 150 continue | |
| 151 if options.min_freq and freq is not None and freq < options.min_freq: | |
| 152 continue | |
| 153 ## transcript_id, pos, ref, alt, dp, dpr | |
| 154 tx = ens_ref.get_gtf_transcript(tid) | |
| 155 coding = ens_ref.transcript_is_coding(tid) | |
| 156 if not coding: | |
| 157 continue | |
| 158 frame_shift = len(ref) != len(alt) | |
| 159 cds = ens_ref.get_cds(tid) | |
| 160 pos0 = pos1 - 1 # zero based position | |
| 161 spos = pos0 if tx.gene.strand else pos0 + len(ref) - 1 | |
| 162 alt_seq = alt if tx.gene.strand else reverse_complement(alt) | |
| 163 cds_pos = ens_ref.genome_to_cds_pos(tid, spos) | |
| 164 alt_cds = cds[:cds_pos] + alt_seq + cds[cds_pos+len(ref):] | |
| 165 offset = 0 | |
| 166 if tx.gene.strand: | |
| 167 for i in range(min(len(ref),len(alt))): | |
| 168 if ref[i] == alt[i]: | |
| 169 offset = i | |
| 170 else: | |
| 171 break | |
| 172 else: | |
| 173 for i in range(-1,-min(len(ref),len(alt)) -1,-1): | |
| 174 if ref[i] == alt[i]: | |
| 175 offset = i | |
| 176 else: | |
| 177 break | |
| 178 refpep = translate(cds[:len(cds)/3*3]) | |
| 179 pep = translate(alt_cds[:len(alt_cds)/3*3]) | |
| 180 peplen = len(pep) | |
| 181 aa_pos = (cds_pos + offset) / 3 | |
| 182 if aa_pos >= len(pep): | |
| 183 print >> sys.stderr, "%d: %s\n" % (linenum,line) | |
| 184 continue | |
| 185 if frame_shift: | |
| 186 #find stop_codons | |
| 187 nstops = 0 | |
| 188 stop_codons = [] | |
| 189 for i in range(aa_pos,peplen): | |
| 190 if refpep[i] != pep[i]: | |
| 191 aa_pos = i | |
| 192 break | |
| 193 for i in range(aa_pos,peplen): | |
| 194 if pep[i] == '*': | |
| 195 nstops += 1 | |
| 196 stop_codons.append("%s-%s%s" % (alt_cds[i*3-1],alt_cds[i*3:i*3+3],"-%s" % alt_cds[i*3+4] if len(alt_cds) > i*3 else '')) | |
| 197 if nstops > options.readthrough: | |
| 198 reported_peptide = pep[aa_pos:i+1] | |
| 199 reported_stop_codon = ','.join(stop_codons) | |
| 200 break | |
| 201 else: | |
| 202 reported_stop_codon = "%s-%s" % (alt_cds[peplen*3-4],alt_cds[peplen*3-3:peplen*3]) | |
| 203 reported_peptide = "%s_%s_%s" % (pep[max(aa_pos-options.leading_aa,0):aa_pos], | |
| 204 pep[aa_pos], | |
| 205 pep[aa_pos+1:min(aa_pos+1+options.trailing_aa,len(pep))]) | |
| 206 cs_pos = aa_pos * 3 | |
| 207 aa_pos = (cds_pos + offset) / 3 | |
| 208 ref_codon = cds[cs_pos:cs_pos+3] | |
| 209 ref_aa = translate(ref_codon) | |
| 210 alt_codon = alt_cds[cs_pos:cs_pos+3] | |
| 211 alt_aa = translate(alt_codon) | |
| 212 gene = tx.gene.names[0] | |
| 213 report_fields = [tx.gene.names[0], | |
| 214 '%s:%d %s' % (tx.gene.contig,pos1,'+' if tx.gene.strand else '-'), | |
| 215 ref, | |
| 216 alt, | |
| 217 "%1.2f" % freq if freq is not None else '', | |
| 218 str(dp), | |
| 219 "%s|%s" % (tx.gene.gene_id,tx.cdna_id), | |
| 220 "%d" % (aa_pos + 1), | |
| 221 "%s%d%s" % (ref_aa,aa_pos + 1,alt_aa), | |
| 222 "%d" % len(pep), | |
| 223 reported_stop_codon, | |
| 224 reported_peptide | |
| 225 ] | |
| 226 if options.debug: | |
| 227 report_fields.append("%d %d %d %d %s %s" % (cds_pos, offset, cs_pos,aa_pos,ref_codon,alt_codon)) | |
| 228 outputFile.write('\t'.join(report_fields)) | |
| 229 if options.debug: | |
| 230 print >> sys.stderr, "%s %s\n%s\n%s\n%s %s" % ( | |
| 231 cds[cs_pos-6:cs_pos], cds[cs_pos:cs_pos+15], | |
| 232 translate(cds[cs_pos-6:cs_pos+15]), | |
| 233 translate(alt_cds[cs_pos-6:cs_pos+15]), | |
| 234 alt_cds[cs_pos-6:cs_pos], alt_cds[cs_pos:cs_pos+15]) | |
| 235 outputFile.write('\n') | |
| 236 except Exception, e: | |
| 237 print >> sys.stderr, "failed: %s" % e | |
| 238 exit(1) | |
| 239 | |
| 240 | |
| 241 if __name__ == "__main__" : __main__() | |
| 242 |
