annotate convert_VCF_info_fields.py @ 13:b7da0d412e48 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
author iuc
date Fri, 17 Sep 2021 20:20:30 +0000
parents 8ee6b81b7de9
children 8ba58c8139aa
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
1 #!/usr/bin/env python3
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
2
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
3 # Takes in VCF file annotated with medaka tools annotate and converts
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
4 #
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
5 # Usage statement:
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
6 # python convert_VCF_info_fields.py in_vcf.vcf out_vcf.vcf
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
7
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
8 # 10/21/2020 - Nathan P. Roach, natproach@gmail.com
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
9
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
10 import re
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
11 import sys
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
12 from collections import OrderedDict
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
13 from math import log10
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
14
10
8ee6b81b7de9 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 0faf0ade3f13d7c78d93869823ea9fdf25c21b13"
iuc
parents: 8
diff changeset
15 import scipy.stats
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
16
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
17
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
18 def pval_to_phredqual(pval):
7
103c8b06c95f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents: 6
diff changeset
19 try:
103c8b06c95f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents: 6
diff changeset
20 ret = round(-10 * log10(pval))
103c8b06c95f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents: 6
diff changeset
21 except ValueError:
103c8b06c95f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents: 6
diff changeset
22 ret = 2147483647 # transform pval of 0.0 to max signed 32 bit int
103c8b06c95f "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit ed5a3aadbecc0decf9a797447f3ac7700683ea9a"
iuc
parents: 6
diff changeset
23 return ret
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
24
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
25
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
26 def parseInfoField(info):
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
27 info_fields = info.split(';')
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
28 info_dict = OrderedDict()
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
29 for info_field in info_fields:
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
30 code, val = info_field.split('=')
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
31 info_dict[code] = val
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
32 return info_dict
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
33
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
34
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
35 def annotateVCF(in_vcf_filepath, out_vcf_filepath):
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
36 """Postprocess output of medaka tools annotate.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
37
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
38 Splits multiallelic sites into separate records.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
39 Replaces medaka INFO fields that might represent information of the ref
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
40 and multiple alternate alleles with simple ref, alt allele counterparts.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
41 """
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
42
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
43 in_vcf = open(in_vcf_filepath, 'r')
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
44 # medaka INFO fields that do not make sense after splitting of
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
45 # multi-allelic records
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
46 # DP will be overwritten with the value of DPSP because medaka tools
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
47 # annotate currently only calculates the latter correctly
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
48 # (https://github.com/nanoporetech/medaka/issues/192).
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
49 # DPS, which is as unreliable as DP, gets skipped and the code
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
50 # calculates the spanning reads equivalent DPSPS instead.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
51 to_skip = {'SC', 'SR', 'AR', 'DP', 'DPSP', 'DPS'}
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
52 struct_meta_pat = re.compile('##(.+)=<ID=([^,]+)(,.+)?>')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
53 header_lines = []
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
54 contig_ids = set()
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
55 contig_ids_simple = set()
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
56 # parse the metadata lines of the input VCF and drop:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
57 # - duplicate lines
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
58 # - INFO lines declaring keys we are not going to write
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
59 # - redundant contig information
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
60 while True:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
61 line = in_vcf.readline()
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
62 if line[:2] != '##':
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
63 assert line.startswith('#CHROM')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
64 break
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
65 if line in header_lines:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
66 # the annotate tool may generate lines already written by
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
67 # medaka variant again (example: medaka version line)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
68 continue
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
69 match = struct_meta_pat.match(line)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
70 if match:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
71 match_type, match_id, match_misc = match.groups()
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
72 if match_type == 'INFO':
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
73 if match_id == 'DPSP':
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
74 line = line.replace('DPSP', 'DP')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
75 elif match_id in to_skip:
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
76 continue
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
77 elif match_type == 'contig':
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
78 contig_ids.add(match_id)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
79 if not match_misc:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
80 # the annotate tools writes its own contig info,
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
81 # which is redundant with contig info generated by
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
82 # medaka variant, but lacks a length value.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
83 # We don't need the incomplete line.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
84 contig_ids_simple.add(match_id)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
85 continue
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
86 header_lines.append(line)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
87 # Lets check the above assumption about each ID-only contig line
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
88 # having a more complete counterpart.
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
89 assert not (contig_ids_simple - contig_ids)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
90 header_lines.insert(1, '##convert_VCF_info_fields=0.2\n')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
91 header_lines += [
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
92 '##INFO=<ID=DPSPS,Number=2,Type=Integer,Description="Depth of spanning reads by strand">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
93 '##INFO=<ID=AF,Number=1,Type=Float,Description="Spanning Reads Allele Frequency">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
94 '##INFO=<ID=FAF,Number=1,Type=Float,Description="Forward Spanning Reads Allele Frequency">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
95 '##INFO=<ID=RAF,Number=1,Type=Float,Description="Reverse Spanning Reads Allele Frequency">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
96 '##INFO=<ID=SB,Number=1,Type=Integer,Description="Phred-scaled strand bias of spanning reads at this position">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
97 '##INFO=<ID=DP4,Number=4,Type=Integer,Description="Counts for ref-forward bases, ref-reverse, alt-forward and alt-reverse bases in spanning reads">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
98 '##INFO=<ID=AS,Number=4,Type=Integer,Description="Total alignment score to ref and alt allele of spanning reads by strand (ref fwd, ref rev, alt fwd, alt rev) aligned with parasail match 5, mismatch -4, open 5, extend 3">\n',
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
99 line
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
100 ]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
101
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
102 with open(out_vcf_filepath, 'w') as out_vcf:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
103 out_vcf.writelines(header_lines)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
104 for line in in_vcf:
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
105 fields = line.split('\t')
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
106 info_dict = parseInfoField(fields[7])
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
107 sr_list = [int(x) for x in info_dict["SR"].split(',')]
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
108 sc_list = [int(x) for x in info_dict["SC"].split(',')]
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
109 if len(sr_list) != len(sc_list):
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
110 print(
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
111 'WARNING - SR and SC are different lengths, '
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
112 'skipping variant'
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
113 )
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
114 print(line.strip()) # Print the line for debugging purposes
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
115 continue
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
116 variant_list = fields[4].split(',')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
117 dpsp = int(info_dict['DPSP'])
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
118 ref_fwd, ref_rev = 0, 1
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
119 dpspf, dpspr = (int(x) for x in info_dict['AR'].split(','))
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
120 for i in range(0, len(sr_list), 2):
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
121 dpspf += sr_list[i]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
122 dpspr += sr_list[i + 1]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
123 for j, i in enumerate(range(2, len(sr_list), 2)):
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
124 dp4 = (
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
125 sr_list[ref_fwd],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
126 sr_list[ref_rev],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
127 sr_list[i],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
128 sr_list[i + 1]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
129 )
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
130 dp2x2 = [[dp4[0], dp4[1]], [dp4[2], dp4[3]]]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
131 _, p_val = scipy.stats.fisher_exact(dp2x2)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
132 sb = pval_to_phredqual(p_val)
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
133
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
134 as_ = (
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
135 sc_list[ref_fwd],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
136 sc_list[ref_rev],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
137 sc_list[i],
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
138 sc_list[i + 1]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
139 )
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
140
13
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
141 info = []
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
142 for code in info_dict:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
143 if code in to_skip:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
144 continue
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
145 val = info_dict[code]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
146 info.append("%s=%s" % (code, val))
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
147
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
148 info.append('DP=%d' % dpsp)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
149 info.append('DPSPS=%d,%d' % (dpspf, dpspr))
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
150
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
151 if dpsp == 0:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
152 info.append('AF=NaN')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
153 else:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
154 af = (dp4[2] + dp4[3]) / dpsp
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
155 info.append('AF=%.6f' % af)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
156 if dpspf == 0:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
157 info.append('FAF=NaN')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
158 else:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
159 faf = dp4[2] / dpspf
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
160 info.append('FAF=%.6f' % faf)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
161 if dpspr == 0:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
162 info.append('RAF=NaN')
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
163 else:
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
164 raf = dp4[3] / dpspr
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
165 info.append('RAF=%.6f' % raf)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
166 info.append('SB=%d' % sb)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
167 info.append('DP4=%d,%d,%d,%d' % dp4)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
168 info.append('AS=%d,%d,%d,%d' % as_)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
169 new_info = ';'.join(info)
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
170 fields[4] = variant_list[j]
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
171 fields[7] = new_info
b7da0d412e48 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 86211daa63a6f39524df8759364795b782324303"
iuc
parents: 10
diff changeset
172 out_vcf.write('\t'.join(fields))
6
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
173 in_vcf.close()
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
174
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
175
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
176 if __name__ == "__main__":
103a681e82a1 "planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/medaka commit 9b7d28ac59ad082874670ee989836631ba8d7fb4"
iuc
parents:
diff changeset
177 annotateVCF(sys.argv[1], sys.argv[2])