Mercurial > repos > dfornika > blast_xml_to_tabular
comparison blast_xml_to_tabular.py @ 0:efe0c7b8fb78 draft
planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/blast_xml_to_tabular commit 006cbba6513492f5a06b573c676400a2d464520b-dirty
author | dfornika |
---|---|
date | Mon, 09 Sep 2019 17:17:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:efe0c7b8fb78 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """Convert a BLAST XML file to 12 column tabular output | |
4 | |
5 Takes three command line options, input BLAST XML filename, output tabular | |
6 BLAST filename, output format (std for standard 12 columns, or ext for the | |
7 extended 24 columns offered in the BLAST+ wrappers). | |
8 | |
9 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart | |
10 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which | |
11 mean: | |
12 | |
13 ====== ========= ============================================ | |
14 Column NCBI name Description | |
15 ------ --------- -------------------------------------------- | |
16 1 qseqid Query Seq-id (ID of your sequence) | |
17 2 sseqid Subject Seq-id (ID of the database hit) | |
18 3 pident Percentage of identical matches | |
19 4 length Alignment length | |
20 5 mismatch Number of mismatches | |
21 6 gapopen Number of gap openings | |
22 7 qstart Start of alignment in query | |
23 8 qend End of alignment in query | |
24 9 sstart Start of alignment in subject (database hit) | |
25 10 send End of alignment in subject (database hit) | |
26 11 evalue Expectation value (E-value) | |
27 12 bitscore Bit score | |
28 ====== ========= ============================================ | |
29 | |
30 The additional columns offered in the Galaxy BLAST+ wrappers are: | |
31 | |
32 ====== ============= =========================================== | |
33 Column NCBI name Description | |
34 ------ ------------- ------------------------------------------- | |
35 13 sallseqid All subject Seq-id(s), separated by a ';' | |
36 14 score Raw score | |
37 15 nident Number of identical matches | |
38 16 positive Number of positive-scoring matches | |
39 17 gaps Total number of gaps | |
40 18 ppos Percentage of positive-scoring matches | |
41 19 qframe Query frame | |
42 20 sframe Subject frame | |
43 21 qseq Aligned part of query sequence | |
44 22 sseq Aligned part of subject sequence | |
45 23 qlen Query sequence length | |
46 24 slen Subject sequence length | |
47 ====== ============= =========================================== | |
48 | |
49 Most of these fields are given explicitly in the XML file, others some like | |
50 the percentage identity and the number of gap openings must be calculated. | |
51 | |
52 Be aware that the sequence in the extended tabular output or XML direct from | |
53 BLAST+ may or may not use XXXX masking on regions of low complexity. This | |
54 can throw the off the calculation of percentage identity and gap openings. | |
55 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard, | |
56 with these numbers changing depending on whether or not the low complexity | |
57 filter is used.] | |
58 | |
59 This script attempts to produce identical output to what BLAST+ would have done. | |
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra | |
61 space character (probably a bug). | |
62 | |
63 python blast_xml_parser.py in_file out_file out_format | |
64 """ | |
65 | |
66 import sys | |
67 import re | |
68 | |
69 import xml.etree.cElementTree as ElementTree | |
70 | |
71 def stop_err( msg ): | |
72 sys.stderr.write("%s\n" % msg) | |
73 sys.exit(1) | |
74 | |
75 #Parse Command Line | |
76 try: | |
77 in_file, out_file, out_fmt = sys.argv[1:] | |
78 except: | |
79 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)") | |
80 | |
81 if out_fmt == "std": | |
82 extended = False | |
83 elif out_fmt == "x22": | |
84 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)") | |
85 elif out_fmt == "ext" or out_fmt == "ext+": | |
86 extended = True | |
87 else: | |
88 stop_err("Format argument should be std (12 column) or ext (extended 24 columns) or ext+ (extended 26 columns)") | |
89 | |
90 extended_plus = False | |
91 if '+' in out_fmt: | |
92 extended_plus = True | |
93 | |
94 # get an iterable | |
95 try: | |
96 context = ElementTree.iterparse(in_file, events=("start", "end")) | |
97 except: | |
98 stop_err("Invalid data format.") | |
99 # turn it into an iterator | |
100 context = iter(context) | |
101 # get the root element | |
102 try: | |
103 event, root = context.next() | |
104 except: | |
105 stop_err( "Invalid data format." ) | |
106 | |
107 re_default_query_id = re.compile("^Query_\d+$") | |
108 assert re_default_query_id.match("Query_101") | |
109 assert not re_default_query_id.match("Query_101a") | |
110 assert not re_default_query_id.match("MyQuery_101") | |
111 re_default_subject_id = re.compile("^Subject_\d+$") | |
112 assert re_default_subject_id.match("Subject_1") | |
113 assert not re_default_subject_id.match("Subject_") | |
114 assert not re_default_subject_id.match("Subject_12a") | |
115 assert not re_default_subject_id.match("TheSubject_1") | |
116 | |
117 outfile = open(out_file, 'w') | |
118 blast_program = None | |
119 for event, elem in context: | |
120 if event == "end" and elem.tag == "BlastOutput_program": | |
121 blast_program = elem.text | |
122 # for every <Iteration> tag | |
123 if event == "end" and elem.tag == "Iteration": | |
124 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | |
125 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | |
126 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
127 # <Iteration_query-len>406</Iteration_query-len> | |
128 # <Iteration_hits></Iteration_hits> | |
129 # | |
130 #Or, from BLAST 2.2.24+ run online | |
131 # <Iteration_query-ID>Query_1</Iteration_query-ID> | |
132 # <Iteration_query-def>Sample</Iteration_query-def> | |
133 # <Iteration_query-len>516</Iteration_query-len> | |
134 # <Iteration_hits>... | |
135 qseqid = elem.findtext("Iteration_query-ID") | |
136 if re_default_query_id.match(qseqid): | |
137 #Place holder ID, take the first word of the query definition | |
138 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | |
139 qlen = int(elem.findtext("Iteration_query-len")) | |
140 | |
141 # for every <Hit> within <Iteration> | |
142 for hit in elem.findall("Iteration_hits/Hit"): | |
143 #Expecting either this, | |
144 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | |
145 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | |
146 # <Hit_accession>P56514</Hit_accession> | |
147 #or, | |
148 # <Hit_id>Subject_1</Hit_id> | |
149 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
150 # <Hit_accession>Subject_1</Hit_accession> | |
151 # | |
152 #apparently depending on the parse_deflines switch | |
153 sseqid = hit.findtext("Hit_id").split(None,1)[0] | |
154 hit_def = sseqid + " " + hit.findtext("Hit_def") | |
155 if re_default_subject_id.match(sseqid) \ | |
156 and sseqid == hit.findtext("Hit_accession"): | |
157 #Place holder ID, take the first word of the subject definition | |
158 hit_def = hit.findtext("Hit_def") | |
159 sseqid = hit_def.split(None,1)[0] | |
160 # for every <Hsp> within <Hit> | |
161 for hsp in hit.findall("Hit_hsps/Hsp"): | |
162 nident = hsp.findtext("Hsp_identity") | |
163 length = hsp.findtext("Hsp_align-len") | |
164 pident = "%0.2f" % (100*float(nident)/float(length)) | |
165 | |
166 q_seq = hsp.findtext("Hsp_qseq") | |
167 h_seq = hsp.findtext("Hsp_hseq") | |
168 m_seq = hsp.findtext("Hsp_midline") | |
169 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length) | |
170 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \ | |
171 len(h_seq.replace('-', ' ').split())-1) | |
172 | |
173 mismatch = m_seq.count(' ') + m_seq.count('+') \ | |
174 - q_seq.count('-') - h_seq.count('-') | |
175 #TODO - Remove this alternative mismatch calculation and test | |
176 #once satisifed there are no problems | |
177 expected_mismatch = len(q_seq) \ | |
178 - sum(1 for q,h in zip(q_seq, h_seq) \ | |
179 if q == h or q == "-" or h == "-") | |
180 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X") | |
181 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx): | |
182 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \ | |
183 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"), | |
184 int(mismatch), expected_mismatch)) | |
185 | |
186 #TODO - Remove this alternative identity calculation and test | |
187 #once satisifed there are no problems | |
188 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h) | |
189 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")): | |
190 stop_err("%s vs %s identities, expected %i <= %i <= %i" \ | |
191 % (qseqid, sseqid, expected_identity, int(nident), | |
192 expected_identity + q_seq.count("X"))) | |
193 | |
194 evalue = hsp.findtext("Hsp_evalue") | |
195 if evalue == "0": | |
196 evalue = "0.0" | |
197 else: | |
198 evalue = "%0.0e" % float(evalue) | |
199 | |
200 bitscore = float(hsp.findtext("Hsp_bit-score")) | |
201 if bitscore < 100: | |
202 #Seems to show one decimal place for lower scores | |
203 bitscore = "%0.1f" % bitscore | |
204 else: | |
205 #Note BLAST does not round to nearest int, it truncates | |
206 bitscore = "%i" % bitscore | |
207 | |
208 qstart = hsp.findtext("Hsp_query-from") | |
209 qend = hsp.findtext("Hsp_query-to") | |
210 values = [qseqid, | |
211 sseqid, | |
212 pident, | |
213 length, #hsp.findtext("Hsp_align-len") | |
214 str(mismatch), | |
215 gapopen, | |
216 #hsp.findtext("Hsp_query-from"), #qstart, | |
217 #hsp.findtext("Hsp_query-to"), #qend, | |
218 qstart, | |
219 qend, | |
220 hsp.findtext("Hsp_hit-from"), #sstart, | |
221 hsp.findtext("Hsp_hit-to"), #send, | |
222 evalue, #hsp.findtext("Hsp_evalue") in scientific notation | |
223 bitscore, #hsp.findtext("Hsp_bit-score") rounded | |
224 ] | |
225 | |
226 if extended: | |
227 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">")) | |
228 #print hit_def, "-->", sallseqid | |
229 positive = hsp.findtext("Hsp_positive") | |
230 ppos = "%0.2f" % (100*float(positive)/float(length)) | |
231 qframe = hsp.findtext("Hsp_query-frame") | |
232 sframe = hsp.findtext("Hsp_hit-frame") | |
233 if blast_program == "blastp": | |
234 #Probably a bug in BLASTP that they use 0 or 1 depending on format | |
235 if qframe == "0": qframe = "1" | |
236 if sframe == "0": sframe = "1" | |
237 slen = int(hit.findtext("Hit_len")) | |
238 values.extend([sallseqid, | |
239 hsp.findtext("Hsp_score"), #score, | |
240 nident, | |
241 positive, | |
242 hsp.findtext("Hsp_gaps"), #gaps, | |
243 ppos, | |
244 qframe, | |
245 sframe, | |
246 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking | |
247 q_seq, | |
248 h_seq, | |
249 str(qlen), | |
250 str(slen) | |
251 ]) | |
252 | |
253 if extended_plus: | |
254 pcov = "%0.2f" % (float(int(qend) - int(qstart) + 1)/qlen * 100) | |
255 sallseqdescr = ";".join(name.split(None,1)[1] for name in hit_def.split(">")) | |
256 values.extend([pcov, sallseqdescr]) | |
257 | |
258 #print "\t".join(values) | |
259 outfile.write("\t".join(values) + "\n") | |
260 # prevents ElementTree from growing large datastructure | |
261 root.clear() | |
262 elem.clear() | |
263 outfile.close() |