comparison blast_xml_to_tabular.py @ 0:efe0c7b8fb78 draft

planemo upload for repository https://github.com/dfornika/galaxy/tree/master/tools/blast_xml_to_tabular commit 006cbba6513492f5a06b573c676400a2d464520b-dirty
author dfornika
date Mon, 09 Sep 2019 17:17:09 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:efe0c7b8fb78
1 #!/usr/bin/env python
2
3 """Convert a BLAST XML file to 12 column tabular output
4
5 Takes three command line options, input BLAST XML filename, output tabular
6 BLAST filename, output format (std for standard 12 columns, or ext for the
7 extended 24 columns offered in the BLAST+ wrappers).
8
9 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
10 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
11 mean:
12
13 ====== ========= ============================================
14 Column NCBI name Description
15 ------ --------- --------------------------------------------
16 1 qseqid Query Seq-id (ID of your sequence)
17 2 sseqid Subject Seq-id (ID of the database hit)
18 3 pident Percentage of identical matches
19 4 length Alignment length
20 5 mismatch Number of mismatches
21 6 gapopen Number of gap openings
22 7 qstart Start of alignment in query
23 8 qend End of alignment in query
24 9 sstart Start of alignment in subject (database hit)
25 10 send End of alignment in subject (database hit)
26 11 evalue Expectation value (E-value)
27 12 bitscore Bit score
28 ====== ========= ============================================
29
30 The additional columns offered in the Galaxy BLAST+ wrappers are:
31
32 ====== ============= ===========================================
33 Column NCBI name Description
34 ------ ------------- -------------------------------------------
35 13 sallseqid All subject Seq-id(s), separated by a ';'
36 14 score Raw score
37 15 nident Number of identical matches
38 16 positive Number of positive-scoring matches
39 17 gaps Total number of gaps
40 18 ppos Percentage of positive-scoring matches
41 19 qframe Query frame
42 20 sframe Subject frame
43 21 qseq Aligned part of query sequence
44 22 sseq Aligned part of subject sequence
45 23 qlen Query sequence length
46 24 slen Subject sequence length
47 ====== ============= ===========================================
48
49 Most of these fields are given explicitly in the XML file, others some like
50 the percentage identity and the number of gap openings must be calculated.
51
52 Be aware that the sequence in the extended tabular output or XML direct from
53 BLAST+ may or may not use XXXX masking on regions of low complexity. This
54 can throw the off the calculation of percentage identity and gap openings.
55 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
56 with these numbers changing depending on whether or not the low complexity
57 filter is used.]
58
59 This script attempts to produce identical output to what BLAST+ would have done.
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
61 space character (probably a bug).
62
63 python blast_xml_parser.py in_file out_file out_format
64 """
65
66 import sys
67 import re
68
69 import xml.etree.cElementTree as ElementTree
70
71 def stop_err( msg ):
72 sys.stderr.write("%s\n" % msg)
73 sys.exit(1)
74
75 #Parse Command Line
76 try:
77 in_file, out_file, out_fmt = sys.argv[1:]
78 except:
79 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
80
81 if out_fmt == "std":
82 extended = False
83 elif out_fmt == "x22":
84 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
85 elif out_fmt == "ext" or out_fmt == "ext+":
86 extended = True
87 else:
88 stop_err("Format argument should be std (12 column) or ext (extended 24 columns) or ext+ (extended 26 columns)")
89
90 extended_plus = False
91 if '+' in out_fmt:
92 extended_plus = True
93
94 # get an iterable
95 try:
96 context = ElementTree.iterparse(in_file, events=("start", "end"))
97 except:
98 stop_err("Invalid data format.")
99 # turn it into an iterator
100 context = iter(context)
101 # get the root element
102 try:
103 event, root = context.next()
104 except:
105 stop_err( "Invalid data format." )
106
107 re_default_query_id = re.compile("^Query_\d+$")
108 assert re_default_query_id.match("Query_101")
109 assert not re_default_query_id.match("Query_101a")
110 assert not re_default_query_id.match("MyQuery_101")
111 re_default_subject_id = re.compile("^Subject_\d+$")
112 assert re_default_subject_id.match("Subject_1")
113 assert not re_default_subject_id.match("Subject_")
114 assert not re_default_subject_id.match("Subject_12a")
115 assert not re_default_subject_id.match("TheSubject_1")
116
117 outfile = open(out_file, 'w')
118 blast_program = None
119 for event, elem in context:
120 if event == "end" and elem.tag == "BlastOutput_program":
121 blast_program = elem.text
122 # for every <Iteration> tag
123 if event == "end" and elem.tag == "Iteration":
124 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
125 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
126 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
127 # <Iteration_query-len>406</Iteration_query-len>
128 # <Iteration_hits></Iteration_hits>
129 #
130 #Or, from BLAST 2.2.24+ run online
131 # <Iteration_query-ID>Query_1</Iteration_query-ID>
132 # <Iteration_query-def>Sample</Iteration_query-def>
133 # <Iteration_query-len>516</Iteration_query-len>
134 # <Iteration_hits>...
135 qseqid = elem.findtext("Iteration_query-ID")
136 if re_default_query_id.match(qseqid):
137 #Place holder ID, take the first word of the query definition
138 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
139 qlen = int(elem.findtext("Iteration_query-len"))
140
141 # for every <Hit> within <Iteration>
142 for hit in elem.findall("Iteration_hits/Hit"):
143 #Expecting either this,
144 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
145 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
146 # <Hit_accession>P56514</Hit_accession>
147 #or,
148 # <Hit_id>Subject_1</Hit_id>
149 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
150 # <Hit_accession>Subject_1</Hit_accession>
151 #
152 #apparently depending on the parse_deflines switch
153 sseqid = hit.findtext("Hit_id").split(None,1)[0]
154 hit_def = sseqid + " " + hit.findtext("Hit_def")
155 if re_default_subject_id.match(sseqid) \
156 and sseqid == hit.findtext("Hit_accession"):
157 #Place holder ID, take the first word of the subject definition
158 hit_def = hit.findtext("Hit_def")
159 sseqid = hit_def.split(None,1)[0]
160 # for every <Hsp> within <Hit>
161 for hsp in hit.findall("Hit_hsps/Hsp"):
162 nident = hsp.findtext("Hsp_identity")
163 length = hsp.findtext("Hsp_align-len")
164 pident = "%0.2f" % (100*float(nident)/float(length))
165
166 q_seq = hsp.findtext("Hsp_qseq")
167 h_seq = hsp.findtext("Hsp_hseq")
168 m_seq = hsp.findtext("Hsp_midline")
169 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
170 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
171 len(h_seq.replace('-', ' ').split())-1)
172
173 mismatch = m_seq.count(' ') + m_seq.count('+') \
174 - q_seq.count('-') - h_seq.count('-')
175 #TODO - Remove this alternative mismatch calculation and test
176 #once satisifed there are no problems
177 expected_mismatch = len(q_seq) \
178 - sum(1 for q,h in zip(q_seq, h_seq) \
179 if q == h or q == "-" or h == "-")
180 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
181 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
182 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
183 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
184 int(mismatch), expected_mismatch))
185
186 #TODO - Remove this alternative identity calculation and test
187 #once satisifed there are no problems
188 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
189 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
190 stop_err("%s vs %s identities, expected %i <= %i <= %i" \
191 % (qseqid, sseqid, expected_identity, int(nident),
192 expected_identity + q_seq.count("X")))
193
194 evalue = hsp.findtext("Hsp_evalue")
195 if evalue == "0":
196 evalue = "0.0"
197 else:
198 evalue = "%0.0e" % float(evalue)
199
200 bitscore = float(hsp.findtext("Hsp_bit-score"))
201 if bitscore < 100:
202 #Seems to show one decimal place for lower scores
203 bitscore = "%0.1f" % bitscore
204 else:
205 #Note BLAST does not round to nearest int, it truncates
206 bitscore = "%i" % bitscore
207
208 qstart = hsp.findtext("Hsp_query-from")
209 qend = hsp.findtext("Hsp_query-to")
210 values = [qseqid,
211 sseqid,
212 pident,
213 length, #hsp.findtext("Hsp_align-len")
214 str(mismatch),
215 gapopen,
216 #hsp.findtext("Hsp_query-from"), #qstart,
217 #hsp.findtext("Hsp_query-to"), #qend,
218 qstart,
219 qend,
220 hsp.findtext("Hsp_hit-from"), #sstart,
221 hsp.findtext("Hsp_hit-to"), #send,
222 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
223 bitscore, #hsp.findtext("Hsp_bit-score") rounded
224 ]
225
226 if extended:
227 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
228 #print hit_def, "-->", sallseqid
229 positive = hsp.findtext("Hsp_positive")
230 ppos = "%0.2f" % (100*float(positive)/float(length))
231 qframe = hsp.findtext("Hsp_query-frame")
232 sframe = hsp.findtext("Hsp_hit-frame")
233 if blast_program == "blastp":
234 #Probably a bug in BLASTP that they use 0 or 1 depending on format
235 if qframe == "0": qframe = "1"
236 if sframe == "0": sframe = "1"
237 slen = int(hit.findtext("Hit_len"))
238 values.extend([sallseqid,
239 hsp.findtext("Hsp_score"), #score,
240 nident,
241 positive,
242 hsp.findtext("Hsp_gaps"), #gaps,
243 ppos,
244 qframe,
245 sframe,
246 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
247 q_seq,
248 h_seq,
249 str(qlen),
250 str(slen)
251 ])
252
253 if extended_plus:
254 pcov = "%0.2f" % (float(int(qend) - int(qstart) + 1)/qlen * 100)
255 sallseqdescr = ";".join(name.split(None,1)[1] for name in hit_def.split(">"))
256 values.extend([pcov, sallseqdescr])
257
258 #print "\t".join(values)
259 outfile.write("\t".join(values) + "\n")
260 # prevents ElementTree from growing large datastructure
261 root.clear()
262 elem.clear()
263 outfile.close()