comparison blast.py @ 18:1250aab8b97a draft

planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/datatypes/blast_datatypes/ commit 960f4708be7cdd486e4569e7b44eb856b2cad79d-dirty
author peterjc
date Fri, 22 Feb 2019 09:54:46 -0500
parents 3eada762af11
children
comparison
equal deleted inserted replaced
17:3eada762af11 18:1250aab8b97a
11 from galaxy.datatypes.xml import GenericXml 11 from galaxy.datatypes.xml import GenericXml
12 12
13 log = logging.getLogger(__name__) 13 log = logging.getLogger(__name__)
14 14
15 # Note implicit string concatenation here to avoid excessively long lines: 15 # Note implicit string concatenation here to avoid excessively long lines:
16 _DOCTYPES = ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 16 _DOCTYPES = [
17 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">', 17 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
18 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' 18 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
19 '"NCBI_BlastOutput.dtd">'] 19 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" '
20 '"NCBI_BlastOutput.dtd">',
21 ]
20 22
21 23
22 class BlastXml(GenericXml): 24 class BlastXml(GenericXml):
23 """NCBI Blast XML Output data.""" 25 """NCBI Blast XML Output data."""
24 26
26 28
27 def set_peek(self, dataset, is_multi_byte=False): 29 def set_peek(self, dataset, is_multi_byte=False):
28 """Set the peek and blurb text.""" 30 """Set the peek and blurb text."""
29 if not dataset.dataset.purged: 31 if not dataset.dataset.purged:
30 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte) 32 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
31 dataset.blurb = 'NCBI Blast XML data' 33 dataset.blurb = "NCBI Blast XML data"
32 else: 34 else:
33 dataset.peek = 'file does not exist' 35 dataset.peek = "file does not exist"
34 dataset.blurb = 'file purged from disk' 36 dataset.blurb = "file purged from disk"
35 37
36 def sniff(self, filename): 38 def sniff(self, filename):
37 """Determine from the contents if the file is blastxml. 39 """Determine from the contents if the file is blastxml.
38 40
39 >>> from galaxy.datatypes.sniff import get_test_fname 41 >>> from galaxy.datatypes.sniff import get_test_fname
56 line = handle.readline() 58 line = handle.readline()
57 if line.strip() not in _DOCTYPES: 59 if line.strip() not in _DOCTYPES:
58 handle.close() 60 handle.close()
59 return False 61 return False
60 line = handle.readline() 62 line = handle.readline()
61 if line.strip() != '<BlastOutput>': 63 if line.strip() != "<BlastOutput>":
62 handle.close() 64 handle.close()
63 return False 65 return False
64 handle.close() 66 handle.close()
65 return True 67 return True
66 68
72 """ 74 """
73 if len(split_files) == 1: 75 if len(split_files) == 1:
74 # For one file only, use base class method (move/copy) 76 # For one file only, use base class method (move/copy)
75 return Text.merge(split_files, output_file) 77 return Text.merge(split_files, output_file)
76 if not split_files: 78 if not split_files:
77 raise ValueError("Given no BLAST XML files, %r, to merge into %s" 79 raise ValueError(
78 % (split_files, output_file)) 80 "Given no BLAST XML files, %r, to merge into %s"
81 % (split_files, output_file)
82 )
79 out = open(output_file, "w") 83 out = open(output_file, "w")
80 h = None 84 h = None
81 for f in split_files: 85 for f in split_files:
82 if not os.path.isfile(f): 86 if not os.path.isfile(f):
83 log.warning("BLAST XML file %s missing, retry in 1s..." % f) 87 log.warning("BLAST XML file %s missing, retry in 1s..." % f)
136 old_header = header 140 old_header = header
137 elif old_header[:300] != header[:300]: 141 elif old_header[:300] != header[:300]:
138 # Enough to check <BlastOutput_program> and <BlastOutput_version> match 142 # Enough to check <BlastOutput_program> and <BlastOutput_version> match
139 out.close() 143 out.close()
140 h.close() 144 h.close()
141 raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" 145 raise ValueError(
142 % (split_files[0], f, old_header[:300], header[:300])) 146 "BLAST XML headers don't match for %s and %s - have:\n"
147 "%s\n...\n\nAnd:\n%s\n...\n"
148 % (split_files[0], f, old_header[:300], header[:300])
149 )
143 else: 150 else:
144 out.write(" <Iteration>\n") 151 out.write(" <Iteration>\n")
145 for line in h: 152 for line in h:
146 if "</BlastOutput_iterations>" in line: 153 if "</BlastOutput_iterations>" in line:
147 break 154 break
148 # TODO - Increment <Iteration_iter-num> and if required automatic query names 155 # TODO - Increment <Iteration_iter-num> and if required automatic query
149 # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing? 156 # names like <Iteration_query-ID>Query_3</Iteration_query-ID> to be
157 # increasing?
150 out.write(line) 158 out.write(line)
151 h.close() 159 h.close()
152 out.write(" </BlastOutput_iterations>\n") 160 out.write(" </BlastOutput_iterations>\n")
153 out.write("</BlastOutput>\n") 161 out.write("</BlastOutput>\n")
154 out.close() 162 out.close()
163
155 merge = staticmethod(merge) 164 merge = staticmethod(merge)
156 165
157 166
158 class _BlastDb(object): 167 class _BlastDb(object):
159 """Base class for BLAST database datatype.""" 168 """Base class for BLAST database datatype."""
162 """Set the peek and blurb text.""" 171 """Set the peek and blurb text."""
163 if not dataset.dataset.purged: 172 if not dataset.dataset.purged:
164 dataset.peek = "BLAST database (multiple files)" 173 dataset.peek = "BLAST database (multiple files)"
165 dataset.blurb = "BLAST database (multiple files)" 174 dataset.blurb = "BLAST database (multiple files)"
166 else: 175 else:
167 dataset.peek = 'file does not exist' 176 dataset.peek = "file does not exist"
168 dataset.blurb = 'file purged from disk' 177 dataset.blurb = "file purged from disk"
169 178
170 def display_peek(self, dataset): 179 def display_peek(self, dataset):
171 """Create HTML content, used for displaying peek.""" 180 """Create HTML content, used for displaying peek."""
172 try: 181 try:
173 return dataset.peek 182 return dataset.peek
174 except Exception: 183 except Exception:
175 return "BLAST database (multiple files)" 184 return "BLAST database (multiple files)"
176 185
177 def display_data(self, trans, data, preview=False, filename=None, 186 def display_data(
178 to_ext=None, size=None, offset=None, **kwd): 187 self,
188 trans,
189 data,
190 preview=False,
191 filename=None,
192 to_ext=None,
193 size=None,
194 offset=None,
195 **kwd
196 ):
179 """Documented as an old display method, but still gets called via tests etc. 197 """Documented as an old display method, but still gets called via tests etc.
180 198
181 This allows us to format the data shown in the central pane via the "eye" icon. 199 This allows us to format the data shown in the central pane via the "eye" icon.
182 """ 200 """
183 if filename is not None and filename != "index": 201 if filename is not None and filename != "index":
184 # Change nothing - important for the unit tests to access child files: 202 # Change nothing - important for the unit tests to access child files:
185 return Data.display_data(self, trans, data, preview, filename, 203 return Data.display_data(
186 to_ext, size, offset, **kwd) 204 self, trans, data, preview, filename, to_ext, size, offset, **kwd
205 )
187 if self.file_ext == "blastdbn": 206 if self.file_ext == "blastdbn":
188 title = "This is a nucleotide BLAST database" 207 title = "This is a nucleotide BLAST database"
189 elif self.file_ext == "blastdbp": 208 elif self.file_ext == "blastdbp":
190 title = "This is a protein BLAST database" 209 title = "This is a protein BLAST database"
191 elif self.file_ext == "blastdbd": 210 elif self.file_ext == "blastdbd":
202 except Exception: 221 except Exception:
203 pass 222 pass
204 if not msg: 223 if not msg:
205 msg = title 224 msg = title
206 # Galaxy assumes HTML for the display of composite datatypes, 225 # Galaxy assumes HTML for the display of composite datatypes,
207 return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg) 226 return (
227 "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>"
228 % (title, msg)
229 )
208 230
209 def merge(split_files, output_file): 231 def merge(split_files, output_file):
210 """Merge BLAST databases (not implemented for now).""" 232 """Merge BLAST databases (not implemented for now)."""
211 raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)") 233 raise NotImplementedError(
234 "Merging BLAST databases is non-trivial (do this via makeblastdb?)"
235 )
212 236
213 def split(cls, input_datasets, subdir_generator_function, split_params): 237 def split(cls, input_datasets, subdir_generator_function, split_params):
214 """Split a BLAST database (not implemented for now).""" 238 """Split a BLAST database (not implemented for now)."""
215 if split_params is None: 239 if split_params is None:
216 return None 240 return None
218 242
219 243
220 class BlastNucDb(_BlastDb, Data): 244 class BlastNucDb(_BlastDb, Data):
221 """Class for nucleotide BLAST database files.""" 245 """Class for nucleotide BLAST database files."""
222 246
223 file_ext = 'blastdbn' 247 file_ext = "blastdbn"
224 allow_datatype_change = False 248 allow_datatype_change = False
225 composite_type = 'basic' 249 composite_type = "basic"
226 250
227 def __init__(self, **kwd): 251 def __init__(self, **kwd):
228 """Initialize the class.""" 252 """Initialize the class."""
229 Data.__init__(self, **kwd) 253 Data.__init__(self, **kwd)
230 self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers 254 self.add_composite_file("blastdb.nhr", is_binary=True) # sequence headers
231 self.add_composite_file('blastdb.nin', is_binary=True) # index file 255 self.add_composite_file("blastdb.nin", is_binary=True) # index file
232 self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences 256 self.add_composite_file("blastdb.nsq", is_binary=True) # nucleotide sequences
233 257
234 # alias ( -gi_mask option of makeblastdb) 258 # alias ( -gi_mask option of makeblastdb)
235 self.add_composite_file('blastdb.nal', is_binary=False, optional=True) 259 self.add_composite_file("blastdb.nal", is_binary=False, optional=True)
236 260
237 # sorted sequence hash values ( -hash_index option of makeblastdb) 261 # sorted sequence hash values ( -hash_index option of makeblastdb)
238 self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) 262 self.add_composite_file("blastdb.nhd", is_binary=True, optional=True)
239 263
240 # index of sequence hash values ( -hash_index option of makeblastdb) 264 # index of sequence hash values ( -hash_index option of makeblastdb)
241 self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) 265 self.add_composite_file("blastdb.nhi", is_binary=True, optional=True)
242 266
243 # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) 267 # sorted GI values
244 self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) 268 # ( -parse_seqids option of makeblastdb and gi present in the description lines)
245 269 self.add_composite_file("blastdb.nnd", is_binary=True, optional=True)
246 # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) 270
247 self.add_composite_file('blastdb.nni', is_binary=True, optional=True) 271 # index of GI values
272 # ( -parse_seqids option of makeblastdb and gi present in the description lines)
273 self.add_composite_file("blastdb.nni", is_binary=True, optional=True)
248 274
249 # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb) 275 # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
250 self.add_composite_file('blastdb.nog', is_binary=True, optional=True) 276 self.add_composite_file("blastdb.nog", is_binary=True, optional=True)
251 277
252 # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) 278 # sorted sequence accession values
253 self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) 279 # ( -hash_index or -parse_seqids option of makeblastdb)
254 280 self.add_composite_file("blastdb.nsd", is_binary=True, optional=True)
255 # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) 281
256 self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) 282 # index of sequence accession values
283 # ( -hash_index or -parse_seqids option of makeblastdb)
284 self.add_composite_file("blastdb.nsi", is_binary=True, optional=True)
257 285
258 # first volume of the MegaBLAST index generated by makembindex 286 # first volume of the MegaBLAST index generated by makembindex
259 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) 287 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True)
260 # The previous line should be repeated for each index volume, with filename 288 # The previous line should be repeated for each index volume, with filename
261 # extensions like '.01.idx', '.02.idx', etc. 289 # extensions like '.01.idx', '.02.idx', etc.
277 305
278 306
279 class BlastProtDb(_BlastDb, Data): 307 class BlastProtDb(_BlastDb, Data):
280 """Class for protein BLAST database files.""" 308 """Class for protein BLAST database files."""
281 309
282 file_ext = 'blastdbp' 310 file_ext = "blastdbp"
283 allow_datatype_change = False 311 allow_datatype_change = False
284 composite_type = 'basic' 312 composite_type = "basic"
285 313
286 def __init__(self, **kwd): 314 def __init__(self, **kwd):
287 """Initialize the class.""" 315 """Initialize the class."""
288 Data.__init__(self, **kwd) 316 Data.__init__(self, **kwd)
289 # Component file comments are as in BlastNucDb except where noted 317 # Component file comments are as in BlastNucDb except where noted
290 self.add_composite_file('blastdb.phr', is_binary=True) 318 self.add_composite_file("blastdb.phr", is_binary=True)
291 self.add_composite_file('blastdb.pin', is_binary=True) 319 self.add_composite_file("blastdb.pin", is_binary=True)
292 self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences 320 self.add_composite_file("blastdb.psq", is_binary=True) # protein sequences
293 self.add_composite_file('blastdb.phd', is_binary=True, optional=True) 321 self.add_composite_file("blastdb.phd", is_binary=True, optional=True)
294 self.add_composite_file('blastdb.phi', is_binary=True, optional=True) 322 self.add_composite_file("blastdb.phi", is_binary=True, optional=True)
295 self.add_composite_file('blastdb.pnd', is_binary=True, optional=True) 323 self.add_composite_file("blastdb.pnd", is_binary=True, optional=True)
296 self.add_composite_file('blastdb.pni', is_binary=True, optional=True) 324 self.add_composite_file("blastdb.pni", is_binary=True, optional=True)
297 self.add_composite_file('blastdb.pog', is_binary=True, optional=True) 325 self.add_composite_file("blastdb.pog", is_binary=True, optional=True)
298 self.add_composite_file('blastdb.psd', is_binary=True, optional=True) 326 self.add_composite_file("blastdb.psd", is_binary=True, optional=True)
299 self.add_composite_file('blastdb.psi', is_binary=True, optional=True) 327 self.add_composite_file("blastdb.psi", is_binary=True, optional=True)
300 # self.add_composite_file('blastdb.paa', is_binary=True, optional=True) 328 # self.add_composite_file('blastdb.paa', is_binary=True, optional=True)
301 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True) 329 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True)
302 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True) 330 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True)
303 # The last 3 lines should be repeated for each WriteDB column, with filename 331 # The last 3 lines should be repeated for each WriteDB column, with filename
304 # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc. 332 # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.
305 333
306 334
307 class BlastDomainDb(_BlastDb, Data): 335 class BlastDomainDb(_BlastDb, Data):
308 """Class for domain BLAST database files.""" 336 """Class for domain BLAST database files."""
309 337
310 file_ext = 'blastdbd' 338 file_ext = "blastdbd"
311 allow_datatype_change = False 339 allow_datatype_change = False
312 composite_type = 'basic' 340 composite_type = "basic"
313 341
314 def __init__(self, **kwd): 342 def __init__(self, **kwd):
315 """Initialize the class.""" 343 """Initialize the class."""
316 Data.__init__(self, **kwd) 344 Data.__init__(self, **kwd)
317 self.add_composite_file('blastdb.phr', is_binary=True) 345 self.add_composite_file("blastdb.phr", is_binary=True)
318 self.add_composite_file('blastdb.pin', is_binary=True) 346 self.add_composite_file("blastdb.pin", is_binary=True)
319 self.add_composite_file('blastdb.psq', is_binary=True) 347 self.add_composite_file("blastdb.psq", is_binary=True)
320 self.add_composite_file('blastdb.freq', is_binary=True, optional=True) 348 self.add_composite_file("blastdb.freq", is_binary=True, optional=True)
321 self.add_composite_file('blastdb.loo', is_binary=True, optional=True) 349 self.add_composite_file("blastdb.loo", is_binary=True, optional=True)
322 self.add_composite_file('blastdb.psd', is_binary=True, optional=True) 350 self.add_composite_file("blastdb.psd", is_binary=True, optional=True)
323 self.add_composite_file('blastdb.psi', is_binary=True, optional=True) 351 self.add_composite_file("blastdb.psi", is_binary=True, optional=True)
324 self.add_composite_file('blastdb.rps', is_binary=True, optional=True) 352 self.add_composite_file("blastdb.rps", is_binary=True, optional=True)
325 self.add_composite_file('blastdb.aux', is_binary=True, optional=True) 353 self.add_composite_file("blastdb.aux", is_binary=True, optional=True)