Mercurial > repos > devteam > blast_datatypes
comparison blast.py @ 18:1250aab8b97a draft
planemo upload for repository https://github.com/peterjc/galaxy_blast/tree/master/datatypes/blast_datatypes/ commit 960f4708be7cdd486e4569e7b44eb856b2cad79d-dirty
| author | peterjc |
|---|---|
| date | Fri, 22 Feb 2019 09:54:46 -0500 |
| parents | 3eada762af11 |
| children |
comparison
equal
deleted
inserted
replaced
| 17:3eada762af11 | 18:1250aab8b97a |
|---|---|
| 11 from galaxy.datatypes.xml import GenericXml | 11 from galaxy.datatypes.xml import GenericXml |
| 12 | 12 |
| 13 log = logging.getLogger(__name__) | 13 log = logging.getLogger(__name__) |
| 14 | 14 |
| 15 # Note implicit string concatenation here to avoid excessively long lines: | 15 # Note implicit string concatenation here to avoid excessively long lines: |
| 16 _DOCTYPES = ['<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' | 16 _DOCTYPES = [ |
| 17 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">', | 17 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' |
| 18 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' | 18 '"http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">', |
| 19 '"NCBI_BlastOutput.dtd">'] | 19 '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" ' |
| 20 '"NCBI_BlastOutput.dtd">', | |
| 21 ] | |
| 20 | 22 |
| 21 | 23 |
| 22 class BlastXml(GenericXml): | 24 class BlastXml(GenericXml): |
| 23 """NCBI Blast XML Output data.""" | 25 """NCBI Blast XML Output data.""" |
| 24 | 26 |
| 26 | 28 |
| 27 def set_peek(self, dataset, is_multi_byte=False): | 29 def set_peek(self, dataset, is_multi_byte=False): |
| 28 """Set the peek and blurb text.""" | 30 """Set the peek and blurb text.""" |
| 29 if not dataset.dataset.purged: | 31 if not dataset.dataset.purged: |
| 30 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte) | 32 dataset.peek = get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte) |
| 31 dataset.blurb = 'NCBI Blast XML data' | 33 dataset.blurb = "NCBI Blast XML data" |
| 32 else: | 34 else: |
| 33 dataset.peek = 'file does not exist' | 35 dataset.peek = "file does not exist" |
| 34 dataset.blurb = 'file purged from disk' | 36 dataset.blurb = "file purged from disk" |
| 35 | 37 |
| 36 def sniff(self, filename): | 38 def sniff(self, filename): |
| 37 """Determine from the contents if the file is blastxml. | 39 """Determine from the contents if the file is blastxml. |
| 38 | 40 |
| 39 >>> from galaxy.datatypes.sniff import get_test_fname | 41 >>> from galaxy.datatypes.sniff import get_test_fname |
| 56 line = handle.readline() | 58 line = handle.readline() |
| 57 if line.strip() not in _DOCTYPES: | 59 if line.strip() not in _DOCTYPES: |
| 58 handle.close() | 60 handle.close() |
| 59 return False | 61 return False |
| 60 line = handle.readline() | 62 line = handle.readline() |
| 61 if line.strip() != '<BlastOutput>': | 63 if line.strip() != "<BlastOutput>": |
| 62 handle.close() | 64 handle.close() |
| 63 return False | 65 return False |
| 64 handle.close() | 66 handle.close() |
| 65 return True | 67 return True |
| 66 | 68 |
| 72 """ | 74 """ |
| 73 if len(split_files) == 1: | 75 if len(split_files) == 1: |
| 74 # For one file only, use base class method (move/copy) | 76 # For one file only, use base class method (move/copy) |
| 75 return Text.merge(split_files, output_file) | 77 return Text.merge(split_files, output_file) |
| 76 if not split_files: | 78 if not split_files: |
| 77 raise ValueError("Given no BLAST XML files, %r, to merge into %s" | 79 raise ValueError( |
| 78 % (split_files, output_file)) | 80 "Given no BLAST XML files, %r, to merge into %s" |
| 81 % (split_files, output_file) | |
| 82 ) | |
| 79 out = open(output_file, "w") | 83 out = open(output_file, "w") |
| 80 h = None | 84 h = None |
| 81 for f in split_files: | 85 for f in split_files: |
| 82 if not os.path.isfile(f): | 86 if not os.path.isfile(f): |
| 83 log.warning("BLAST XML file %s missing, retry in 1s..." % f) | 87 log.warning("BLAST XML file %s missing, retry in 1s..." % f) |
| 136 old_header = header | 140 old_header = header |
| 137 elif old_header[:300] != header[:300]: | 141 elif old_header[:300] != header[:300]: |
| 138 # Enough to check <BlastOutput_program> and <BlastOutput_version> match | 142 # Enough to check <BlastOutput_program> and <BlastOutput_version> match |
| 139 out.close() | 143 out.close() |
| 140 h.close() | 144 h.close() |
| 141 raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n" | 145 raise ValueError( |
| 142 % (split_files[0], f, old_header[:300], header[:300])) | 146 "BLAST XML headers don't match for %s and %s - have:\n" |
| 147 "%s\n...\n\nAnd:\n%s\n...\n" | |
| 148 % (split_files[0], f, old_header[:300], header[:300]) | |
| 149 ) | |
| 143 else: | 150 else: |
| 144 out.write(" <Iteration>\n") | 151 out.write(" <Iteration>\n") |
| 145 for line in h: | 152 for line in h: |
| 146 if "</BlastOutput_iterations>" in line: | 153 if "</BlastOutput_iterations>" in line: |
| 147 break | 154 break |
| 148 # TODO - Increment <Iteration_iter-num> and if required automatic query names | 155 # TODO - Increment <Iteration_iter-num> and if required automatic query |
| 149 # like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing? | 156 # names like <Iteration_query-ID>Query_3</Iteration_query-ID> to be |
| 157 # increasing? | |
| 150 out.write(line) | 158 out.write(line) |
| 151 h.close() | 159 h.close() |
| 152 out.write(" </BlastOutput_iterations>\n") | 160 out.write(" </BlastOutput_iterations>\n") |
| 153 out.write("</BlastOutput>\n") | 161 out.write("</BlastOutput>\n") |
| 154 out.close() | 162 out.close() |
| 163 | |
| 155 merge = staticmethod(merge) | 164 merge = staticmethod(merge) |
| 156 | 165 |
| 157 | 166 |
| 158 class _BlastDb(object): | 167 class _BlastDb(object): |
| 159 """Base class for BLAST database datatype.""" | 168 """Base class for BLAST database datatype.""" |
| 162 """Set the peek and blurb text.""" | 171 """Set the peek and blurb text.""" |
| 163 if not dataset.dataset.purged: | 172 if not dataset.dataset.purged: |
| 164 dataset.peek = "BLAST database (multiple files)" | 173 dataset.peek = "BLAST database (multiple files)" |
| 165 dataset.blurb = "BLAST database (multiple files)" | 174 dataset.blurb = "BLAST database (multiple files)" |
| 166 else: | 175 else: |
| 167 dataset.peek = 'file does not exist' | 176 dataset.peek = "file does not exist" |
| 168 dataset.blurb = 'file purged from disk' | 177 dataset.blurb = "file purged from disk" |
| 169 | 178 |
| 170 def display_peek(self, dataset): | 179 def display_peek(self, dataset): |
| 171 """Create HTML content, used for displaying peek.""" | 180 """Create HTML content, used for displaying peek.""" |
| 172 try: | 181 try: |
| 173 return dataset.peek | 182 return dataset.peek |
| 174 except Exception: | 183 except Exception: |
| 175 return "BLAST database (multiple files)" | 184 return "BLAST database (multiple files)" |
| 176 | 185 |
| 177 def display_data(self, trans, data, preview=False, filename=None, | 186 def display_data( |
| 178 to_ext=None, size=None, offset=None, **kwd): | 187 self, |
| 188 trans, | |
| 189 data, | |
| 190 preview=False, | |
| 191 filename=None, | |
| 192 to_ext=None, | |
| 193 size=None, | |
| 194 offset=None, | |
| 195 **kwd | |
| 196 ): | |
| 179 """Documented as an old display method, but still gets called via tests etc. | 197 """Documented as an old display method, but still gets called via tests etc. |
| 180 | 198 |
| 181 This allows us to format the data shown in the central pane via the "eye" icon. | 199 This allows us to format the data shown in the central pane via the "eye" icon. |
| 182 """ | 200 """ |
| 183 if filename is not None and filename != "index": | 201 if filename is not None and filename != "index": |
| 184 # Change nothing - important for the unit tests to access child files: | 202 # Change nothing - important for the unit tests to access child files: |
| 185 return Data.display_data(self, trans, data, preview, filename, | 203 return Data.display_data( |
| 186 to_ext, size, offset, **kwd) | 204 self, trans, data, preview, filename, to_ext, size, offset, **kwd |
| 205 ) | |
| 187 if self.file_ext == "blastdbn": | 206 if self.file_ext == "blastdbn": |
| 188 title = "This is a nucleotide BLAST database" | 207 title = "This is a nucleotide BLAST database" |
| 189 elif self.file_ext == "blastdbp": | 208 elif self.file_ext == "blastdbp": |
| 190 title = "This is a protein BLAST database" | 209 title = "This is a protein BLAST database" |
| 191 elif self.file_ext == "blastdbd": | 210 elif self.file_ext == "blastdbd": |
| 202 except Exception: | 221 except Exception: |
| 203 pass | 222 pass |
| 204 if not msg: | 223 if not msg: |
| 205 msg = title | 224 msg = title |
| 206 # Galaxy assumes HTML for the display of composite datatypes, | 225 # Galaxy assumes HTML for the display of composite datatypes, |
| 207 return "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" % (title, msg) | 226 return ( |
| 227 "<html><head><title>%s</title></head><body><pre>%s</pre></body></html>" | |
| 228 % (title, msg) | |
| 229 ) | |
| 208 | 230 |
| 209 def merge(split_files, output_file): | 231 def merge(split_files, output_file): |
| 210 """Merge BLAST databases (not implemented for now).""" | 232 """Merge BLAST databases (not implemented for now).""" |
| 211 raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)") | 233 raise NotImplementedError( |
| 234 "Merging BLAST databases is non-trivial (do this via makeblastdb?)" | |
| 235 ) | |
| 212 | 236 |
| 213 def split(cls, input_datasets, subdir_generator_function, split_params): | 237 def split(cls, input_datasets, subdir_generator_function, split_params): |
| 214 """Split a BLAST database (not implemented for now).""" | 238 """Split a BLAST database (not implemented for now).""" |
| 215 if split_params is None: | 239 if split_params is None: |
| 216 return None | 240 return None |
| 218 | 242 |
| 219 | 243 |
| 220 class BlastNucDb(_BlastDb, Data): | 244 class BlastNucDb(_BlastDb, Data): |
| 221 """Class for nucleotide BLAST database files.""" | 245 """Class for nucleotide BLAST database files.""" |
| 222 | 246 |
| 223 file_ext = 'blastdbn' | 247 file_ext = "blastdbn" |
| 224 allow_datatype_change = False | 248 allow_datatype_change = False |
| 225 composite_type = 'basic' | 249 composite_type = "basic" |
| 226 | 250 |
| 227 def __init__(self, **kwd): | 251 def __init__(self, **kwd): |
| 228 """Initialize the class.""" | 252 """Initialize the class.""" |
| 229 Data.__init__(self, **kwd) | 253 Data.__init__(self, **kwd) |
| 230 self.add_composite_file('blastdb.nhr', is_binary=True) # sequence headers | 254 self.add_composite_file("blastdb.nhr", is_binary=True) # sequence headers |
| 231 self.add_composite_file('blastdb.nin', is_binary=True) # index file | 255 self.add_composite_file("blastdb.nin", is_binary=True) # index file |
| 232 self.add_composite_file('blastdb.nsq', is_binary=True) # nucleotide sequences | 256 self.add_composite_file("blastdb.nsq", is_binary=True) # nucleotide sequences |
| 233 | 257 |
| 234 # alias ( -gi_mask option of makeblastdb) | 258 # alias ( -gi_mask option of makeblastdb) |
| 235 self.add_composite_file('blastdb.nal', is_binary=False, optional=True) | 259 self.add_composite_file("blastdb.nal", is_binary=False, optional=True) |
| 236 | 260 |
| 237 # sorted sequence hash values ( -hash_index option of makeblastdb) | 261 # sorted sequence hash values ( -hash_index option of makeblastdb) |
| 238 self.add_composite_file('blastdb.nhd', is_binary=True, optional=True) | 262 self.add_composite_file("blastdb.nhd", is_binary=True, optional=True) |
| 239 | 263 |
| 240 # index of sequence hash values ( -hash_index option of makeblastdb) | 264 # index of sequence hash values ( -hash_index option of makeblastdb) |
| 241 self.add_composite_file('blastdb.nhi', is_binary=True, optional=True) | 265 self.add_composite_file("blastdb.nhi", is_binary=True, optional=True) |
| 242 | 266 |
| 243 # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) | 267 # sorted GI values |
| 244 self.add_composite_file('blastdb.nnd', is_binary=True, optional=True) | 268 # ( -parse_seqids option of makeblastdb and gi present in the description lines) |
| 245 | 269 self.add_composite_file("blastdb.nnd", is_binary=True, optional=True) |
| 246 # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines) | 270 |
| 247 self.add_composite_file('blastdb.nni', is_binary=True, optional=True) | 271 # index of GI values |
| 272 # ( -parse_seqids option of makeblastdb and gi present in the description lines) | |
| 273 self.add_composite_file("blastdb.nni", is_binary=True, optional=True) | |
| 248 | 274 |
| 249 # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb) | 275 # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb) |
| 250 self.add_composite_file('blastdb.nog', is_binary=True, optional=True) | 276 self.add_composite_file("blastdb.nog", is_binary=True, optional=True) |
| 251 | 277 |
| 252 # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) | 278 # sorted sequence accession values |
| 253 self.add_composite_file('blastdb.nsd', is_binary=True, optional=True) | 279 # ( -hash_index or -parse_seqids option of makeblastdb) |
| 254 | 280 self.add_composite_file("blastdb.nsd", is_binary=True, optional=True) |
| 255 # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb) | 281 |
| 256 self.add_composite_file('blastdb.nsi', is_binary=True, optional=True) | 282 # index of sequence accession values |
| 283 # ( -hash_index or -parse_seqids option of makeblastdb) | |
| 284 self.add_composite_file("blastdb.nsi", is_binary=True, optional=True) | |
| 257 | 285 |
| 258 # first volume of the MegaBLAST index generated by makembindex | 286 # first volume of the MegaBLAST index generated by makembindex |
| 259 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) | 287 # self.add_composite_file('blastdb.00.idx', is_binary=True, optional=True) |
| 260 # The previous line should be repeated for each index volume, with filename | 288 # The previous line should be repeated for each index volume, with filename |
| 261 # extensions like '.01.idx', '.02.idx', etc. | 289 # extensions like '.01.idx', '.02.idx', etc. |
| 277 | 305 |
| 278 | 306 |
| 279 class BlastProtDb(_BlastDb, Data): | 307 class BlastProtDb(_BlastDb, Data): |
| 280 """Class for protein BLAST database files.""" | 308 """Class for protein BLAST database files.""" |
| 281 | 309 |
| 282 file_ext = 'blastdbp' | 310 file_ext = "blastdbp" |
| 283 allow_datatype_change = False | 311 allow_datatype_change = False |
| 284 composite_type = 'basic' | 312 composite_type = "basic" |
| 285 | 313 |
| 286 def __init__(self, **kwd): | 314 def __init__(self, **kwd): |
| 287 """Initialize the class.""" | 315 """Initialize the class.""" |
| 288 Data.__init__(self, **kwd) | 316 Data.__init__(self, **kwd) |
| 289 # Component file comments are as in BlastNucDb except where noted | 317 # Component file comments are as in BlastNucDb except where noted |
| 290 self.add_composite_file('blastdb.phr', is_binary=True) | 318 self.add_composite_file("blastdb.phr", is_binary=True) |
| 291 self.add_composite_file('blastdb.pin', is_binary=True) | 319 self.add_composite_file("blastdb.pin", is_binary=True) |
| 292 self.add_composite_file('blastdb.psq', is_binary=True) # protein sequences | 320 self.add_composite_file("blastdb.psq", is_binary=True) # protein sequences |
| 293 self.add_composite_file('blastdb.phd', is_binary=True, optional=True) | 321 self.add_composite_file("blastdb.phd", is_binary=True, optional=True) |
| 294 self.add_composite_file('blastdb.phi', is_binary=True, optional=True) | 322 self.add_composite_file("blastdb.phi", is_binary=True, optional=True) |
| 295 self.add_composite_file('blastdb.pnd', is_binary=True, optional=True) | 323 self.add_composite_file("blastdb.pnd", is_binary=True, optional=True) |
| 296 self.add_composite_file('blastdb.pni', is_binary=True, optional=True) | 324 self.add_composite_file("blastdb.pni", is_binary=True, optional=True) |
| 297 self.add_composite_file('blastdb.pog', is_binary=True, optional=True) | 325 self.add_composite_file("blastdb.pog", is_binary=True, optional=True) |
| 298 self.add_composite_file('blastdb.psd', is_binary=True, optional=True) | 326 self.add_composite_file("blastdb.psd", is_binary=True, optional=True) |
| 299 self.add_composite_file('blastdb.psi', is_binary=True, optional=True) | 327 self.add_composite_file("blastdb.psi", is_binary=True, optional=True) |
| 300 # self.add_composite_file('blastdb.paa', is_binary=True, optional=True) | 328 # self.add_composite_file('blastdb.paa', is_binary=True, optional=True) |
| 301 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True) | 329 # self.add_composite_file('blastdb.pab', is_binary=True, optional=True) |
| 302 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True) | 330 # self.add_composite_file('blastdb.pac', is_binary=True, optional=True) |
| 303 # The last 3 lines should be repeated for each WriteDB column, with filename | 331 # The last 3 lines should be repeated for each WriteDB column, with filename |
| 304 # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc. | 332 # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc. |
| 305 | 333 |
| 306 | 334 |
| 307 class BlastDomainDb(_BlastDb, Data): | 335 class BlastDomainDb(_BlastDb, Data): |
| 308 """Class for domain BLAST database files.""" | 336 """Class for domain BLAST database files.""" |
| 309 | 337 |
| 310 file_ext = 'blastdbd' | 338 file_ext = "blastdbd" |
| 311 allow_datatype_change = False | 339 allow_datatype_change = False |
| 312 composite_type = 'basic' | 340 composite_type = "basic" |
| 313 | 341 |
| 314 def __init__(self, **kwd): | 342 def __init__(self, **kwd): |
| 315 """Initialize the class.""" | 343 """Initialize the class.""" |
| 316 Data.__init__(self, **kwd) | 344 Data.__init__(self, **kwd) |
| 317 self.add_composite_file('blastdb.phr', is_binary=True) | 345 self.add_composite_file("blastdb.phr", is_binary=True) |
| 318 self.add_composite_file('blastdb.pin', is_binary=True) | 346 self.add_composite_file("blastdb.pin", is_binary=True) |
| 319 self.add_composite_file('blastdb.psq', is_binary=True) | 347 self.add_composite_file("blastdb.psq", is_binary=True) |
| 320 self.add_composite_file('blastdb.freq', is_binary=True, optional=True) | 348 self.add_composite_file("blastdb.freq", is_binary=True, optional=True) |
| 321 self.add_composite_file('blastdb.loo', is_binary=True, optional=True) | 349 self.add_composite_file("blastdb.loo", is_binary=True, optional=True) |
| 322 self.add_composite_file('blastdb.psd', is_binary=True, optional=True) | 350 self.add_composite_file("blastdb.psd", is_binary=True, optional=True) |
| 323 self.add_composite_file('blastdb.psi', is_binary=True, optional=True) | 351 self.add_composite_file("blastdb.psi", is_binary=True, optional=True) |
| 324 self.add_composite_file('blastdb.rps', is_binary=True, optional=True) | 352 self.add_composite_file("blastdb.rps", is_binary=True, optional=True) |
| 325 self.add_composite_file('blastdb.aux', is_binary=True, optional=True) | 353 self.add_composite_file("blastdb.aux", is_binary=True, optional=True) |
