Mercurial > repos > devteam > blast_datatypes
annotate blast.py @ 5:b3a3ba0c1d47
Uploaded v0.0.15 which updates the BLAST database definitions.
Fixes a MetadataElement bug and includes more of the optional
BLAST database files (contribution from Nicola Soranzo).
author | peterjc |
---|---|
date | Wed, 20 Mar 2013 10:39:27 -0400 |
parents | f9a7783ed7b6 |
children | a44a7a5456e1 |
rev | line source |
---|---|
3 | 1 """ |
2 BlastXml class | |
3 """ | |
4 | |
5 from galaxy.datatypes.data import get_file_peek | |
4 | 6 from galaxy.datatypes.data import Text, Data |
3 | 7 from galaxy.datatypes.xml import GenericXml |
4 | 8 from galaxy.datatypes.metadata import MetadataElement |
3 | 9 |
5
b3a3ba0c1d47
Uploaded v0.0.15 which updates the BLAST database definitions.
peterjc
parents:
4
diff
changeset
|
10 |
class BlastXml( GenericXml ):
    """NCBI Blast XML Output data"""
    file_ext = "blastxml"

    # DOCTYPE declarations accepted on the second line of a BLAST XML file.
    # Shared by sniff() and merge() so the two checks cannot drift apart.
    _DOCTYPE_LINES = (
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
    )

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'NCBI Blast XML data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """
        Determines whether the file is blastxml

        Only the first three lines are inspected: the XML declaration, one of
        the known BlastOutput DOCTYPE lines, and the <BlastOutput> root tag.

        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'tblastn_four_human_vs_rhodopsin.xml' )
        >>> BlastXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> BlastXml().sniff( fname )
        False
        """
        handle = open(filename)
        # try/finally guarantees the handle is closed even if reading fails
        try:
            if handle.readline().strip() != '<?xml version="1.0"?>':
                return False
            if handle.readline().strip() not in self._DOCTYPE_LINES:
                return False
            return handle.readline().strip() == '<BlastOutput>'
        finally:
            handle.close()

    def merge(split_files, output_file):
        """Merge several BLAST XML files into one BLAST XML file.

        The header (everything up to and including the first <Iteration>
        line) is taken from the first file; the iterations of every file are
        then concatenated and a single closing footer is written.

        split_files -- list of BLAST XML filenames to merge
        output_file -- filename of the merged BLAST XML file to write

        A single input file is delegated to Text.merge (move/copy).

        Raises ValueError if no files are given, if a file is empty, is not
        BLAST XML, ends before its first <Iteration>, has an over-long
        header, or if its header (first 300 characters) does not match that
        of the first file. On several of these failures the offending header
        is written to output_file first, to aid diagnosis.
        """
        if len(split_files) == 1:
            #For one file only, use base class method (move/copy)
            return Text.merge(split_files, output_file)
        if not split_files:
            raise ValueError("Given no BLAST XML files, %r, to merge into %s"
                             % (split_files, output_file))
        out = open(output_file, "w")
        try:
            old_header = None
            # Use the position, not the filename, to detect the first file so
            # a path listed twice does not emit a second header mid-document.
            for index, f in enumerate(split_files):
                h = open(f)
                try:
                    header = h.readline()
                    if not header:
                        raise ValueError("BLAST XML file %s was empty" % f)
                    if header.strip() != '<?xml version="1.0"?>':
                        out.write(header) #for diagnosis
                        raise ValueError("%s is not an XML file!" % f)
                    line = h.readline()
                    header += line
                    if line.strip() not in BlastXml._DOCTYPE_LINES:
                        out.write(header) #for diagnosis
                        raise ValueError("%s is not a BLAST XML file!" % f)
                    # Accumulate the header up to the first <Iteration> line
                    while True:
                        line = h.readline()
                        if not line:
                            out.write(header) #for diagnosis
                            raise ValueError("BLAST XML file %s ended prematurely" % f)
                        header += line
                        if "<Iteration>" in line:
                            break
                        if len(header) > 10000:
                            #Something has gone wrong, don't load too much into memory!
                            #Write what we have to the merged file for diagnostics
                            out.write(header)
                            raise ValueError("BLAST XML file %s has too long a header!" % f)
                    if "<BlastOutput>" not in header:
                        raise ValueError("%s is not a BLAST XML file:\n%s\n..." % (f, header))
                    if index == 0:
                        out.write(header)
                        old_header = header
                    elif old_header[:300] != header[:300]:
                        #Enough to check <BlastOutput_program> and <BlastOutput_version> match
                        raise ValueError("BLAST XML headers don't match for %s and %s - have:\n%s\n...\n\nAnd:\n%s\n...\n"
                                         % (split_files[0], f, old_header[:300], header[:300]))
                    else:
                        # Later files: drop their header, keep only the
                        # opening tag of their first iteration.
                        out.write(" <Iteration>\n")
                    for line in h:
                        if "</BlastOutput_iterations>" in line:
                            break
                        #TODO - Increment <Iteration_iter-num> and if required automatic query names
                        #like <Iteration_query-ID>Query_3</Iteration_query-ID> to be increasing?
                        out.write(line)
                finally:
                    h.close()
            out.write(" </BlastOutput_iterations>\n")
            out.write("</BlastOutput>\n")
        finally:
            out.close()
    merge = staticmethod(merge)
130 | |
4 | 131 |
class _BlastDb(object):
    """Base class for BLAST database datatype.

    Intended as a mix-in: it supplies the display helpers and the
    merge/split stubs shared by the concrete BLAST database datatypes.
    """

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text.

        is_multi_byte is accepted for interface compatibility but not used.
        """
        if not dataset.dataset.purged:
            dataset.peek = "BLAST database (multiple files)"
            dataset.blurb = "BLAST database (multiple files)"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Create HTML content, used for displaying peek.

        Returns dataset.peek, or a generic description if it cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            # Deliberate best-effort fallback, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare except would.
            return "BLAST database (multiple files)"

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Apparently an old display method, but still gets called.

        This allows us to format the data shown in the central pane via the "eye" icon.
        """
        return "This is a BLAST database."

    def get_mime(self):
        """Returns the mime type of the datatype (pretend it is text for peek)"""
        return 'text/plain'

    def merge(split_files, output_file):
        """Merge BLAST databases (not implemented for now).

        Always raises NotImplementedError.
        """
        raise NotImplementedError("Merging BLAST databases is non-trivial (do this via makeblastdb?)")
    # The signature has no self, so it must be a static method; otherwise a
    # call on an instance fails with TypeError before reaching the body.
    merge = staticmethod(merge)

    def split( cls, input_datasets, subdir_generator_function, split_params):
        """Split a BLAST database (not implemented for now).

        Returns None when split_params is None (no split requested),
        otherwise raises NotImplementedError.
        """
        if split_params is None:
            return None
        raise NotImplementedError("Can't split BLAST databases")
    # The first parameter is cls, so bind it as a class method; it then works
    # whether it is called on the class or on an instance.
    split = classmethod(split)
172 | |
173 | |
class BlastNucDb( _BlastDb, Data ):
    """Composite datatype holding the files of a nucleotide BLAST database."""
    file_ext = 'blastdbn'
    composite_type ='basic'

    def __init__(self, **kwd):
        """Initialise the datatype and register its component files."""
        Data.__init__(self, **kwd)

        # Binary components that are always registered as required.
        for required_name in (
            'blastdb.nhr',  # sequence headers
            'blastdb.nin',  # index file
            'blastdb.nsq',  # nucleotide sequences
        ):
            self.add_composite_file(required_name, is_binary=True)

        # alias ( -gi_mask option of makeblastdb); the only non-binary component
        self.add_composite_file('blastdb.nal', is_binary=False, optional=True)

        # Optional binary components.
        for optional_name in (
            'blastdb.nhd',  # sorted sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nhi',  # index of sequence hash values ( -hash_index option of makeblastdb)
            'blastdb.nnd',  # sorted GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nni',  # index of GI values ( -parse_seqids option of makeblastdb and gi present in the description lines)
            'blastdb.nog',  # OID->GI lookup file ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsd',  # sorted sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
            'blastdb.nsi',  # index of sequence accession values ( -hash_index or -parse_seqids option of makeblastdb)
        ):
            self.add_composite_file(optional_name, is_binary=True, optional=True)

        # Not registered: the WriteDB column files, which would come in
        # triples per column, each optional and binary:
        #   'blastdb.naa' - index of a WriteDB column for e.g. mask data
        #   'blastdb.nab' - data of a WriteDB column
        #   'blastdb.nac' - multiple byte order for a WriteDB column
        # These three would be repeated for each WriteDB column, with filename
        # extensions like ('.nba', '.nbb', '.nbc'), ('.nca', '.ncb', '.ncc'), etc.

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Return the fixed text shown in the central pane via the "eye" icon.

        Apparently an old display method, but still gets called. All
        arguments are ignored.
        """
        return "This is a BLAST nucleotide database."
204 | |
5
b3a3ba0c1d47
Uploaded v0.0.15 which updates the BLAST database definitions.
peterjc
parents:
4
diff
changeset
|
205 |
class BlastProtDb( _BlastDb, Data ):
    """Composite datatype holding the files of a protein BLAST database."""
    file_ext = 'blastdbp'
    composite_type ='basic'

    def __init__(self, **kwd):
        """Initialise the datatype and register its component files."""
        Data.__init__(self, **kwd)

        # Binary components that are always registered as required.
        for required_name in (
            'blastdb.phr',  # sequence headers
            'blastdb.pin',  # index file
            'blastdb.psq',  # protein sequences
        ):
            self.add_composite_file(required_name, is_binary=True)

        # Optional binary components; their roles mirror the nucleotide
        # database files with the same second and third extension letters.
        for optional_name in (
            'blastdb.phd',
            'blastdb.phi',
            'blastdb.pnd',
            'blastdb.pni',
            'blastdb.pog',
            'blastdb.psd',
            'blastdb.psi',
        ):
            self.add_composite_file(optional_name, is_binary=True, optional=True)

        # Not registered: the WriteDB column files ('blastdb.paa',
        # 'blastdb.pab', 'blastdb.pac'), each optional and binary.
        # These three would be repeated for each WriteDB column, with filename
        # extensions like ('.pba', '.pbb', '.pbc'), ('.pca', '.pcb', '.pcc'), etc.

    def display_data(self, trans, data, preview=False, filename=None,
                     to_ext=None, size=None, offset=None, **kwd):
        """Return the fixed text shown in the central pane via the "eye" icon.

        Apparently an old display method, but still gets called. All
        arguments are ignored.
        """
        return "This is a BLAST protein database."