Mercurial > repos > iracooke > proteomics_datatypes
comparison proteomics.py @ 0:7101f7e4b00b
Uploaded
author | iracooke |
---|---|
date | Wed, 08 May 2013 03:25:50 -0400 |
parents | |
children | b22ebbb05260 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:7101f7e4b00b |
---|---|
1 """ | |
2 Proteomics format classes | |
3 """ | |
4 import logging | |
5 import re | |
6 from galaxy.datatypes.data import * | |
7 from galaxy.datatypes.xml import * | |
8 from galaxy.datatypes.sniff import * | |
9 from galaxy.datatypes.binary import * | |
10 from galaxy.datatypes.interval import * | |
11 | |
12 log = logging.getLogger(__name__) | |
13 | |
class ProtGff(Gff):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        For a dataset that has not been purged, the peek is whatever
        ``data.get_file_peek`` returns for the dataset's file and the blurb
        is a fixed label.  For a purged dataset both fields are set to
        placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if one of the first three lines is a GFF version pragma.

        A line qualifies when, after stripping surrounding whitespace, it
        starts with ``##gff-version``.  Returns False otherwise.
        """
        # The context manager guarantees the handle is closed on every exit
        # path, including when a read raises.
        with open(filename) as handle:
            for _ in range(3):
                if handle.readline().strip().startswith("##gff-version"):
                    return True
        return False
37 | |
38 | |
class Xls(Binary):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A dataset that has not been purged gets a fixed peek label and a
        blurb built from ``data.nice_size`` of the dataset size; a purged
        dataset gets placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare ``except:`` does.
            return "Binary xls file (%s)" % (data.nice_size(dataset.get_size()))
55 | |
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    The methods below read ``self.root`` (a regex fragment for the root
    element name) and ``self.blurb``; this class does not define either
    attribute itself.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped.  The first line
        that does not is matched against ``<root`` or ``<ns:root`` for any
        ns string, where ``root`` is ``self.root``.  Returns True on a
        match, False otherwise.
        """
        with open(filename, 'r') as contents:
            while True:
                line = contents.readline()
                # readline() returns '' at end of file, which also fails the
                # startswith test, so the loop always terminates.
                if not line.startswith('<?'):
                    break
        # Raw string so that \w is a regex class rather than an invalid
        # string escape sequence.
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A dataset that has not been purged gets its peek from
        ``data.get_file_peek`` and its blurb from ``self.blurb``; a purged
        dataset gets placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
78 | |
class PepXml(ProteomicsXml):
    """Datatype for pepXML files."""
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'msms_pipeline_analysis'
    blurb = "pepXML data"
    file_ext = 'pepxml'
84 | |
85 | |
class MzML(ProteomicsXml):
    """Datatype for mzML files."""
    # The root pattern is a regex alternation: either ``mzML`` or
    # ``indexedmzML`` is accepted as the root element name.
    root = '(mzML|indexedmzML)'
    blurb = "mzML Mass Spectrometry data"
    file_ext = 'mzml'
91 | |
92 | |
class ProtXML(ProteomicsXml):
    """Datatype for protXML files."""
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'protein_summary'
    blurb = "prot XML Search Results"
    file_ext = 'protxml'
98 | |
99 | |
class MzXML(ProteomicsXml):
    """Datatype for mzXML files."""
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
    file_ext = 'mzXML'
105 | |
106 ## PSI datatypes | |
class MzIdentML(ProteomicsXml):
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
    file_ext = 'mzid'
111 | |
112 | |
class TraML(ProteomicsXml):
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'TraML'
    blurb = 'TraML transition list'
    file_ext = 'traML'
117 | |
118 | |
class MzQuantML(ProteomicsXml):
    # Root element name pattern, blurb text and Galaxy file extension.
    root = 'MzQuantML'
    blurb = 'XML quantification data'
    file_ext = 'mzq'
123 | |
124 | |
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A dataset that has not been purged gets its peek from
        ``data.get_file_peek`` and a fixed blurb; a purged dataset gets
        placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if a ``BEGIN IONS`` line appears near the top of the file.

        Lines are compared after removing trailing newline and carriage
        return characters.  The scan stops once the line at index
        ``max_lines + 1`` has been examined, and False is returned when no
        match was found, including when the file ends before that limit.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() replaces the Python 2 only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    break
        return False
149 | |
150 | |
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A dataset that has not been purged gets its peek from
        ``data.get_file_peek`` and a fixed blurb; a purged dataset gets
        placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Return True if the Mascot MIME-Version line appears near the top of the file.

        Lines are compared for exact equality with the expected header
        after removing trailing newline and carriage return characters.
        The scan stops once the line at index ``max_lines + 1`` has been
        examined, and False is returned when no match was found, including
        when the file ends before that limit.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() replaces the Python 2 only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    break
        return False
175 | |
176 | |
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Return True if the file header contains the Finnigan signature.

        Any error while opening or reading the file yields False rather
        than an exception.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes and must not go through
            # text decoding or newline translation.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
            # Searching the bytes directly cannot match on a half-byte
            # boundary, which a search over the hex string could.
            return b'F\0i\0n\0n\0i\0g\0a\0n' in header
        except Exception:
            return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text.

        A dataset that has not been purged gets a fixed peek label and a
        blurb built from ``data.nice_size`` of the dataset size; a purged
        dataset gets placeholder messages.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare ``except:`` does.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))
206 | |
207 | |
# Register RAW as a sniffable binary format, but only when Binary provides
# the registration hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('RAW', 'RAW', RAW)
210 | |
211 | |
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second line with
        ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
233 | |
class Ms2(Text):
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H`` and a tab is collected
        as the header.  The file is accepted only if, for each of the fields
        CreationDate, Extractor, ExtractorVersion and ExtractorOptions, some
        header line starts with ``H``, a tab and that field name.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        required_fields = ('CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions')

        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the handle stops at end of file.  The previous
            # readline() loop never left the empty-string case, so it spun
            # forever on an empty file or one made only of header lines.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in required_fields:
            prefix = 'H\t%s' % header_field
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
268 | |
269 # unsniffable binary format, should do something about this | |
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    # Galaxy file extension for this datatype; the class defines no sniffer.
    file_ext = 'hlf'
273 | |
274 | |
# Register the hlf extension as an unsniffable binary format, but only when
# Binary provides the registration hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')