comparison proteomics.py @ 0:7101f7e4b00b

Uploaded
author iracooke
date Wed, 08 May 2013 03:25:50 -0400
parents
children b22ebbb05260
comparison
equal deleted inserted replaced
-1:000000000000 0:7101f7e4b00b
1 """
2 Proteomics format classes
3 """
4 import logging
5 import re
6 from galaxy.datatypes.data import *
7 from galaxy.datatypes.xml import *
8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes.binary import *
10 from galaxy.datatypes.interval import *
11
# Module-level logger, named after this module via ``__name__``.
log = logging.getLogger(__name__)
13
class ProtGff( Gff ):
    """Tab delimited data in Gff format"""
    file_ext = "prot_gff"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text on ``dataset``.

        When the underlying dataset is not purged, the peek is produced by
        ``data.get_file_peek`` from ``dataset.file_name``; otherwise fixed
        placeholder strings are stored.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'Proteogenomics GFF'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if one of the first three lines is a ``##gff-version`` pragma.

        Each line is stripped of surrounding whitespace before the prefix
        check. The file is opened in a ``with`` block so the handle is closed
        even if reading raises.
        """
        gff_version_prefix = "##gff-version"
        with open( filename ) as handle:
            for _ in range( 3 ):
                # readline() yields '' at EOF, which simply fails the check.
                if handle.readline().strip().startswith( gff_version_prefix ):
                    return True
        return False
37
38
class Xls( Binary ):
    """Class describing a binary excel spreadsheet file"""
    file_ext = "xls"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here. A purged dataset gets placeholder strings instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Excel Spreadsheet file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return ``dataset.peek``, or a generic size description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "Binary xls file (%s)" % ( data.nice_size( dataset.get_size() ) )
55
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` (a regular-expression fragment naming the
    root element) and ``set_peek`` reads ``self.blurb``; the subclasses in
    this module define both as class attributes.
    """

    def sniff(self, filename):
        """ Determines whether the file is the correct XML type.

        Leading lines that start with ``<?`` (XML declaration, processing
        instructions) are skipped. The first other line must begin with
        ``<root`` or ``<ns:root``, where ``root`` is ``self.root``.
        """
        with open(filename, 'r') as contents:
            line = contents.readline()
            # readline() returns '' at EOF, which does not start with '<?',
            # so this loop always terminates.
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        # pattern match <root or <ns:root for any ns string
        pattern = r'^<(\w*:)?%s' % self.root
        return re.match(pattern, line) is not None

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
78
class PepXml(ProteomicsXml):
    """pepXML data, sniffed by its ``msms_pipeline_analysis`` root element."""
    root = "msms_pipeline_analysis"
    blurb = 'pepXML data'
    file_ext = "pepxml"
84
85
class MzML(ProteomicsXml):
    """mzML data; the root element may be ``mzML`` or ``indexedmzML``."""
    # ``root`` is a regex fragment, so the alternation covers both root names.
    root = "(mzML|indexedmzML)"
    blurb = 'mzML Mass Spectrometry data'
    file_ext = "mzml"
91
92
class ProtXML(ProteomicsXml):
    """protXML data, sniffed by its ``protein_summary`` root element."""
    root = "protein_summary"
    blurb = 'prot XML Search Results'
    file_ext = "protxml"
98
99
class MzXML(ProteomicsXml):
    """mzXML data, sniffed by its ``mzXML`` root element."""
    root = "mzXML"
    blurb = "mzXML Mass Spectrometry data"
    file_ext = "mzXML"
105
106 ## PSI datatypes
class MzIdentML(ProteomicsXml):
    """mzIdentML data, sniffed by its ``MzIdentML`` root element."""
    root = "MzIdentML"
    blurb = "XML identified peptides and proteins."
    file_ext = "mzid"
111
112
class TraML(ProteomicsXml):
    """TraML transition list, sniffed by its ``TraML`` root element."""
    root = "TraML"
    blurb = "TraML transition list"
    file_ext = "traML"
117
118
class MzQuantML(ProteomicsXml):
    """mzQuantML quantification data, sniffed by its ``MzQuantML`` root element."""
    root = "MzQuantML"
    blurb = "XML quantification data"
    file_ext = "mzq"
123
124
class Mgf( Text ):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is exactly ``BEGIN IONS``.

        Lines are compared after stripping trailing newline characters.
        Scanning stops with False once the zero-based line index exceeds
        ``max_lines``, and also when the file ends without a match.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() in a ``with`` block replaces the Python-2-only file()
        # builtin and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # Short file without a match: explicit False rather than implicit None.
        return False
149
150
class MascotDat( Text ):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff( self, filename ):
        """Return True if a line near the top of the file is the Mascot MIME header.

        The line must equal ``mime_version`` exactly after trailing newline
        characters are stripped. Scanning stops with False once the
        zero-based line index exceeds ``max_lines``, and also when the file
        ends without a match.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() in a ``with`` block replaces the Python-2-only file()
        # builtin and guarantees the handle is closed.
        with open( filename ) as handle:
            for i, line in enumerate( handle ):
                if line.rstrip( '\n\r' ) == mime_version:
                    return True
                if i > max_lines:
                    return False
        # Short file without a match: explicit False rather than implicit None.
        return False
175
176
class RAW( Binary ):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff( self, filename ):
        """Return True if the file header contains the UTF-16-style ``Finnigan`` marker.

        Returns False when the file cannot be opened or read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text.
            with open( filename, 'rb' ) as handle:
                header = handle.read( 20 )
        except ( IOError, OSError ):
            return False
        # Searching the bytes directly avoids hex-encoding both sides, where a
        # match could start in the middle of a byte (odd hex-digit offset).
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek( self, dataset, is_multi_byte=False ):
        """Set a fixed peek and a size-based blurb on ``dataset``.

        ``is_multi_byte`` is accepted for signature compatibility and is not
        used here. A purged dataset gets placeholder strings instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """Return ``dataset.peek``, or a generic size description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Narrowed from a bare ``except:`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return "Thermo Finnigan RAW file (%s)" % ( data.nice_size( dataset.get_size() ) )


# Register the sniffer only when Binary offers this hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('RAW', 'RAW', RAW)
210
211
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Consume one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as contents:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(contents, "Name:"):
                return False
            return Msp.next_line_starts_with(contents, "MW:")
233
class Ms2(Text):
    """ms2 data, recognised by the ``H``-prefixed header lines at the top of the file."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        Collects the leading run of lines that start with ``H<TAB>`` and
        requires a header line for each of ``CreationDate``, ``Extractor``,
        ``ExtractorVersion`` and ``ExtractorOptions``.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file ends at EOF. The previous readline() loop
            # treated the empty string returned at EOF as "keep going", so it
            # never terminated on an empty or header-only file.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            prefix = 'H\t%s' % (header_field)
            # Prefix match only, so 'Extractor' is also satisfied by an
            # 'ExtractorVersion' or 'ExtractorOptions' line (unchanged).
            if not any(header_line.startswith(prefix) for header_line in header_lines):
                return False

        return True
268
# unsniffable binary format, should do something about this
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = "hlf"


# Register the extension only when Binary offers this hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    # Same value as the 'hlf' literal, read from the class attribute.
    Binary.register_unsniffable_binary_ext(XHunterAslFormat.file_ext)