comparison proteomics.py @ 7:9cfabf0b942d draft

Uploaded
author iracooke
date Sun, 14 Dec 2014 22:42:08 -0500
parents
children 6ab4a0bf67df
comparison
equal deleted inserted replaced
6:54adfe557ef4 7:9cfabf0b942d
1 """
2 Proteomics format classes
3 """
4 import logging
5 import re
6 import binascii
7
8 from galaxy.datatypes.sniff import *
9 from galaxy.datatypes import data
10 from galaxy.datatypes.data import Text
11 from galaxy.datatypes.xml import GenericXml
12 from galaxy.datatypes.binary import Binary
13 from galaxy.datatypes.tabular import Tabular
14 from galaxy.datatypes.interval import Gff
15
16 log = logging.getLogger(__name__)
17
18
class Wiff(Binary):
    """Composite datatype for AB SCIEX wiff files.

    The dataset is composed of a ``wiff`` file and a ``wiff_scan`` file
    flagged as optional; an HTML index page is generated as the primary file.
    """
    file_ext = 'wiff'
    allow_datatype_change = False
    composite_type = 'auto_primary_file'

    def __init__(self, **kwd):
        """Register the two component files of the composite dataset."""
        Binary.__init__(self, **kwd)
        self.add_composite_file(
            'wiff',
            description='AB SCIEX files in .wiff format. This can contain all needed information or only metadata.',
            is_binary=True)
        # NOTE(review): ``optional`` is the string 'True', not the boolean.
        # This class only tests it for truthiness, so it is kept unchanged.
        self.add_composite_file(
            'wiff_scan',
            description='AB SCIEX spectra file (wiff.scan), if the corresponding .wiff file only contains metadata.',
            optional='True', is_binary=True)

    def generate_primary_file(self, dataset=None):
        """Build the HTML index page that lists the composite files.

        :param dataset: dataset forwarded to ``get_composite_files``.
        :returns: the HTML document as one newline-joined string.
        """
        rval = ['<html><head><title>Wiff Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # ``items()`` exists on Python 2 and Python 3 mappings alike, whereas
        # ``iteritems()`` is Python 2 only.
        composite_files = self.get_composite_files(dataset=dataset)
        for composite_name, composite_file in composite_files.items():
            fn = composite_name
            opt_text = ' (optional)' if composite_file.optional else ''
            description = composite_file.get('description')
            if description:
                rval.append('<li><a href="%s" type="text/plain">%s (%s)</a>%s</li>' % (fn, fn, description, opt_text))
            else:
                rval.append('<li><a href="%s" type="text/plain">%s</a>%s</li>' % (fn, fn, opt_text))
        rval.append('</ul></div></html>')
        return "\n".join(rval)


# Only register the extension when the Binary base class offers the hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('wiff')
53
54
class IdpDB(Binary):
    """Binary datatype stored under the ``idpDB`` file extension."""
    file_ext = 'idpDB'


# Only register the extension when the Binary base class offers the hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('idpDB')
60
61
class PepXmlReport(Tabular):
    """pepxml converted to tabular report"""
    file_ext = "tsv"

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        # Headers are passed to the HTML peek table; the last one previously
        # read 'Interprophet Probabaility' (misspelled).
        self.column_names = [
            'Protein',
            'Peptide',
            'Assumed Charge',
            'Neutral Pep Mass (calculated)',
            'Neutral Mass',
            'Retention Time',
            'Start Scan',
            'End Scan',
            'Search Engine',
            'PeptideProphet Probability',
            'Interprophet Probability',
        ]

    def display_peek(self, dataset):
        """Return the formatted HTML peek table for ``dataset``."""
        return Tabular.make_html_table(self, dataset, column_names=self.column_names)
73
74
class ProtXmlReport(Tabular):
    """Tabular report produced from a protxml file."""
    file_ext = "tsv"
    comment_lines = 1

    def __init__(self, **kwd):
        """Initialise the tabular datatype and set the report's column headers."""
        Tabular.__init__(self, **kwd)
        self.column_names = [
            "Entry Number",
            "Group Probability",
            "Protein",
            "Protein Link",
            "Protein Probability",
            "Percent Coverage",
            "Number of Unique Peptides",
            "Total Independent Spectra",
            "Percent Share of Spectrum ID's",
            "Description",
            "Protein Molecular Weight",
            "Protein Length",
            "Is Nondegenerate Evidence",
            "Weight",
            "Precursor Ion Charge",
            "Peptide sequence",
            "Peptide Link",
            "NSP Adjusted Probability",
            "Initial Probability",
            "Number of Total Termini",
            "Number of Sibling Peptides Bin",
            "Number of Instances",
            "Peptide Group Designator",
            "Is Evidence?",
        ]

    def display_peek(self, dataset):
        """Return the formatted HTML peek table for ``dataset``."""
        headers = self.column_names
        return Tabular.make_html_table(self, dataset, column_names=headers)
87
class ProteomicsXml(GenericXml):
    """ An enhanced XML datatype used to reuse code across several
    proteomic/mass-spec datatypes.

    ``sniff`` reads ``self.root`` and ``set_peek`` reads ``self.blurb``;
    neither is defined in this class, so subclasses must supply them.
    """

    def sniff(self, filename):
        """Determine whether the file is the correct XML type.

        Leading lines that start with ``<?`` are skipped; the first other
        line must open with ``<root`` or ``<ns:root``, where ``root`` is the
        regular-expression fragment stored in ``self.root``.

        :param filename: path of the file to inspect.
        :returns: True when the root element matches, False otherwise.
        """
        with open(filename, 'r') as contents:
            # readline() returns '' at end of file (never None), and '' does
            # not start with '<?', so this loop always terminates.
            line = contents.readline()
            while line.startswith('<?'):
                line = contents.readline()
        # Raw string so that ``\w`` reaches the regex engine untouched.
        pattern = r'^<(\w*:)?%s' % self.root  # <root or <ns:root for any ns string
        return re.match(pattern, line) is not None

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = self.blurb
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
110
111
class PepXml(ProteomicsXml):
    """pepXML data; the expected root element is ``msms_pipeline_analysis``."""
    file_ext = 'pepxml'
    root = 'msms_pipeline_analysis'
    blurb = 'pepXML data'
117
118
class MzML(ProteomicsXml):
    """mzML data; the root element may be ``mzML`` or ``indexedmzML``."""
    file_ext = 'mzml'
    # Regular-expression alternation, interpolated into the sniffing pattern.
    root = '(mzML|indexedmzML)'
    blurb = 'mzML Mass Spectrometry data'
124
125
class ProtXML(ProteomicsXml):
    """protXML data; the expected root element is ``protein_summary``."""
    file_ext = 'protxml'
    root = 'protein_summary'
    blurb = 'prot XML Search Results'
131
132
class MzXML(ProteomicsXml):
    """mzXML data; the expected root element is ``mzXML``."""
    file_ext = 'mzxml'
    root = 'mzXML'
    blurb = 'mzXML Mass Spectrometry data'
138
139 ## PSI datatypes
class MzIdentML(ProteomicsXml):
    """mzIdentML data (XML identified peptides and proteins)."""
    file_ext = 'mzid'
    root = 'MzIdentML'
    blurb = 'XML identified peptides and proteins.'
144
145
class TraML(ProteomicsXml):
    """TraML transition list; the expected root element is ``TraML``."""
    file_ext = 'traml'
    root = 'TraML'
    blurb = 'TraML transition list'
150
151
class MzQuantML(ProteomicsXml):
    """mzQuantML data (XML quantification data)."""
    file_ext = 'mzq'
    root = 'MzQuantML'
    blurb = 'XML quantification data'
156
157
class ConsensusXML(ProteomicsXml):
    """OpenMS consensusXML file (multiple LC-MS map alignment)."""
    file_ext = 'consensusxml'
    root = 'consensusXML'
    blurb = 'OpenMS multiple LC-MS map alignment file'
162
163
class FeatureXML(ProteomicsXml):
    """OpenMS feature file; the expected root element is ``featureMap``."""
    file_ext = 'featurexml'
    root = 'featureMap'
    blurb = 'OpenMS feature file'
168
169
class IdXML(ProteomicsXml):
    """OpenMS identification file; the expected root element is ``IdXML``."""
    file_ext = 'idxml'
    root = 'IdXML'
    blurb = 'OpenMS identification file'
174
class TandemXML(ProteomicsXml):
    """X!Tandem search results; the expected root element is ``bioml``."""
    file_ext = 'tandem'
    root = 'bioml'
    blurb = 'X!Tandem search results file'
179
class Mgf(Text):
    """Mascot Generic Format data"""
    file_ext = "mgf"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mgf Mascot Generic Format'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether a ``BEGIN IONS`` line occurs near the top of the file.

        A line must equal ``BEGIN IONS`` exactly once trailing CR/LF
        characters are stripped. Scanning stops after the line whose
        zero-based index exceeds ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True when the marker line is found, False otherwise.
        """
        mgf_begin_ions = "BEGIN IONS"
        max_lines = 100

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mgf_begin_ions:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without a match.
        return False
203
204
class MascotDat(Text):
    """Mascot search results """
    file_ext = "mascotdat"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
            dataset.blurb = 'mascotdat Mascot Search Results'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self, filename):
        """Report whether the Mascot MIME header occurs near the top of the file.

        A line must equal the expected ``MIME-Version`` header exactly once
        trailing CR/LF characters are stripped. Scanning stops after the
        line whose zero-based index exceeds ``max_lines``.

        :param filename: path of the file to inspect.
        :returns: True when the header line is found, False otherwise.
        """
        mime_version = "MIME-Version: 1.0 (Generated by Mascot version 1.0)"
        max_lines = 10

        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the handle on every exit path.
        with open(filename) as handle:
            for i, line in enumerate(handle):
                if line.rstrip('\n\r') == mime_version:
                    return True
                if i > max_lines:
                    return False
        # File ended before the limit without a match.
        return False
229
230
class RAW(Binary):
    """Class describing a Thermo Finnigan binary RAW file"""
    file_ext = "raw"

    def sniff(self, filename):
        """Report whether the file starts with the Finnigan signature.

        :param filename: path of the file to inspect.
        :returns: True when the signature is found in the first 20 bytes,
            False otherwise, including when the file cannot be read.
        """
        # Thermo Finnigan RAW format is proprietary and hence not well documented.
        # Files start with 2 bytes that seem to differ followed by F\0i\0n\0n\0i\0g\0a\0n
        # This combination represents 17 bytes, but to play safe we read 20 bytes from
        # the start of the file.
        try:
            # Binary mode: the header is raw bytes, not text.
            with open(filename, 'rb') as handle:
                header = handle.read(20)
        except (IOError, OSError):
            return False
        # Searching the bytes directly only matches on byte boundaries; the
        # former hex-string search could also match at a half-byte offset.
        return b'F\0i\0n\0n\0i\0g\0a\0n' in header

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed peek text and use the dataset size as the blurb."""
        if not dataset.dataset.purged:
            dataset.peek = "Thermo Finnigan RAW file"
            dataset.blurb = data.nice_size(dataset.get_size())
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek(self, dataset):
        """Return the stored peek, or a generic description if reading it fails."""
        try:
            return dataset.peek
        except Exception:
            # Best-effort fallback, kept from the original bare ``except``.
            return "Thermo Finnigan RAW file (%s)" % (data.nice_size(dataset.get_size()))


# Only register the sniffer when the Binary base class offers the hook.
if hasattr(Binary, 'register_sniffable_binary_format'):
    Binary.register_sniffable_binary_format('raw', 'raw', RAW)
264
265
class Msp(Text):
    """ Output of NIST MS Search Program chemdata.nist.gov/mass-spc/ftp/mass-spc/PepLib.pdf """
    file_ext = "msp"

    @staticmethod
    def next_line_starts_with(contents, prefix):
        """Read one line from ``contents`` and report whether it begins with ``prefix``."""
        candidate = contents.readline()
        if candidate is None:
            return False
        return candidate.startswith(prefix)

    def sniff(self, filename):
        """ Determines whether the file is a NIST MSP output file.

        The first line must start with ``Name:`` and the second with ``MW:``.

        >>> fname = get_test_fname('test.msp')
        >>> Msp().sniff(fname)
        True
        >>> fname = get_test_fname('test.mzXML')
        >>> Msp().sniff(fname)
        False
        """
        with open(filename, 'r') as handle:
            # The second line is only read when the first one matched.
            if not Msp.next_line_starts_with(handle, "Name:"):
                return False
            return Msp.next_line_starts_with(handle, "MW:")
287
class Ms2(Text):
    """ms2 data, recognised by the ``H<TAB>`` header lines at the top of the file."""
    file_ext = "ms2"

    def sniff(self, filename):
        """ Determines whether the file is a valid ms2 file.

        The leading run of lines starting with ``H<TAB>`` is collected, and
        each required header field must prefix at least one of them.

        >>> fname = get_test_fname('test.msp')
        >>> Ms2().sniff(fname)
        False
        >>> fname = get_test_fname('test.ms2')
        >>> Ms2().sniff(fname)
        True
        """
        header_lines = []
        with open(filename, 'r') as contents:
            # Iterating the file stops at end of file. The previous
            # readline() loop spun forever there, because '' (EOF) fell
            # into a ``pass`` branch instead of leaving the loop.
            for line in contents:
                if not line.startswith('H\t'):
                    break
                header_lines.append(line)

        # Prefix match, so an 'ExtractorVersion' line also satisfies 'Extractor'.
        for header_field in ['CreationDate', 'Extractor', 'ExtractorVersion', 'ExtractorOptions']:
            wanted = 'H\t%s' % (header_field)
            if not any(header_line.startswith(wanted) for header_line in header_lines):
                return False

        return True
322
# TODO: unsniffable binary format; something should be done about this.
class XHunterAslFormat(Binary):
    """ Annotated Spectra in the HLF format http://www.thegpm.org/HUNTER/format_2006_09_15.html """
    file_ext = 'hlf'


# Only register the extension when the Binary base class offers the hook.
if hasattr(Binary, 'register_unsniffable_binary_ext'):
    Binary.register_unsniffable_binary_ext('hlf')