comparison datatypes/molFiles.py @ 0:af7b6c6ee439 draft

initial commit
author bgruening
date Tue, 25 Dec 2012 05:16:25 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:af7b6c6ee439
1 # -*- coding: utf-8 -*-
2
3 from galaxy.datatypes import data
4 import logging
5 from galaxy.datatypes.sniff import *
6 import commands
7 import pybel
8 import openbabel
9 openbabel.obErrorLog.StopLogging()
10
11 from galaxy.datatypes.metadata import MetadataElement
12 from galaxy.datatypes import metadata
13
14 log = logging.getLogger(__name__)
15
class GenericMolFile( data.Text ):
    """
    Generic text datatype for molecule files.

    ``check_filetype`` probes a file with ``grep -c`` for the markers of the
    SDF, MOL2, DRF and PDB formats and stores the raw
    ``(exit status, output)`` pair of the last probe in ``self.no_mols``.
    ``set_peek`` uses that count to fill in the dataset blurb.
    """

    MetadataElement( name="molecules", default=0, desc="Number of molecules", readonly=True, visible=False, optional=True, no_value=0 )

    file_ext = "mol2/sdf/drf"

    def check_filetype( self, filename ):
        """
        Detect the molecule format of ``filename`` by counting marker lines.

        The formats are probed in the order sdf, mol2, drf, pdb. For the
        first marker that occurs at least once, ``self.file_ext`` is set to
        the matching extension and True is returned. If no marker is found,
        False is returned. ``self.no_mols`` always holds the
        ``(status, output)`` tuple of the last grep that was run.

        NOTE(review): ``filename`` is appended to the shell command without
        quoting, so paths containing spaces or shell metacharacters will
        break the probe -- confirm callers only pass safe paths.
        """
        # ( extension, grep command prefix ) pairs, tried in this order.
        probes = (
            ( "sdf", "grep -c \\$\\$\\$\\$ " ),
            ( "mol2", "grep -c @\\<TRIPOS\\>MOLECULE " ),
            ( "drf", "grep -c \"ligand id\" " ),
            ( "pdb", "grep -c HEADER " ),
        )
        for ext, command in probes:
            self.no_mols = commands.getstatusoutput( command + filename )
            status, count = self.no_mols
            # The output is a string; convert it before comparing. The old
            # ``str > 0`` test was always true on Python 2.
            if status == 0 and count.strip().isdigit() and int( count ) > 0:
                self.file_ext = ext
                return True
        return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set ``dataset.blurb`` and ``dataset.peek``.

        For a purged dataset both are set to fixed placeholder texts.
        Otherwise, if ``check_filetype`` recognises the file, the blurb
        reports the counted number of molecules and the peek is taken from
        ``data.get_file_peek``. If the file type is not recognised, neither
        attribute is touched.
        """
        if not dataset.dataset.purged:
            if self.check_filetype( dataset.file_name ):
                if self.no_mols[1] == '1':
                    dataset.blurb = "1 molecule"
                else:
                    dataset.blurb = "%s molecules" % self.no_mols[1]
                dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def get_mime( self ):
        """Return the MIME type of this datatype, always ``text/plain``."""
        return 'text/plain'
54
55
class GenericMultiMolFile( GenericMolFile ):
    """
    Base class for formats that can hold several molecules per file.

    Subclasses are expected to provide ``sniff``, which must store the
    ``(status, output)`` result of its counting command in ``self.no_mols``.
    """

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill in ``dataset.blurb`` and ``dataset.peek``.

        A purged dataset gets fixed placeholder texts. Otherwise ``sniff``
        is run to refresh ``self.no_mols``, the blurb reports that count and
        the peek comes from ``data.get_file_peek``.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # sniff() is called only for its side effect of setting self.no_mols.
        self.sniff( dataset.file_name )
        molecule_count = self.no_mols[1]
        dataset.blurb = "1 molecule" if molecule_count == '1' else "%s molecules" % molecule_count
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
68
class SDF( GenericMultiMolFile ):
    """SDF molecule file; records are counted via their ``$$$$`` lines."""

    file_ext = "sdf"

    def sniff( self, filename ):
        """
        Return True if ``grep -c`` finds at least one ``$$$$`` marker.

        The raw ``(status, output)`` tuple is kept in ``self.no_mols`` so
        that ``set_peek`` can report the number of molecules.
        """
        self.no_mols = commands.getstatusoutput( "grep -c \\$\\$\\$\\$ " + filename )
        status, count = self.no_mols
        # The output is a string; convert it before comparing. The old
        # ``str > 0`` test was always true on Python 2.
        return status == 0 and count.strip().isdigit() and int( count ) > 0
77
class MOL2( GenericMultiMolFile ):
    """MOL2 molecule file; records are counted via ``@<TRIPOS>MOLECULE``."""

    file_ext = "mol2"

    def sniff( self, filename ):
        """
        Return True if ``grep -c`` finds at least one ``@<TRIPOS>MOLECULE``.

        The raw ``(status, output)`` tuple is kept in ``self.no_mols`` so
        that ``set_peek`` can report the number of molecules.
        """
        # The backslashes keep the shell from treating < and > as redirections.
        self.no_mols = commands.getstatusoutput( "grep -c @\\<TRIPOS\\>MOLECULE " + filename )
        status, count = self.no_mols
        # The output is a string; convert it before comparing. The old
        # ``str > 0`` test was always true on Python 2.
        return status == 0 and count.strip().isdigit() and int( count ) > 0
86
class FPS( GenericMultiMolFile ):
    """Fingerprint file recognised by a ``#FPS1`` line near its top."""

    file_ext = "fps"

    def sniff( self, filename ):
        """
        Return True if one of the first lines starts with ``#FPS1``.

        Lines are examined up to and including enumerate index 11. The count
        of lines not starting with ``#`` is stored in ``self.no_mols`` as a
        ``(status, output)`` tuple for later use by ``set_peek``.
        """
        self.no_mols = commands.getstatusoutput( "grep -c -v '^#' " + filename )
        with open( filename ) as in_handle:
            for line_counter, line in enumerate( in_handle ):
                if line.strip().startswith( '#FPS1' ):
                    return True
                if line_counter > 10:
                    return False
        # A short file without the header used to fall through and return None.
        return False
98
class DRF( GenericMultiMolFile ):
    """DRF file; entries are counted via lines containing ``ligand id``."""

    file_ext = "drf"

    def sniff( self, filename ):
        """
        Return True if ``grep -c`` finds at least one ``ligand id`` line.

        The raw ``(status, output)`` tuple is kept in ``self.no_mols`` so
        that ``set_peek`` can report the number of molecules.
        """
        self.no_mols = commands.getstatusoutput( "grep -c \"ligand id\" " + filename )
        status, count = self.no_mols
        # The output is a string; convert it before comparing. The old
        # ``str > 0`` test was always true on Python 2.
        return status == 0 and count.strip().isdigit() and int( count ) > 0
107
108
class PHAR( GenericMultiMolFile ):
    """Datatype with extension ``phar``; it is never auto-detected."""

    MetadataElement(
        name="base_name",
        desc="base name",
        default='Phar',
        readonly=True,
        set_in_upload=True )

    file_ext = "phar"

    def sniff( self, filename ):
        """
        Count the lines not starting with ``#`` and always return False.

        The count is stored in ``self.no_mols`` as a ``(status, output)``
        tuple; the method itself never claims a file.
        """
        count_command = "grep -c -v '^#' " + filename
        self.no_mols = commands.getstatusoutput( count_command )
        return False
116
class PDB( GenericMolFile ):
    """PDB structure file; structures are counted via ``HEADER`` lines."""

    file_ext = "pdb"

    def sniff( self, filename ):
        """
        Return True if ``grep -c`` finds at least one ``HEADER`` line.

        The raw ``(status, output)`` tuple is kept in ``self.no_mols`` so
        that ``set_peek`` can report the number of structures.
        """
        self.no_mols = commands.getstatusoutput( "grep -c HEADER " + filename )
        status, count = self.no_mols
        # The output is a string; convert it before comparing. The old
        # ``str > 0`` test was always true on Python 2.
        return status == 0 and count.strip().isdigit() and int( count ) > 0

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill in ``dataset.peek`` and ``dataset.blurb``.

        A purged dataset gets fixed placeholder texts. Otherwise the peek is
        the output of the ``countResidues.sh`` helper script and the blurb
        reports the ``HEADER`` count obtained by ``sniff``.
        """
        if not dataset.dataset.purged:
            # NOTE(review): the script path is relative, so this presumably
            # relies on the process running from the Galaxy root -- confirm.
            res = commands.getstatusoutput( "lib/galaxy/datatypes/countResidues.sh " + dataset.file_name )
            dataset.peek = res[1]
            # sniff() is called only for its side effect of setting self.no_mols.
            self.sniff( dataset.file_name )
            if self.no_mols[1] == '1':
                dataset.blurb = "1 protein structure"
            else:
                dataset.blurb = "%s protein structures" % self.no_mols[1]
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
138
class grd( data.Text ):
    """Text datatype with extension ``grd`` (score grids for docking)."""

    file_ext = "grd"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the dataset blurb; the peek is only written for purged datasets.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is left untouched for existing files.
        dataset.blurb = "score-grids for docking"
148
class grdtgz( data.Text ):
    """Text datatype with extension ``grd.tgz`` (compressed score grids)."""

    file_ext = "grd.tgz"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the dataset blurb; the peek is only written for purged datasets.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The peek is left untouched for existing files.
        dataset.blurb = "compressed score-grids for docking"
158
159
class InChI( GenericMultiMolFile ):
    """InChI file: every whitespace-separated word is an ``InChI=`` line."""

    file_ext = "inchi"

    def sniff( self, filename ):
        """
        Return True if the file consists solely of ``InChI=`` entries.

        The number of lines starting with ``InChI=`` must equal the word
        count reported by ``wc -w`` and be greater than zero. The grep
        result is kept in ``self.no_mols`` as a ``(status, output)`` tuple.
        """
        self.no_mols = commands.getstatusoutput( "grep -c '^InChI=' " + filename )
        word_count = commands.getoutput( "wc -w " + filename ).split()[0]

        status, count = self.no_mols
        if count != word_count:
            return False

        # The output is a string; convert it before comparing. The old
        # ``str > 0`` test was always true on Python 2.
        return status == 0 and count.isdigit() and int( count ) > 0

    def set_meta( self, dataset, **kwd ):
        """
        Reset optional metadata for datasets that exceed the size limit.

        When ``max_optional_metadata_filesize`` is non-negative and the
        dataset is larger than it, ``data_lines`` and ``sequences`` are set
        to None. For smaller datasets nothing is set yet.

        NOTE(review): this class only declares the ``molecules`` metadata
        element; ``data_lines`` / ``sequences`` look copied from a sequence
        datatype -- confirm they are intended.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        # TODO: populate dataset.metadata.molecules. For a valid InChI file
        # the word count equals the number of non-empty lines and therefore
        # the number of molecules.
190
191
class SMILES( GenericMultiMolFile ):
    """SMILES file: one whitespace-free SMILES string per non-empty line."""

    file_ext = "smi"

    def sniff( self, filename ):
        """
        Heuristically decide whether ``filename`` is a SMILES file.

        It is hard or impossible to sniff a SMILES file reliably. The word
        count must equal the number of non-empty lines, and the first
        non-empty line must be parseable by pybel into a molecule with at
        least one atom. The grep result is kept in ``self.no_mols`` as a
        ``(status, output)`` tuple.
        """
        # The grep output corresponds to the non-empty line count.
        self.no_mols = commands.getstatusoutput( "grep -cve '^\\s*$' " + filename )
        wc_fields = commands.getoutput( "wc -w " + filename ).split()

        status, output = self.no_mols
        try:
            line_count = int( output )
            word_count = int( wc_fields[0] )
        except ( ValueError, IndexError ):
            # grep or wc failed and printed something other than a number.
            return False

        if line_count != word_count:
            return False
        if status != 0 or line_count <= 0:
            return False

        with open( filename ) as in_handle:
            for line in in_handle:
                line = line.strip()
                if not line:
                    continue
                # Only the first non-empty line is checked: if it has atoms,
                # it is a molecule.
                try:
                    return len( pybel.readstring( 'smi', line ).atoms ) > 0
                except Exception:
                    # If the conversion fails it is not a SMILES string.
                    return False
        return True

    def set_meta( self, dataset, **kwd ):
        """
        Reset optional metadata for datasets that exceed the size limit.

        When ``max_optional_metadata_filesize`` is non-negative and the
        dataset is larger than it, ``data_lines`` and ``sequences`` are set
        to None. For smaller datasets nothing is set yet.

        NOTE(review): this class only declares the ``molecules`` metadata
        element; ``data_lines`` / ``sequences`` look copied from a sequence
        datatype -- confirm they are intended.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        # TODO: populate dataset.metadata.molecules. For a valid SMILES file
        # the word count equals the number of non-empty lines and therefore
        # the number of molecules.
241
242