Mercurial > repos > bgruening > chemical_datatypes
comparison datatypes/molFiles.py @ 0:af7b6c6ee439 draft
initial commit
author | bgruening |
---|---|
date | Tue, 25 Dec 2012 05:16:25 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:af7b6c6ee439 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 | |
3 from galaxy.datatypes import data | |
4 import logging | |
5 from galaxy.datatypes.sniff import * | |
6 import commands | |
7 import pybel | |
8 import openbabel | |
9 openbabel.obErrorLog.StopLogging() | |
10 | |
11 from galaxy.datatypes.metadata import MetadataElement | |
12 from galaxy.datatypes import metadata | |
13 | |
14 log = logging.getLogger(__name__) | |
15 | |
class GenericMolFile(data.Text):
    """Generic molecule text datatype that guesses the concrete format.

    ``check_filetype`` probes the file with ``grep -c`` and keeps the result
    of the last probe in ``self.no_mols`` as the ``(exit_status, output)``
    tuple returned by ``commands.getstatusoutput``; ``output`` is the match
    count as a string.
    """

    MetadataElement(name="molecules", default=0, desc="Number of molecules", readonly=True, visible=False, optional=True, no_value=0)

    file_ext = "mol2/sdf/drf"

    # (grep command prefix, extension assigned on a hit), tried in this order.
    _FILETYPE_PROBES = (
        ("grep -c \\$\\$\\$\\$ ", "sdf"),
        ("grep -c @\\<TRIPOS\\>MOLECULE ", "mol2"),
        ("grep -c \"ligand id\" ", "drf"),
        ("grep -c HEADER ", "pdb"),
    )

    @staticmethod
    def _has_matches(grep_result):
        """Return True if a ``(status, output)`` grep result reports >= 1 match."""
        status, output = grep_result
        if status != 0:
            return False
        try:
            # The output is a string; it must be converted before comparing
            # with a number (str > int is always True on Python 2).
            return int(output) > 0
        except ValueError:
            # Output was not a plain count (e.g. an error message).
            return False

    def check_filetype(self, filename):
        """Detect the molecule format of ``filename``.

        On the first probe that matches, ``self.file_ext`` is set to the
        matching extension and True is returned.  Returns False when no
        probe matches.  ``self.no_mols`` always holds the last probe result.
        """
        # NOTE(review): filename is concatenated into a shell command without
        # quoting; paths containing spaces or shell metacharacters will
        # misbehave -- confirm callers only pass server-generated paths.
        for command, extension in self._FILETYPE_PROBES:
            self.no_mols = commands.getstatusoutput(command + filename)
            if self._has_matches(self.no_mols):
                self.file_ext = extension
                return True
        return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set ``dataset.peek`` and ``dataset.blurb`` for display."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # The blurb is only written when a known format was detected.
        if self.check_filetype(dataset.file_name):
            if self.no_mols[1] == '1':
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % self.no_mols[1]
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)

    def get_mime(self):
        """Return the MIME type used for this datatype."""
        return 'text/plain'
54 | |
55 | |
class GenericMultiMolFile(GenericMolFile):
    """Base for formats whose ``sniff`` stores a molecule count in ``self.no_mols``."""

    def set_peek(self, dataset, is_multi_byte=False):
        """Set ``dataset.peek`` and a molecule-count ``dataset.blurb``."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # sniff() is called for its side effect of refreshing self.no_mols;
        # its return value is not used here.
        self.sniff(dataset.file_name)
        dataset.blurb = (
            "1 molecule"
            if self.no_mols[1] == '1'
            else "%s molecules" % self.no_mols[1]
        )
        dataset.peek = data.get_file_peek(dataset.file_name, is_multi_byte=is_multi_byte)
68 | |
class SDF(GenericMultiMolFile):
    """SDF datatype, recognised by its ``$$$$`` record terminator lines."""

    file_ext = "sdf"

    def sniff(self, filename):
        """Return True if ``grep -c`` finds at least one ``$$$$`` line.

        The raw ``(exit_status, output)`` tuple is kept in ``self.no_mols``.
        """
        self.no_mols = commands.getstatusoutput("grep -c \\$\\$\\$\\$ " + filename)
        status, output = self.no_mols
        if status != 0:
            return False
        try:
            # output is a string; convert before the numeric comparison.
            return int(output) > 0
        except ValueError:
            return False
77 | |
class MOL2(GenericMultiMolFile):
    """MOL2 datatype, recognised by ``@<TRIPOS>MOLECULE`` record headers."""

    file_ext = "mol2"

    def sniff(self, filename):
        """Return True if ``grep -c`` finds at least one ``@<TRIPOS>MOLECULE`` line.

        The raw ``(exit_status, output)`` tuple is kept in ``self.no_mols``.
        """
        # The backslashes are doubled so the string value stays the same
        # without relying on invalid escape sequences; the shell strips them.
        self.no_mols = commands.getstatusoutput("grep -c @\\<TRIPOS\\>MOLECULE " + filename)
        status, output = self.no_mols
        if status != 0:
            return False
        try:
            # output is a string; convert before the numeric comparison.
            return int(output) > 0
        except ValueError:
            return False
86 | |
class FPS(GenericMultiMolFile):
    """FPS fingerprint datatype, recognised by a ``#FPS1`` header line."""

    file_ext = "fps"

    def sniff(self, filename):
        """Return True if one of the first lines starts with ``#FPS1``.

        ``self.no_mols`` receives the ``(exit_status, output)`` result of
        counting the lines that do not start with ``#``.
        """
        self.no_mols = commands.getstatusoutput("grep -c -v '^#' " + filename)
        with open(filename) as in_handle:
            for line_counter, line in enumerate(in_handle):
                if line.strip().startswith('#FPS1'):
                    return True
                # Give up once the leading lines have been inspected.
                if line_counter > 10:
                    return False
        # Short file without a header: answer with an explicit bool
        # instead of falling off the end and returning None.
        return False
98 | |
class DRF(GenericMultiMolFile):
    """DRF datatype, recognised by lines containing ``ligand id``."""

    file_ext = "drf"

    def sniff(self, filename):
        """Return True if ``grep -c`` finds at least one ``ligand id`` line.

        The raw ``(exit_status, output)`` tuple is kept in ``self.no_mols``.
        """
        self.no_mols = commands.getstatusoutput("grep -c \"ligand id\" " + filename)
        status, output = self.no_mols
        if status != 0:
            return False
        try:
            # output is a string; convert before the numeric comparison.
            return int(output) > 0
        except ValueError:
            return False
107 | |
108 | |
class PHAR(GenericMultiMolFile):
    """Datatype registered under the ``phar`` extension; it is never auto-detected."""

    MetadataElement(
        name="base_name",
        desc="base name",
        default='Phar',
        readonly=True,
        set_in_upload=True,
    )
    file_ext = "phar"

    def sniff(self, filename):
        """Record the count of non-``#`` lines in ``self.no_mols``; always return False."""
        count_command = "grep -c -v '^#' " + filename
        self.no_mols = commands.getstatusoutput(count_command)
        return False
116 | |
class PDB(GenericMolFile):
    """PDB datatype, recognised by lines containing ``HEADER``."""

    file_ext = "pdb"

    def sniff(self, filename):
        """Return True if ``grep -c`` finds at least one ``HEADER`` line.

        The raw ``(exit_status, output)`` tuple is kept in ``self.no_mols``.
        """
        self.no_mols = commands.getstatusoutput("grep -c HEADER " + filename)
        status, output = self.no_mols
        if status != 0:
            return False
        try:
            # output is a string; convert before the numeric comparison.
            return int(output) > 0
        except ValueError:
            return False

    def set_peek(self, dataset, is_multi_byte=False):
        """Set the peek from ``countResidues.sh`` output and a structure-count blurb."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        # NOTE(review): the script path is relative, so this presumably relies
        # on the process running from the Galaxy root -- confirm.
        res = commands.getstatusoutput("lib/galaxy/datatypes/countResidues.sh " + dataset.file_name)
        dataset.peek = res[1]
        # sniff() refreshes self.no_mols with the HEADER line count.
        self.sniff(dataset.file_name)
        if self.no_mols[1] == '1':
            dataset.blurb = "1 protein structure"
        else:
            dataset.blurb = "%s protein structures" % self.no_mols[1]
138 | |
class grd(data.Text):
    """Datatype registered under the ``grd`` extension."""

    file_ext = "grd"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed blurb; the peek is only written for purged datasets."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.blurb = "score-grids for docking"
148 | |
class grdtgz(data.Text):
    """Datatype registered under the ``grd.tgz`` extension."""

    file_ext = "grd.tgz"

    def set_peek(self, dataset, is_multi_byte=False):
        """Set a fixed blurb; the peek is only written for purged datasets."""
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.blurb = "compressed score-grids for docking"
158 | |
159 | |
class InChI(GenericMultiMolFile):
    """InChI datatype: every whitespace-separated word must be an ``InChI=`` line."""

    file_ext = "inchi"

    def sniff(self, filename):
        """Return True if the file consists only of ``InChI=`` lines.

        The number of lines starting with ``InChI=`` (``grep -c``) must be
        positive and equal to the word count reported by ``wc -w``.  The raw
        ``(exit_status, output)`` grep result is kept in ``self.no_mols``.
        """
        self.no_mols = commands.getstatusoutput("grep -c '^InChI=' " + filename)
        status, output = self.no_mols
        if status != 0:
            return False
        try:
            # Compare the counts as integers, not as raw command output.
            inchi_lines = int(output)
            word_count = int(commands.getoutput("wc -w " + filename).split()[0])
        except (ValueError, IndexError):
            # One of the commands did not print a plain count.
            return False
        return inchi_lines > 0 and inchi_lines == word_count

    def set_meta(self, dataset, **kwd):
        """
        Clear ``data_lines`` and ``sequences`` metadata for oversized files.

        Nothing is written for files within the size limit; counting the
        molecules is not implemented yet (see the disabled draft below).
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        # Disabled draft, kept for reference.  For a valid InChI file the word
        # count equals the molecule count and the number of non-empty lines.
        #word_count = commands.getoutput("wc -w "+filename).split()[0]
        #dataset.metadata.data_lines = word_count
        #int(commands.getoutput("grep -cve '^\s*$' "+filename))
        #dataset.metadata.molecules = word_count
190 | |
191 | |
class SMILES(GenericMultiMolFile):
    """SMILES datatype: one whitespace-free SMILES string per non-empty line."""

    file_ext = "smi"

    def sniff(self, filename):
        """
        Return True if the file looks like a SMILES file.

        SMILES is hard to sniff reliably.  Two checks are applied: the word
        count must equal the number of non-empty lines, and the first
        non-empty line must be parsed by pybel into a molecule with at least
        one atom.  The raw ``(exit_status, output)`` grep result is kept in
        ``self.no_mols``.
        """
        # Counts the non-empty lines.  The backslash is doubled so the string
        # value is unchanged without relying on an invalid escape sequence.
        self.no_mols = commands.getstatusoutput("grep -cve '^\\s*$' " + filename)
        status, output = self.no_mols
        if status != 0:
            # grep failed or found no non-empty line; its output may not be
            # a number, so do not try to convert it.
            return False
        try:
            line_count = int(output)
            word_count = int(commands.getoutput("wc -w " + filename).split()[0])
        except (ValueError, IndexError):
            return False
        if line_count <= 0 or line_count != word_count:
            return False

        with open(filename) as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                # Only the first non-empty line is checked: if it has atoms,
                # it is a molecule.
                try:
                    return len(pybel.readstring('smi', line).atoms) > 0
                except Exception:
                    # If the conversion fails it is not a SMILES string.
                    return False
        return True

    def set_meta(self, dataset, **kwd):
        """
        Clear ``data_lines`` and ``sequences`` metadata for oversized files.

        Nothing is written for files within the size limit; counting the
        molecules is not implemented yet (see the disabled draft below).
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return

        # Disabled draft, kept for reference.  For a file accepted by sniff()
        # the word count equals the number of non-empty lines.
        #word_count = int(commands.getoutput("wc -w "+filename).split()[0])
        #dataset.metadata.data_lines = word_count
        #dataset.metadata.molecules = word_count
241 | |
242 |