0
|
1 # -*- coding: utf-8 -*-
|
|
2
|
|
3 from galaxy.datatypes import data
|
|
4 import logging
|
|
5 from galaxy.datatypes.sniff import *
|
|
6 import commands
|
|
7 import pybel
|
|
8 import openbabel
|
|
9 openbabel.obErrorLog.StopLogging()
|
|
10
|
|
11 from galaxy.datatypes.metadata import MetadataElement
|
|
12 from galaxy.datatypes import metadata
|
|
13
|
|
14 log = logging.getLogger(__name__)
|
|
15
|
|
class GenericMolFile( data.Text ):
    """
    Generic molecule/structure text datatype.

    The concrete flavour (sdf, mol2, drf or pdb) is derived from the file
    content by check_filetype(), which also records how many marker lines
    were found in ``self.no_mols``.
    """

    MetadataElement( name="molecules", default=0, desc="Number of molecules", readonly=True, visible=False, optional=True, no_value=0 )

    file_ext = "mol2/sdf/drf"

    # ( extension, marker, marker must start the line ) triples, listed in the
    # order in which the formats are probed; the first format with a hit wins.
    _FORMAT_MARKERS = (
        ( "sdf", "$$$$", True ),
        ( "mol2", "@<TRIPOS>MOLECULE", False ),
        ( "drf", "ligand id", False ),
        ( "pdb", "HEADER", False ),
    )

    def check_filetype( self, filename ):
        """
        Detect the molecule format of ``filename`` from its content.

        The file is read once, in-process, and the marker lines of every
        known format are counted. Nothing is handed to a shell, so paths
        containing blanks or shell metacharacters are handled correctly.

        On success ``self.file_ext`` is set to the detected extension and
        ``self.no_mols`` to ``( 0, count )`` where ``count`` is the number of
        marker lines as a string (the value set_peek() compares with '1').
        When nothing matches ``self.no_mols`` is ``( 1, "0" )``; when the
        file cannot be read it is ``( 2, "0" )``.

        Returns True if a format was recognised, False otherwise.
        """
        markers = self._FORMAT_MARKERS
        hits = [ 0 ] * len( markers )
        try:
            with open( filename ) as handle:
                for line in handle:
                    for index, ( _, marker, at_start ) in enumerate( markers ):
                        if at_start:
                            found = line.startswith( marker )
                        else:
                            found = marker in line
                        if found:
                            hits[ index ] += 1
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        for ( ext, _, _ ), count in zip( markers, hits ):
            if count > 0:
                self.no_mols = ( 0, str( count ) )
                self.file_ext = ext
                return True

        self.no_mols = ( 1, "0" )
        return False

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill ``dataset.peek`` and ``dataset.blurb``.

        For a purged dataset both fields get fixed placeholder texts.
        Otherwise the blurb reports the molecule count when the format is
        recognised (it is left unchanged when it is not) and the peek is
        taken from the head of the file.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        if self.check_filetype( dataset.file_name ):
            found = self.no_mols[1]
            if found == '1':
                dataset.blurb = "1 molecule"
            else:
                dataset.blurb = "%s molecules" % found
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )

    def get_mime( self ):
        """Return the MIME type used for this datatype."""
        return 'text/plain'
|
|
54
|
|
55
|
|
class GenericMultiMolFile( GenericMolFile ):
    """
    Base class for formats that may hold several molecules per file.

    This class does not define sniff() itself; set_peek() calls the sniff()
    of the concrete subclass and then reads the count that call stored in
    ``self.no_mols``.
    """

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill ``dataset.peek`` and ``dataset.blurb``.

        A purged dataset gets fixed placeholder texts. Otherwise the file is
        sniffed to refresh ``self.no_mols``, the blurb reports the molecule
        count and the peek is taken from the head of the file.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # Only the side effect matters here: sniff() updates self.no_mols.
        self.sniff( dataset.file_name )
        found = self.no_mols[1]
        dataset.blurb = "1 molecule" if found == '1' else "%s molecules" % found
        dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
|
|
68
|
|
class SDF( GenericMultiMolFile ):
    """SD file datatype; records are terminated by a ``$$$$`` line."""

    file_ext = "sdf"

    def sniff( self, filename ):
        """
        Count the ``$$$$`` record terminators in ``filename``.

        The file is read in-process instead of being handed to ``grep``
        through a shell, so paths with blanks or shell metacharacters work
        and the count is compared as a number.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one terminator was found, 1 when none was found and 2
        when the file could not be read; count is the number of terminators
        as a string, which is what set_peek() compares with '1'.

        Returns True if at least one record terminator is present.
        """
        try:
            with open( filename ) as handle:
                records = sum( 1 for line in handle if line.startswith( "$$$$" ) )
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if records else 1, str( records ) )
        return records > 0
|
|
77
|
|
class MOL2( GenericMultiMolFile ):
    """MOL2 datatype; molecules are counted by ``@<TRIPOS>MOLECULE`` lines."""

    file_ext = "mol2"

    def sniff( self, filename ):
        """
        Count the lines of ``filename`` that contain ``@<TRIPOS>MOLECULE``.

        The file is read in-process instead of being handed to ``grep``
        through a shell, so paths with blanks or shell metacharacters work
        and the count is compared as a number.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one such line was found, 1 when none was found and 2
        when the file could not be read; count is the number of lines as a
        string, which is what set_peek() compares with '1'.

        Returns True if at least one molecule header is present.
        """
        try:
            with open( filename ) as handle:
                molecules = sum( 1 for line in handle if "@<TRIPOS>MOLECULE" in line )
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if molecules else 1, str( molecules ) )
        return molecules > 0
|
|
86
|
|
class FPS( GenericMultiMolFile ):
    """FPS fingerprint datatype, recognised by a ``#FPS1`` header line."""

    file_ext = "fps"

    def sniff( self, filename ):
        """
        Look for the ``#FPS1`` header and count the non-comment lines.

        The header is searched for in the first twelve lines only (leading
        and trailing whitespace is ignored). Every line that does not start
        with ``#`` is counted as one entry.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one entry was counted, 1 when none was and 2 when the
        file could not be read; count is the number of entries as a string,
        which is what set_peek() compares with '1'.

        Returns True if the header was found, otherwise False (an explicit
        False is also returned for files shorter than twelve lines).
        """
        header_seen = False
        entries = 0
        try:
            with open( filename ) as in_handle:
                for line_counter, line in enumerate( in_handle ):
                    if line_counter <= 11 and line.strip().startswith( '#FPS1' ):
                        header_seen = True
                    if not line.startswith( '#' ):
                        entries += 1
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if entries else 1, str( entries ) )
        return header_seen
|
|
98
|
|
class DRF( GenericMultiMolFile ):
    """DRF datatype; entries are counted by lines containing ``ligand id``."""

    file_ext = "drf"

    def sniff( self, filename ):
        """
        Count the lines of ``filename`` that contain ``ligand id``.

        The file is read in-process instead of being handed to ``grep``
        through a shell, so paths with blanks or shell metacharacters work
        and the count is compared as a number.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one such line was found, 1 when none was found and 2
        when the file could not be read; count is the number of lines as a
        string, which is what set_peek() compares with '1'.

        Returns True if at least one ``ligand id`` line is present.
        """
        try:
            with open( filename ) as handle:
                ligands = sum( 1 for line in handle if "ligand id" in line )
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if ligands else 1, str( ligands ) )
        return ligands > 0
|
|
107
|
|
108
|
|
class PHAR( GenericMultiMolFile ):
    """Datatype with extension ``phar``; entries are the non-comment lines."""

    MetadataElement( name="base_name", desc="base name", default='Phar',
                     readonly=True, set_in_upload=True )
    file_ext = "phar"

    def sniff( self, filename ):
        """
        Count the lines of ``filename`` that do not start with ``#``.

        The file is read in-process instead of being handed to ``grep``
        through a shell, so paths with blanks or shell metacharacters work.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one line was counted, 1 when none was and 2 when the
        file could not be read; count is the number of lines as a string,
        which is what set_peek() compares with '1'.

        Always returns False: no content test is implemented, so this method
        never reports a file as being of this type.
        """
        try:
            with open( filename ) as handle:
                entries = sum( 1 for line in handle if not line.startswith( '#' ) )
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if entries else 1, str( entries ) )
        return False
|
|
116
|
|
class PDB( GenericMolFile ):
    """PDB datatype; structures are counted by lines containing ``HEADER``."""

    file_ext = "pdb"

    def sniff( self, filename ):
        """
        Count the lines of ``filename`` that contain ``HEADER``.

        The file is read in-process instead of being handed to ``grep``
        through a shell, so paths with blanks or shell metacharacters work
        and the count is compared as a number.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one such line was found, 1 when none was found and 2
        when the file could not be read; count is the number of lines as a
        string, which is what set_peek() compares with '1'.

        Returns True if at least one ``HEADER`` line is present.
        """
        try:
            with open( filename ) as handle:
                headers = sum( 1 for line in handle if "HEADER" in line )
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if headers else 1, str( headers ) )
        return headers > 0

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill ``dataset.peek`` and ``dataset.blurb``.

        A purged dataset gets fixed placeholder texts. Otherwise the peek is
        whatever the countResidues.sh helper prints for the file and the
        blurb reports the number of structures counted by sniff().
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return

        # Quote the path so that blanks or shell metacharacters in it cannot
        # alter the command line handed to the shell.
        try:
            from shlex import quote
        except ImportError:
            # Python 2: shlex has no quote(); pipes provides the same helper.
            from pipes import quote

        # NOTE(review): the script path is relative, so this depends on the
        # working directory of the process - confirm that is intended.
        res = commands.getstatusoutput( "lib/galaxy/datatypes/countResidues.sh " + quote( dataset.file_name ) )
        dataset.peek = res[1]

        # Only the side effect matters here: sniff() updates self.no_mols.
        self.sniff( dataset.file_name )
        found = self.no_mols[1]
        if found == '1':
            dataset.blurb = "1 protein structure"
        else:
            dataset.blurb = "%s protein structures" % found
|
|
138
|
|
class grd ( data.Text ) :
    """Text datatype with extension ``grd`` (score-grids for docking)."""

    file_ext = "grd"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the blurb of ``dataset``.

        A purged dataset gets placeholder texts for both peek and blurb;
        otherwise only the blurb is written and the peek is left untouched.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.blurb = "score-grids for docking"
|
|
148
|
|
class grdtgz ( data.Text ) :
    """Text datatype with extension ``grd.tgz`` (compressed score-grids)."""

    file_ext = "grd.tgz"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the blurb of ``dataset``.

        A purged dataset gets placeholder texts for both peek and blurb;
        otherwise only the blurb is written and the peek is left untouched.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.blurb = "compressed score-grids for docking"
|
|
158
|
|
159
|
|
class InChI( GenericMultiMolFile ):
    """InChI datatype: one ``InChI=`` identifier per line and nothing else."""

    file_ext = "inchi"

    def sniff( self, filename ):
        """
        Decide whether ``filename`` consists solely of InChI identifiers.

        The number of lines starting with ``InChI=`` must be greater than
        zero and equal to the number of whitespace-separated words in the
        whole file. The file is read in-process instead of being handed to
        ``grep`` and ``wc`` through a shell, so paths with blanks or shell
        metacharacters work and the counts are compared as numbers.

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one InChI line was found, 1 when none was found and 2
        when the file could not be read; count is the number of InChI lines
        as a string, which is what set_peek() compares with '1'.

        Returns True if the file looks like an InChI file, False otherwise.
        """
        inchi_lines = 0
        word_count = 0
        try:
            with open( filename ) as handle:
                for line in handle:
                    word_count += len( line.split() )
                    if line.startswith( 'InChI=' ):
                        inchi_lines += 1
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if inchi_lines else 1, str( inchi_lines ) )
        return inchi_lines > 0 and inchi_lines == word_count

    def set_meta( self, dataset, **kwd ):
        """
        Clear optional metadata for datasets that exceed the size limit.

        When ``max_optional_metadata_filesize`` is non-negative and the
        dataset is larger than it, ``data_lines`` and ``sequences`` are reset
        to None. Nothing is written for smaller datasets.
        """
        size_limit = self.max_optional_metadata_filesize
        if size_limit >= 0 and dataset.get_size() > size_limit:
            # NOTE(review): 'sequences' is not declared as metadata in this
            # file - presumably inherited from a sequence datatype; confirm.
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        # TODO: store the molecule count in metadata. For a valid InChI file
        # the word count, the InChI line count and the number of non-empty
        # lines are all the same number.
|
|
190
|
|
191
|
|
class SMILES( GenericMultiMolFile ):
    """SMILES datatype: exactly one SMILES string per non-empty line."""

    file_ext = "smi"

    def sniff( self, filename ):
        """
        Decide whether ``filename`` looks like a SMILES file.

        SMILES is hard to recognise from content. Two checks are made: the
        number of non-empty lines must equal the number of whitespace-
        separated words (i.e. one word per line), and the first non-empty
        line must be parsed by pybel into a molecule with at least one atom.

        The file is read in-process instead of being handed to ``grep`` and
        ``wc`` through a shell, the file handle is always closed, and only
        ordinary exceptions raised by the parser are treated as "not SMILES".

        ``self.no_mols`` is set to a ``( status, count )`` pair: status is 0
        when at least one non-empty line was found, 1 when none was found and
        2 when the file could not be read; count is the number of non-empty
        lines as a string, which is what set_peek() compares with '1'.

        Returns True if both checks pass, False otherwise.
        """
        molecules = 0
        word_count = 0
        first_entry = None
        try:
            with open( filename ) as handle:
                for line in handle:
                    words = line.split()
                    if not words:
                        continue
                    molecules += 1
                    word_count += len( words )
                    if first_entry is None:
                        first_entry = line.strip()
        except ( IOError, OSError ):
            self.no_mols = ( 2, "0" )
            return False

        self.no_mols = ( 0 if molecules else 1, str( molecules ) )
        if molecules == 0 or molecules != word_count:
            return False

        try:
            # If the parsed molecule has atoms, the line is a SMILES string.
            return len( pybel.readstring( 'smi', first_entry ).atoms ) > 0
        except Exception:
            # A failed conversion means the line is not a SMILES string.
            return False

    def set_meta( self, dataset, **kwd ):
        """
        Clear optional metadata for datasets that exceed the size limit.

        When ``max_optional_metadata_filesize`` is non-negative and the
        dataset is larger than it, ``data_lines`` and ``sequences`` are reset
        to None. Nothing is written for smaller datasets.
        """
        size_limit = self.max_optional_metadata_filesize
        if size_limit >= 0 and dataset.get_size() > size_limit:
            # NOTE(review): 'sequences' is not declared as metadata in this
            # file - presumably inherited from a sequence datatype; confirm.
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        # TODO: store the molecule count in metadata. For a valid SMILES file
        # the word count equals the number of non-empty lines.
|
|
241
|
|
242
|