#!/usr/bin/env python
# Dan Blankenberg
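"""Galaxy Data Manager that fetches pre-built reference data from the Galaxy
Project rsync server (datacache.g2.bx.psu.edu) and adds the corresponding
entries to the local tool data tables."""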

import sys
import os
import tempfile
import shutil
import optparse
import urllib2
import subprocess
import datetime
from os.path import basename
from json import loads, dumps
from xml.etree.ElementTree import tostring

import logging
_log_name = __name__
if _log_name == '__builtin__':
    _log_name = 'toolshed.installed.g2.rsync.data.manager'
log = logging.getLogger( _log_name )

# Get the Data from the Galaxy Project rsync server
RSYNC_CMD = 'rsync'
RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/"
LOCATION_DIR = "location"
INDEX_DIR = "indexes"

# Pull the Tool Data Table files from github
# FIXME: These files should be accessible from the rsync server directly.
TOOL_DATA_TABLE_CONF_XML_URLS = { 'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
                                  'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }

# Replace data table source entries with local temporary location
GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH )
TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/'

# Some basic caching, so we don't have to reload and download everything every time
CACHE_TIME = datetime.timedelta( minutes=10 )
TOOL_DATA_TABLES_LOADED_BY_URL = {}

# Entries will not be selected by default
DEFAULT_SELECTED = False

# Exclude data tables without a 'path' column or that are in the manual exclude list
PATH_COLUMN_NAMES = ['path']
EXCLUDE_DATA_TABLES = []
# TODO: Make additional handler actions available for tables that can't fit into the basic
# "take the value of path as a dir and copy its contents" approach.
# e.g. mafs: the maf table is goofy and doesn't have a path defined in its <table> def,
# but it does exist in the .loc file.

# --- These methods are called by/within the Galaxy Application

def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
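    """Galaxy 'exec_before_job' hook: ensure that every data table referenced by
    the incoming parameters exists, dynamically creating any missing table
    definitions before the job runs."""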
    # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
    param_dict = dict( **param_dict )
    param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] )
    if not isinstance( param_dict['data_table_entries'], list ):
        param_dict['data_table_entries'] = [param_dict['data_table_entries']]
    param_dict['data_table_entries'] = ",".join( param_dict['data_table_entries'] )
    if tool:
        tool_shed_repository = tool.tool_shed_repository
    else:
        tool_shed_repository = None
    tdtm = None
    data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
    data_table_entries = get_data_table_entries( param_dict )
    data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
    for data_table_name, entries in data_table_entries.iteritems():
        # get data table managed by this data manager
        has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
        if has_data_table:
            has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
        if not has_data_table:
            if tdtm is None:
                from tool_shed.tools import data_table_manager
                tdtm = data_table_manager.ToolDataTableManager( app )
                target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
            # Dynamically add this data table
            log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
            data_table = data_tables[data_table_name]
            repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
            if repo_info is not None:
                repo_info = tostring( repo_info )
            tmp_file = tempfile.NamedTemporaryFile()
            tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) )
            tmp_file.flush()
            app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
            tmp_file.close()

def galaxy_code_get_available_data_tables( trans ):
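    """List the available data tables as ( name, value, selected ) tuples, e.g. for a dynamic select list."""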
    # list of data tables
    found_tables = get_available_tables( trans )
    rval = map( lambda x: ( ( x, x, DEFAULT_SELECTED ) ), found_tables )
    return rval

def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
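    """Return the available data table entries, optionally filtered by dbkey and
    data table names, as ( label, base64-encoded JSON entry, selected ) tuples."""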
    # available entries, optionally filtered by dbkey and table names
    if dbkey in [ None, '', '?' ]:
        dbkey = None
    if data_table_names in [ None, '', '?' ]:
        data_table_names = None
    found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
    dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
    rval = map( lambda x: ( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ), found_tables.items() )
    return rval

# --- End Galaxy called Methods ---


def rsync_urljoin( base, url ):
    # urlparse.urljoin doesn't work correctly for our use-case
    # probably because it doesn't recognize the rsync scheme
    base = base.rstrip( '/' )
    url = url.lstrip( '/' )
    return "%s/%s" % ( base, url )

def rsync_list_dir( server, dir=None, skip_names=[] ):
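    """Run 'rsync --list-only' on the given server (and optional subdirectory)
    and return a dict keyed by entry name, holding the permissions, size, date
    and time parsed from each listing line."""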
    # drwxr-xr-x          50 2014/05/16 20:58:11 .
    if dir:
        dir = rsync_urljoin( server, dir )
    else:
        dir = server
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '--list-only', dir ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_stderr.close()
    rval = {}
    for line in rsync_response:
        perms, line = line.split( None, 1 )
        line = line.strip()
        size, line = line.split( None, 1 )
        line = line.strip()
        date, line = line.split( None, 1 )
        line = line.strip()
        time, line = line.split( None, 1 )
        name = line.strip()
        if name in skip_names:
            continue
        size = size.strip()
        rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time )
    rsync_response.close()
    return rval

def rsync_sync_to_dir( source, target ):
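    """Copy the rsync source (file or directory) into the target directory using
    'rsync -avzP', raising an Exception if the command fails."""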
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_response.close()
    rsync_stderr.close()
    return return_code


def data_table_needs_refresh( cached_data_table, url ):
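    """Return ( needs_refresh, attributes ) for the cached data table: a refresh is
    needed when there is no cache yet, or when the cache is older than CACHE_TIME
    and either the remote tool_data_table_conf.xml or the remote location files
    have changed."""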
    if cached_data_table is None:
        return True, {}
    if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
        data_table_text = urllib2.urlopen( url ).read()
        if cached_data_table.get( 'data_table_text', None ) != data_table_text:
            return True, {'data_table_text': data_table_text}
        loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
            return True, {'loc_file_attrs': loc_file_attrs}
    return False, {}

def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
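    """Download the tool_data_table_conf.xml for the given URL (or named site),
    rsync the remote location (.loc) files into a temporary directory, and load
    them into an instance of data_table_class; results are cached per URL for up
    to CACHE_TIME."""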
    if not url:
        url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
    assert url, ValueError( 'You must provide either a URL or a site=name.' )

    cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
    refresh, attribs = data_table_needs_refresh( cached_data_table, url )
    if refresh:
        data_table_text = attribs.get( 'data_table_text' ) or urllib2.urlopen( url ).read()
        loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )

        tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
        tmp_loc_dir = os.path.join( tmp_dir, 'location' )
        os.mkdir( tmp_loc_dir )
        rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )

        new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
        data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rsync_data_manager_data_table_conf_' )
        data_table_fh.write( new_data_table_text )
        data_table_fh.flush()
        tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
        os.mkdir( tmp_data_dir )
        data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
        for name, data_table in data_tables.data_tables.items():
            if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
                log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
                del data_tables.data_tables[name]
        cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
        TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
        # delete the files
        data_table_fh.close()
        cleanup_before_exit( tmp_dir )
    return cached_data_table

def data_table_has_path_column( data_table ):
    col_names = data_table.get_column_name_list()
    for name in PATH_COLUMN_NAMES:
        if name in col_names:
            return True
    return False

def get_available_tables( trans ):
    # list of data tables
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    return data_tables.get( 'data_tables' ).get_tables().keys()

def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
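    """Build the XML <tables>/<table> definition used to register a new tool data
    table with Galaxy, pointing its <file> entry at a (possibly newly created,
    empty) location file."""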
    sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
    sub_dict.update( data_manager.get_tool_shed_repository_info_dict() )
    if data_table.comment_char:
        sub_dict['comment_char'] = 'comment_char="%s"' % ( data_table.comment_char )
    for i, name in enumerate( data_table.get_column_name_list() ):
        if name is not None:
            sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) )
    location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
    for filename in data_table.filenames.keys():
        sub_dict['file_path'] = basename( filename )
        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] )  # os.path.abspath?
        if not os.path.exists( sub_dict['file_path'] ):
            # Create empty file
            open( sub_dict['file_path'], 'wb+' ).close()
        break
    sub_dict[ 'repo_info' ] = repo_info or ''
    return """
            <tables><table name="%(table_name)s" %(comment_char)s>
                %(columns)s
                <file path="%(file_path)s" />
                %(repo_info)s
            </table></tables>
           """ % sub_dict

def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
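    """Collect the entries of the remote data tables, optionally restricted to the
    given dbkey and/or data table names, keyed by a '<table name>: <JSON entry>'
    label."""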
    my_data_tables = trans.app.tool_data_tables.get_tables()
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    rval = {}
    for name, data_table in data_tables.get( 'data_tables' ).get_tables().iteritems():
        if ( not data_table_names or name in data_table_names ):  # name in my_data_tables.keys() and
            # TODO: check that columns are similar
            if not dbkey:
                entry_getter = data_table.get_named_fields_list()
            else:
                entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
            for entry in entry_getter:
                name = "%s: %s" % ( data_table.name, dumps( entry ) )
                rval[name] = entry
    return rval

def split_path_all( path ):
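    """Split a path into a list of all of its components, e.g. 'a/b/c' -> ['a', 'b', 'c']."""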
    rval = []
    path = path.rstrip( '/' )
    while True:
        head, tail = os.path.split( path )
        if tail:
            rval.append( tail )
            path = head
        elif head:
            rval.append( head )
            break
        else:
            break
    rval.reverse()
    return rval

def get_data_for_path( path, data_root_dir ):
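    """Rsync the reference data for a canonical Galaxy data path into
    data_root_dir, walking up the remote directory tree until a listable
    directory is found, and return the path relative to the canonical root."""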
    # We list the dir with a trailing /, but copy the data without it:
    # listing with / gives a '.' entry when it's a dir;
    # copying without the / places the whole directory inside the target,
    # instead of just that directory's contents
    if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
        path = path[ len( GALAXY_DATA_CANONICAL_PATH ): ]
    make_path = path
    rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path )
    if rsync_source.endswith( '/' ):
        rsync_source = rsync_source[:-1]
    try:
        dir_list = rsync_list_dir( rsync_source + "/" )
    except Exception, e:
        dir_list = None
    while not dir_list or '.' not in dir_list:
        head, tail = os.path.split( make_path )
        if not head:
            head = tail
        make_path = head
        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head )  # if we error here, likely due to a connection issue
        if rsync_source.endswith( '/' ):
            rsync_source = rsync_source[:-1]
        dir_list = rsync_list_dir( rsync_source + "/" )
    split_path = split_path_all( make_path )
    target_path = data_root_dir
    for p in split_path[:-1]:
        target_path = os.path.join( target_path, p )
        if not os.path.exists( target_path ):
            os.mkdir( target_path )
    rsync_sync_to_dir( rsync_source, target_path )
    return path

def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
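    """For each path-like column of the data table entry that points into the
    canonical Galaxy data directory, download the data and rewrite the column
    value to point at the local copy."""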
    path_cols = []
    for key, value in data_table_entry.iteritems():
        if key in PATH_COLUMN_NAMES:
            path_cols.append( ( key, value ) )
    found_data = False
    if path_cols:
        for col_name, value in path_cols:
            # GALAXY_DATA_CANONICAL_PATH
            if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
                data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
                found_data = True
            else:
                print 'unable to determine location of rsync data for', data_table_name, data_table_entry
    return data_table_entry

def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
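    """Fetch the data for every requested data table entry and record the
    (path-munged) entries in the data manager output dictionary."""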
    for data_table_name, entries in data_table_entries.iteritems():
        for entry in entries:
            entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
            _add_data_table_entry( data_manager_dict, data_table_name, entry )
    return data_manager_dict

def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
    data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
    data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
    return data_manager_dict

def cleanup_before_exit( tmp_dir ):
    if tmp_dir and os.path.exists( tmp_dir ):
        shutil.rmtree( tmp_dir )

def get_data_table_entries( params ):
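    """Decode the comma-separated, base64-encoded JSON 'data_table_entries'
    parameter into a dict mapping data table name to a list of entries."""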
    rval = {}
    data_table_entries = params.get( 'data_table_entries', None )
    if data_table_entries:
        for entry_text in data_table_entries.split( ',' ):
            entry_text = entry_text.strip().decode( 'base64' )
            entry_dict = loads( entry_text )
            data_table_name = entry_dict['name']
            data_table_entry = entry_dict['entry']
            rval[ data_table_name ] = rval.get( data_table_name, [] )
            rval[ data_table_name ].append( data_table_entry )
    return rval

def main():
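    """Entry point when run as a Galaxy Data Manager tool: read the JSON params
    file given on the command line, rsync the requested data into the output's
    extra_files_path, and write the resulting data manager JSON back to the same
    file."""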
    # Parse Command Line
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()

    filename = args[0]

    params = loads( open( filename ).read() )
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    os.mkdir( target_directory )
    data_manager_dict = {}

    data_table_entries = get_data_table_entries( params['param_dict'] )

    # Populate the data tables
    data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )

    # save info to json file
    open( filename, 'wb' ).write( dumps( data_manager_dict ) )


if __name__ == "__main__":
    main()