data_manager/data_manager_rsync.py @ 0:70afa70bba41 draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/data_managers/data_manager_rsync_g2 commit 86cf90107482cab1cb47fc0d42d6705f8077daa7

author:   devteam
date:     Fri, 06 Nov 2015 14:18:13 -0500
parents:  (none)
children: 861c071a6cd5
#!/usr/bin/env python
# Dan Blankenberg

import sys
import os
import tempfile
import shutil
import optparse
import urllib2
import subprocess
import datetime
from os.path import basename
from json import loads, dumps
from xml.etree.ElementTree import tostring

import logging
_log_name = __name__
if _log_name == '__builtin__':
    _log_name = 'toolshed.installed.g2.rsync.data.manager'
log = logging.getLogger( _log_name )

# Get the data from the Galaxy Project rsync server
RSYNC_CMD = 'rsync'
RSYNC_SERVER = "rsync://datacache.g2.bx.psu.edu/"
LOCATION_DIR = "location"
INDEX_DIR = "indexes"

# Pull the Tool Data Table files from github
# FIXME: These files should be accessible from the rsync server directly.
TOOL_DATA_TABLE_CONF_XML_URLS = { 'main': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/usegalaxy.org/config/tool_data_table_conf.xml",
                                  'test': "https://raw.githubusercontent.com/galaxyproject/usegalaxy-playbook/master/files/galaxy/test.galaxyproject.org/config/tool_data_table_conf.xml" }

# Replace data table source entries with local temporary location
GALAXY_DATA_CANONICAL_PATH = "/galaxy/data/"
TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE = '<file path="%slocation/' % ( GALAXY_DATA_CANONICAL_PATH )
TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET = '<file path="%s/'
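
# Editor's note (illustrative sketch, .loc file name hypothetical): the
# tool_data_table_conf.xml served at the URLs above references .loc files under the
# canonical server path, e.g.
#     <file path="/galaxy/data/location/all_fasta.loc" />
# After the replacement is applied in load_data_tables_from_url() below with a
# temporary directory, the entry would instead read
#     <file path="/tmp/rsync_g2_XXXXXX/location/all_fasta.loc" />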

# Some basic caching, so we don't have to reload and download everything every time
CACHE_TIME = datetime.timedelta( minutes=10 )
TOOL_DATA_TABLES_LOADED_BY_URL = {}

# Entries will not be selected by default
DEFAULT_SELECTED = False

# Exclude data tables without a 'path' column or that are in the manual exclude list
PATH_COLUMN_NAMES = ['path']
EXCLUDE_DATA_TABLES = []
# TODO: Make additional handler actions available for tables that can't fit into the basic
#       "take the value of path as a dir and copy contents" model,
#       e.g. mafs. The maf table is goofy: it doesn't have a path defined in its <table>
#       def, but the path does exist in the .loc file.

# --- These methods are called by/within the Galaxy Application

def exec_before_job( app, inp_data, out_data, param_dict, tool=None, **kwd ):
    # Look for any data tables that haven't been defined for this data manager before and dynamically add them to Galaxy
    param_dict = dict( **param_dict )
    param_dict['data_table_entries'] = param_dict.get( 'data_table_entries', [] )
    if not isinstance( param_dict['data_table_entries'], list ):
        param_dict['data_table_entries'] = [param_dict['data_table_entries']]
    param_dict['data_table_entries'] = ",".join( param_dict['data_table_entries'] )
    if tool:
        tool_shed_repository = tool.tool_shed_repository
    else:
        tool_shed_repository = None
    tdtm = None
    data_manager = app.data_managers.get_manager( tool.data_manager_id, None )
    data_table_entries = get_data_table_entries( param_dict )
    data_tables = load_data_tables_from_url( data_table_class=app.tool_data_tables.__class__ ).get( 'data_tables' )
    for data_table_name, entries in data_table_entries.iteritems():
        # get the data table managed by this data manager
        has_data_table = app.tool_data_tables.get_tables().get( data_table_name )
        if has_data_table:
            has_data_table = bool( has_data_table.get_filename_for_source( data_manager, None ) )
        if not has_data_table:
            if tdtm is None:
                from tool_shed.tools import data_table_manager
                tdtm = data_table_manager.ToolDataTableManager( app )
                target_dir, tool_path, relative_target_dir = tdtm.get_target_install_dir( tool_shed_repository )
            # Dynamically add this data table
            log.debug( "Attempting to dynamically create a missing Tool Data Table named %s." % data_table_name )
            data_table = data_tables[data_table_name]
            repo_info = tdtm.generate_repository_info_elem_from_repository( tool_shed_repository, parent_elem=None )
            if repo_info is not None:
                repo_info = tostring( repo_info )
            tmp_file = tempfile.NamedTemporaryFile()
            tmp_file.write( get_new_xml_definition( app, data_table, data_manager, repo_info, target_dir ) )
            tmp_file.flush()
            app.tool_data_tables.add_new_entries_from_config_file( tmp_file.name, None, app.config.shed_tool_data_table_config, persist=True )
            tmp_file.close()

def galaxy_code_get_available_data_tables( trans ):
    # list of data tables
    found_tables = get_available_tables( trans )
    rval = map( lambda x: ( x, x, DEFAULT_SELECTED ), found_tables )
    return rval

def galaxy_code_get_available_data_tables_entries( trans, dbkey, data_table_names ):
    # available entries, optionally filtered by dbkey and table names
    if dbkey in [ None, '', '?' ]:
        dbkey = None
    if data_table_names in [ None, '', '?' ]:
        data_table_names = None
    found_tables = get_available_tables_for_dbkey( trans, dbkey, data_table_names )
    dbkey_text = '(%s) ' % ( dbkey ) if dbkey else ''
    rval = map( lambda x: ( "%s%s" % ( dbkey_text, x[0] ), dumps( dict( name=x[0].split( ': ' )[0], entry=x[1] ) ).encode( 'base64' ).rstrip(), DEFAULT_SELECTED ), found_tables.items() )
    return rval

# --- End Galaxy called Methods ---


def rsync_urljoin( base, url ):
    # urlparse.urljoin doesn't work correctly for our use-case,
    # probably because it doesn't recognize the rsync scheme
    base = base.rstrip( '/' )
    url = url.lstrip( '/' )
    return "%s/%s" % ( base, url )
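
# Editor's note (illustrative example): rsync_urljoin() joins rsync:// URLs that
# urlparse.urljoin would mangle because it doesn't know the scheme, e.g.
#     rsync_urljoin( 'rsync://datacache.g2.bx.psu.edu/', '/indexes' )
#     # -> 'rsync://datacache.g2.bx.psu.edu/indexes'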

def rsync_list_dir( server, dir=None, skip_names=[] ):
    # Parse `rsync --list-only` output lines, which look like:
    # drwxr-xr-x          50 2014/05/16 20:58:11 .
    if dir:
        dir = rsync_urljoin( server, dir )
    else:
        dir = server
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '--list-only', dir ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_stderr.close()
    rval = {}
    for line in rsync_response:
        perms, line = line.split( None, 1 )
        line = line.strip()
        size, line = line.split( None, 1 )
        line = line.strip()
        date, line = line.split( None, 1 )
        line = line.strip()
        time, line = line.split( None, 1 )
        name = line.strip()
        if name in skip_names:
            continue
        rval[ name ] = dict( name=name, permissions=perms, bytes=size, date=date, time=time )
    rsync_response.close()
    return rval
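
# Editor's note (illustrative example, directory name hypothetical): for a listing
# line like
#     drwxr-xr-x          50 2014/05/16 20:58:11 hg19
# rsync_list_dir() returns
#     { 'hg19': dict( name='hg19', permissions='drwxr-xr-x', bytes='50',
#                     date='2014/05/16', time='20:58:11' ) }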

def rsync_sync_to_dir( source, target ):
    rsync_response = tempfile.NamedTemporaryFile()
    rsync_stderr = tempfile.NamedTemporaryFile()
    rsync_cmd = [ RSYNC_CMD, '-avzP', source, target ]
    return_code = subprocess.call( rsync_cmd, stdout=rsync_response, stderr=rsync_stderr )
    rsync_response.flush()
    rsync_response.seek(0)
    rsync_stderr.flush()
    rsync_stderr.seek(0)
    if return_code:
        msg = "stdout:\n%s\nstderr:\n%s" % ( rsync_response.read(), rsync_stderr.read() )
        rsync_response.close()
        rsync_stderr.close()
        raise Exception( 'Failed to execute rsync command (%s), returncode=%s. Rsync_output:\n%s' % ( rsync_cmd, return_code, msg ) )
    rsync_response.close()
    rsync_stderr.close()
    return return_code
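
# Editor's note (illustrative usage, paths hypothetical):
#     rsync_sync_to_dir( 'rsync://datacache.g2.bx.psu.edu/indexes/hg19', '/tmp/data' )
# Because the source has no trailing slash, rsync copies the 'hg19' directory itself
# into '/tmp/data' rather than just its contents (see get_data_for_path() below).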


def data_table_needs_refresh( cached_data_table, url ):
    if cached_data_table is None:
        return True, {}
    if datetime.datetime.now() - cached_data_table.get( 'time_loaded' ) > CACHE_TIME:
        data_table_text = urllib2.urlopen( url ).read()
        if cached_data_table.get( 'data_table_text', None ) != data_table_text:
            return True, { 'data_table_text': data_table_text }
        loc_file_attrs = rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )
        if cached_data_table.get( 'loc_file_attrs', None ) != loc_file_attrs:
            return True, { 'loc_file_attrs': loc_file_attrs }
    return False, {}

def load_data_tables_from_url( url=None, site='main', data_table_class=None ):
    if not url:
        url = TOOL_DATA_TABLE_CONF_XML_URLS.get( site, None )
    assert url, ValueError( 'You must provide either a URL or a site=name.' )

    cached_data_table = TOOL_DATA_TABLES_LOADED_BY_URL.get( url, None )
    refresh, attribs = data_table_needs_refresh( cached_data_table, url )
    if refresh:
        data_table_text = attribs.get( 'data_table_text' ) or urllib2.urlopen( url ).read()
        loc_file_attrs = attribs.get( 'loc_file_attrs' ) or rsync_list_dir( RSYNC_SERVER, LOCATION_DIR )

        tmp_dir = tempfile.mkdtemp( prefix='rsync_g2_' )
        tmp_loc_dir = os.path.join( tmp_dir, 'location' )
        os.mkdir( tmp_loc_dir )
        rsync_sync_to_dir( rsync_urljoin( RSYNC_SERVER, LOCATION_DIR ), os.path.abspath( tmp_loc_dir ) )

        new_data_table_text = data_table_text.replace( TOOL_DATA_TABLE_CONF_XML_REPLACE_SOURCE, TOOL_DATA_TABLE_CONF_XML_REPLACE_TARGET % ( tmp_loc_dir ) )
        data_table_fh = tempfile.NamedTemporaryFile( dir=tmp_dir, prefix='rsync_data_manager_data_table_conf_' )
        data_table_fh.write( new_data_table_text )
        data_table_fh.flush()
        tmp_data_dir = os.path.join( tmp_dir, 'tool-data' )
        os.mkdir( tmp_data_dir )
        data_tables = data_table_class( tmp_data_dir, config_filename=data_table_fh.name )
        for name, data_table in data_tables.data_tables.items():
            if name in EXCLUDE_DATA_TABLES or not data_table_has_path_column( data_table ):
                log.debug( 'Removing data table "%s" because it is excluded by name or does not have a defined "path" column.', name )
                del data_tables.data_tables[name]
        cached_data_table = { 'data_tables': data_tables, 'tmp_dir': tmp_dir, 'data_table_text': data_table_text, 'tmp_loc_dir': tmp_loc_dir, 'loc_file_attrs': loc_file_attrs, 'time_loaded': datetime.datetime.now() }
        TOOL_DATA_TABLES_LOADED_BY_URL[ url ] = cached_data_table
        # delete the temporary files
        data_table_fh.close()
        cleanup_before_exit( tmp_dir )
    return cached_data_table
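
# Editor's note (illustrative usage; MyToolDataTableManager stands in for a Galaxy
# ToolDataTableManager-compatible class and is hypothetical):
#     cached = load_data_tables_from_url( site='main', data_table_class=MyToolDataTableManager )
#     data_tables = cached['data_tables']  # tables backed by the rsync'd .loc files
# Results are memoized in TOOL_DATA_TABLES_LOADED_BY_URL and refreshed per CACHE_TIME.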

def data_table_has_path_column( data_table ):
    col_names = data_table.get_column_name_list()
    for name in PATH_COLUMN_NAMES:
        if name in col_names:
            return True
    return False

def get_available_tables( trans ):
    # list of data tables
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    return data_tables.get( 'data_tables' ).get_tables().keys()

def get_new_xml_definition( app, data_table, data_manager, repo_info=None, location_file_dir=None ):
    sub_dict = { 'table_name': data_table.name, 'comment_char': '', 'columns': '', 'file_path': '' }
    sub_dict.update( data_manager.get_tool_shed_repository_info_dict() )
    if data_table.comment_char:
        sub_dict['comment_char'] = 'comment_char="%s"' % ( data_table.comment_char )
    for i, name in enumerate( data_table.get_column_name_list() ):
        if name is not None:
            sub_dict['columns'] = "%s\n%s" % ( sub_dict['columns'], '<column name="%s" index="%s" />' % ( name, i ) )
    location_file_dir = location_file_dir or app.config.galaxy_data_manager_data_path
    for filename in data_table.filenames.keys():
        sub_dict['file_path'] = basename( filename )
        sub_dict['file_path'] = os.path.join( location_file_dir, sub_dict['file_path'] )  # os.path.abspath?
        if not os.path.exists( sub_dict['file_path'] ):
            # Create empty file
            open( sub_dict['file_path'], 'wb+' ).close()
        break
    sub_dict[ 'repo_info' ] = repo_info or ''
    return """
<tables><table name="%(table_name)s" %(comment_char)s>
    %(columns)s
    <file path="%(file_path)s" />
    %(repo_info)s
</table></tables>
""" % sub_dict

def get_available_tables_for_dbkey( trans, dbkey, data_table_names ):
    my_data_tables = trans.app.tool_data_tables.get_tables()
    data_tables = load_data_tables_from_url( data_table_class=trans.app.tool_data_tables.__class__ )
    rval = {}
    for name, data_table in data_tables.get( 'data_tables' ).get_tables().iteritems():
        if not data_table_names or name in data_table_names:  # name in my_data_tables.keys() and
            # TODO: check that columns are similar
            if not dbkey:
                entry_getter = data_table.get_named_fields_list()
            else:
                entry_getter = data_table.get_entries( 'dbkey', dbkey, None, default=[] )
            for entry in entry_getter:
                name = "%s: %s" % ( data_table.name, dumps( entry ) )
                rval[name] = entry
    return rval

def split_path_all( path ):
    rval = []
    path = path.rstrip( '/' )
    while True:
        head, tail = os.path.split( path )
        if tail:
            rval.append( tail )
            path = head
        elif head:
            rval.append( head )
            break
        else:
            break
    rval.reverse()
    return rval
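
# Editor's note (illustrative examples): split_path_all() splits a path into all of
# its components, unlike a single os.path.split() call, e.g.
#     split_path_all( 'hg19/sam_index/' )    # -> [ 'hg19', 'sam_index' ]
#     split_path_all( '/galaxy/data/hg19' )  # -> [ '/', 'galaxy', 'data', 'hg19' ]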

def get_data_for_path( path, data_root_dir ):
    # We list the remote dir with a trailing slash, but copy the data without one:
    # listing with the slash yields a '.' entry when the target is a directory,
    # while syncing without the slash copies the whole directory into the target
    # instead of just the directory's contents.
    if path.startswith( GALAXY_DATA_CANONICAL_PATH ):
        path = path[ len( GALAXY_DATA_CANONICAL_PATH ): ]
    make_path = path
    rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), path )
    if rsync_source.endswith( '/' ):
        rsync_source = rsync_source[:-1]
    try:
        dir_list = rsync_list_dir( rsync_source + "/" )
    except Exception:
        dir_list = None
    # Walk up the path until we find a directory that exists on the server
    while not dir_list or '.' not in dir_list:
        head, tail = os.path.split( make_path )
        if not head:
            head = tail
        make_path = head
        rsync_source = rsync_urljoin( rsync_urljoin( RSYNC_SERVER, INDEX_DIR ), head )  # if we error here, it is likely due to a connection issue
        if rsync_source.endswith( '/' ):
            rsync_source = rsync_source[:-1]
        dir_list = rsync_list_dir( rsync_source + "/" )
    split_path = split_path_all( make_path )
    target_path = data_root_dir
    for p in split_path[:-1]:
        target_path = os.path.join( target_path, p )
        if not os.path.exists( target_path ):
            os.mkdir( target_path )
    rsync_sync_to_dir( rsync_source, target_path )
    return path
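
# Editor's note (illustrative sketch, paths hypothetical): given a .loc entry path of
# '/galaxy/data/hg19/sam_index', get_data_for_path() strips the canonical prefix,
# probes 'rsync://datacache.g2.bx.psu.edu/indexes/hg19/sam_index/', walking up the
# path until it finds a directory that exists on the server, syncs that directory
# under data_root_dir, and returns the relative path 'hg19/sam_index'.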

def get_data_and_munge_path( data_table_name, data_table_entry, data_root_dir ):
    path_cols = []
    for key, value in data_table_entry.iteritems():
        if key in PATH_COLUMN_NAMES:
            path_cols.append( ( key, value ) )
    if path_cols:
        for col_name, value in path_cols:
            if value.startswith( GALAXY_DATA_CANONICAL_PATH ):
                data_table_entry[col_name] = get_data_for_path( value, data_root_dir )
            else:
                print 'unable to determine location of rsync data for', data_table_name, data_table_entry
    return data_table_entry

def fulfill_data_table_entries( data_table_entries, data_manager_dict, data_root_dir ):
    for data_table_name, entries in data_table_entries.iteritems():
        for entry in entries:
            entry = get_data_and_munge_path( data_table_name, entry, data_root_dir )
            _add_data_table_entry( data_manager_dict, data_table_name, entry )
    return data_manager_dict

def _add_data_table_entry( data_manager_dict, data_table_name, data_table_entry ):
    data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get( data_table_name, [] )
    data_manager_dict['data_tables'][data_table_name].append( data_table_entry )
    return data_manager_dict

def cleanup_before_exit( tmp_dir ):
    if tmp_dir and os.path.exists( tmp_dir ):
        shutil.rmtree( tmp_dir )

def get_data_table_entries( params ):
    rval = {}
    data_table_entries = params.get( 'data_table_entries', None )
    if data_table_entries:
        for entry_text in data_table_entries.split( ',' ):
            entry_text = entry_text.strip().decode( 'base64' )
            entry_dict = loads( entry_text )
            data_table_name = entry_dict['name']
            data_table_entry = entry_dict['entry']
            rval[ data_table_name ] = rval.get( data_table_name, [] )
            rval[ data_table_name ].append( data_table_entry )
    return rval
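
# Editor's note (illustrative example, table name and fields hypothetical): each
# comma-separated token in params['data_table_entries'] is base64-encoded JSON, e.g.
#     dumps( dict( name='all_fasta', entry=dict( value='hg19', dbkey='hg19',
#                  path='/galaxy/data/hg19/seq/hg19.fa' ) ) ).encode( 'base64' )
# for which get_data_table_entries() returns { 'all_fasta': [ <entry dict> ] }.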

def main():
    # Parse command line
    parser = optparse.OptionParser()
    (options, args) = parser.parse_args()

    filename = args[0]

    params = loads( open( filename ).read() )
    target_directory = params[ 'output_data' ][0]['extra_files_path']
    os.mkdir( target_directory )
    data_manager_dict = {}

    data_table_entries = get_data_table_entries( params['param_dict'] )

    # Populate the data tables
    data_manager_dict = fulfill_data_table_entries( data_table_entries, data_manager_dict, target_directory )

    # Save info to the JSON file
    open( filename, 'wb' ).write( dumps( data_manager_dict ) )

if __name__ == "__main__":
    main()