# HG changeset patch
# User greg
# Date 1479841744 18000
# Node ID 11ea0659100f8e8d5cb41f62a060cc0bf1d53ecb
Uploaded

diff -r 000000000000 -r 11ea0659100f data_manager/data_manager_plant_tribes_scaffolds_download.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_plant_tribes_scaffolds_download.py	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+#
+# Data manager for downloading Plant Tribes scaffolds data.
+import argparse
+import json
+import os
+import shutil
+import sys
+import tarfile
+import urllib2
+import zipfile
+
+
+DEFAULT_DATA_TABLE_NAMES = ["plant_tribes_scaffolds"]
+
+
+def add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
+    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
+    return data_manager_dict
+
+
+def files_from_file_paths(file_paths):
+    """
+    Given a list of file system paths, return a list of
+    absolute paths for all files found within those paths,
+    descending into directories recursively.
+    """
+    files = []
+    for file_path in file_paths:
+        file_path = os.path.abspath(file_path)
+        if os.path.isfile(file_path):
+            # Store the full path for each file.
+            files.append(file_path)
+        elif os.path.isdir(file_path):
+            # Descend into the directory and collect its files,
+            # wrapping each entry in a list since this function
+            # expects a list of paths, not a single path string.
+            for f in os.listdir(file_path):
+                files.extend(files_from_file_paths([os.path.join(file_path, f)]))
+    return files
+
+
+def import_from_server(data_manager_dict, target_directory, file_system_paths, description, link_to_data=False, data_table_names=DEFAULT_DATA_TABLE_NAMES):
+    """
+    Create references in the data table to the specified
+    file(s) on the Galaxy server.
+    """
+    # Remove escapes for '\n' and '\r' that might have been inserted by Galaxy.
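+    # (Galaxy encodes the newline and carriage return characters in text
+    # parameters as '__cn__' and '__cr__'; undoing that here lets users
+    # supply multiple paths, one per line.)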
+    file_paths = file_system_paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
+    files = files_from_file_paths(file_paths)
+    for f in files:
+        source_file = os.path.basename(f)
+        target_file = os.path.join(target_directory, source_file)
+        entry_name = source_file
+        if link_to_data:
+            os.symlink(f, target_file)
+        else:
+            shutil.copyfile(f, target_file)
+        for data_table_name in data_table_names:
+            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, dict(value=source_file, name=entry_name, path=f, description=description))
+    return data_manager_dict
+
+
+def make_directory(dir):
+    if not os.path.exists(dir):
+        os.makedirs(dir)
+
+
+def remove_directory(dir):
+    if os.path.exists(dir):
+        shutil.rmtree(dir)
+
+
+def url_download(data_manager_dict, target_directory, url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES):
+    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'scaffolds'))
+    make_directory(work_directory)
+    file_path = os.path.join(work_directory, 'download.dat')
+    src = None
+    dst = None
+    try:
+        req = urllib2.Request(url)
+        src = urllib2.urlopen(req)
+        dst = open(file_path, 'wb')
+        while True:
+            chunk = src.read(2**10)
+            if chunk:
+                dst.write(chunk)
+            else:
+                break
+    except Exception, e:
+        # The download failed, so there is nothing to extract or catalog.
+        print >>sys.stderr, str(e)
+        return data_manager_dict
+    finally:
+        if src:
+            src.close()
+        if dst:
+            dst.close()
+    if tarfile.is_tarfile(file_path):
+        fh = tarfile.open(file_path, 'r:*')
+    elif zipfile.is_zipfile(file_path):
+        fh = zipfile.ZipFile(file_path, 'r')
+    else:
+        # The download is neither a tar file nor a zip file.
+        return data_manager_dict
+    fh.extractall(work_directory)
+    os.remove(file_path)
+    # Move the scaffolds data files into the defined output directory.
+    for filename in os.listdir(work_directory):
+        shutil.move(os.path.join(work_directory, filename), target_directory)
+    remove_directory(work_directory)
+    # Populate the data table; there should be a single entry in target_directory.
+    for filename in os.listdir(target_directory):
+        # os.listdir() returns base names, so rebuild the full path
+        # for the data table entry.
+        file_path = os.path.join(target_directory, filename)
+        for data_table_name in data_table_names:
+            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, dict(value=filename, name=filename, path=file_path, description=description))
+    return data_manager_dict
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--data_source', dest='data_source', help='Data source')
+parser.add_argument('--description', dest='description', default=None, help='Description')
+parser.add_argument('--create_symlink', dest='create_symlink', default=None, help='Link files instead of copying')
+parser.add_argument('--file_system_paths', dest='file_system_paths', default=None, help='File system paths')
+parser.add_argument('--md5_checksum', dest='md5_checksum', default=None, help='MD5 checksum')
+parser.add_argument('--name', dest='name', help='Data table entry unique ID')
+parser.add_argument('--out_file', dest='out_file', help='JSON output file')
+parser.add_argument('--web_url', dest='web_url', default=None, help='Web URL')
+
+args = parser.parse_args()
+
+# Some magic happens with tools of type "manage_data": the output
+# file already contains JSON data that allows us to determine the
+# target directory.
+params = json.loads(open(args.out_file).read())
+target_directory = params['output_data'][0]['extra_files_path']
+make_directory(target_directory)
+
+data_table_names = DEFAULT_DATA_TABLE_NAMES
+if args.description is None:
+    description = ''
+else:
+    description = args.description.strip()
+
+# Initialize the data table.
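+# Note: the rows that actually populate the plant_tribes_scaffolds data
+# table are accumulated under the 'data_tables' key by add_data_table_entry();
+# the top-level keys set below are metadata describing this run.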
+data_manager_dict = {}
+data_manager_dict['value'] = args.name.lower()
+data_manager_dict['name'] = args.name
+data_manager_dict['path'] = '.'
+data_manager_dict['description'] = description
+
+# Get the scaffolds data.
+if args.data_source == 'web_url':
+    data_manager_dict = url_download(data_manager_dict, target_directory, args.web_url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES)
+else:
+    data_manager_dict = import_from_server(data_manager_dict, target_directory, args.file_system_paths, description, link_to_data=args.create_symlink, data_table_names=DEFAULT_DATA_TABLE_NAMES)
+
+# Write the JSON output dataset.
+open(args.out_file, 'w').write(json.dumps(data_manager_dict))
diff -r 000000000000 -r 11ea0659100f data_manager/data_manager_plant_tribes_scaffolds_download.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_plant_tribes_scaffolds_download.xml	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,80 @@
+<!-- tool/command/inputs markup not recoverable from this upload; the help and citation text follows -->
+    <help>
+.. class:: infomark
+
+**What it does**
+
+This tool fetches scaffolds data used by the Plant Tribes Galaxy tools and populates the plant_tribes_scaffolds data table.
+
+The scaffolds data can be imported using a URL, with an optional MD5 checksum, or from files stored in a directory that is accessible to the Galaxy server.
+An optional description can be provided that will appear next to the scaffolds file name in the data table entry.
+
+------
+
+**Importing from a URL**
+
+Scaffolds data provided by the Floral Genome Project can be downloaded from:
+
+ * 22 plant genomes (Angiosperms clusters, version 1.0): http://fgp.huck.psu.edu/planttribes_data/22Gv1.0.tar.bz2
+ * 22 plant genomes (Angiosperms clusters, version 1.1): http://fgp.huck.psu.edu/planttribes_data/22Gv1.1.tar.bz2
+
+**Importing from file system paths**
+
+Use this option to import Plant Tribes scaffolds data that is available on the Galaxy server's file system. Each
+**File system paths** value must be the root directory of the uncompressed, extracted scaffolds data.
+    </help>
+    <citations>
+        <citation type="bibtex">
+            @unpublished{None,
+                author = {Greg Von Kuster},
+                title = {None},
+                year = {None},
+                eprint = {None},
+                url = {None}
+            }
+        </citation>
+    </citations>
+</tool>
diff -r 000000000000 -r 11ea0659100f data_manager_conf.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_plant_tribes_scaffolds_download.xml" id="data_manager_plant_tribes_scaffolds_download">
+        <data_table name="plant_tribes_scaffolds">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="out_file">
+                    <move type="directory">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">plant_tribes/scaffolds/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/plant_tribes/scaffolds/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="description" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff -r 000000000000 -r 11ea0659100f tool-data/plant_tribes_scaffolds.loc.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/plant_tribes_scaffolds.loc.sample	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,4 @@
+## Plant Tribes scaffolds
+#Value	Name	Path	Description
+#22Gv1.0	22Gv1.0	/plant_tribes/scaffolds/22Gv1.0	22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0)
+#22Gv1.1	22Gv1.1	/plant_tribes/scaffolds/22Gv1.1	22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
diff -r 000000000000 -r 11ea0659100f tool_data_table_conf.xml.sample
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="plant_tribes_scaffolds" comment_char="#">
+        <columns>value, name, path, description</columns>
+        <file path="tool-data/plant_tribes_scaffolds.loc" />
+    </table>
+</tables>
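
The "magic" JSON handshake mentioned in the script comments works roughly as
sketched below. This is a minimal illustration rather than captured output:
the directory name is a hypothetical placeholder, and Galaxy writes
additional keys that vary by release; the script relies only on
output_data[0]['extra_files_path'] on the way in and 'data_tables' on the
way out.

    # Minimal sketch of the data manager JSON protocol (Python 2, to match
    # the script above). All literal values are hypothetical.
    import json

    # Galaxy pre-populates the --out_file dataset with job parameters,
    # including the directory the data manager should write into.
    params = {
        'output_data': [
            {'extra_files_path': '/galaxy/database/jobs/000/42/dataset_42_files'}
        ]
    }
    target_directory = params['output_data'][0]['extra_files_path']

    # On exit, the script overwrites the same file with the rows to add,
    # e.g. after downloading and unpacking 22Gv1.0.tar.bz2:
    result = {
        'data_tables': {
            'plant_tribes_scaffolds': [
                {'value': '22Gv1.0',
                 'name': '22Gv1.0',
                 'path': target_directory + '/22Gv1.0',
                 'description': '22 plant genomes (Angiosperms clusters, version 1.0)'}
            ]
        }
    }
    print json.dumps(result)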
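
After a successful run, the <move> directive in data_manager_conf.xml
relocates the extracted directory to
${GALAXY_DATA_MANAGER_DATA_PATH}/plant_tribes/scaffolds/${value}, and the
managed plant_tribes_scaffolds.loc file gains a tab-separated row along
these lines (assuming, hypothetically, that GALAXY_DATA_MANAGER_DATA_PATH
is /galaxy/tool-data):

    22Gv1.0	22Gv1.0	/galaxy/tool-data/plant_tribes/scaffolds/22Gv1.0	22 plant genomes (Angiosperms clusters, version 1.0)

Downstream Plant Tribes tools then see this row through the
value/name/path/description columns declared in tool_data_table_conf.xml.sample.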