Mercurial repository: greg / data_manager_plant_tribes_scaffolds_downloader
changeset 0:11ea0659100f (draft)
description: Uploaded

author:    greg
date:      Tue, 22 Nov 2016 14:09:04 -0500
parents:   (none)
children:  e2b94760e4fa
files:     data_manager/data_manager_plant_tribes_scaffolds_download.py
           data_manager/data_manager_plant_tribes_scaffolds_download.xml
           data_manager_conf.xml
           tool-data/plant_tribes_scaffolds.loc.sample
           tool_data_table_conf.xml.sample
diffstat:  5 files changed, 267 insertions(+), 0 deletions(-)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_plant_tribes_scaffolds_download.py	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+#
+# Data manager for downloading Plant Tribes scaffolds data.
+import argparse
+import json
+import os
+import shutil
+import sys
+import tarfile
+import urllib2
+import zipfile
+
+
+DEFAULT_DATA_TABLE_NAMES = ["plant_tribes_scaffolds"]
+
+
+def add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
+    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
+    data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get(data_table_name, [])
+    data_manager_dict['data_tables'][data_table_name].append(data_table_entry)
+    return data_manager_dict
+
+
+def files_from_file_paths(file_paths):
+    """
+    Given a list of file system paths, return a list of
+    absolute paths for all files within those paths and
+    their sub-directories.
+    """
+    files = []
+    for file_path in file_paths:
+        file_path = os.path.abspath(file_path)
+        if os.path.isfile(file_path):
+            # Store the full path for each file.
+            files.append(file_path)
+        elif os.path.isdir(file_path):
+            # Descend into the directory and collect its files.  The
+            # recursive call expects a list, so wrap the child path.
+            for f in os.listdir(file_path):
+                files.extend(files_from_file_paths([os.path.join(file_path, f)]))
+    return files
+
+
+def import_from_server(data_manager_dict, target_directory, file_system_paths, description, create_symlink=False, data_table_names=DEFAULT_DATA_TABLE_NAMES):
+    """
+    Create data table entries that reference the specified
+    file(s) on the Galaxy server.
+    """
+    # Remove escapes for '\n' and '\r' that might have been inserted by Galaxy.
+    file_paths = file_system_paths.replace('__cn__', '\n').replace('__cr__', '\r').split()
+    files = files_from_file_paths(file_paths)
+    for f in files:
+        source_file = os.path.basename(f)
+        target_file = os.path.join(target_directory, source_file)
+        entry_name = source_file
+        if create_symlink:
+            os.symlink(f, target_file)
+        else:
+            shutil.copyfile(f, target_file)
+        for data_table_name in data_table_names:
+            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, dict(value=source_file, name=entry_name, path=f, description=description))
+    return data_manager_dict
+
+
+def make_directory(dir):
+    if not os.path.exists(dir):
+        os.makedirs(dir)
+
+
+def remove_directory(dir):
+    if os.path.exists(dir):
+        shutil.rmtree(dir)
+
+
+def url_download(data_manager_dict, target_directory, url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES):
+    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'scaffolds'))
+    make_directory(work_directory)
+    file_path = os.path.join(work_directory, 'download.dat')
+    src = None
+    dst = None
+    try:
+        req = urllib2.Request(url)
+        src = urllib2.urlopen(req)
+        dst = open(file_path, 'wb')
+        while True:
+            chunk = src.read(2**10)
+            if chunk:
+                dst.write(chunk)
+            else:
+                break
+    except Exception, e:
+        print >> sys.stderr, str(e)
+    finally:
+        if src:
+            src.close()
+        if dst:
+            dst.close()
+    if tarfile.is_tarfile(file_path):
+        fh = tarfile.open(file_path, 'r:*')
+    elif zipfile.is_zipfile(file_path):
+        fh = zipfile.ZipFile(file_path, 'r')
+    else:
+        # Not an archive we can handle; return the dict unchanged so the
+        # caller still receives a valid value.
+        return data_manager_dict
+    fh.extractall(work_directory)
+    os.remove(file_path)
+    # Move the scaffolds data files into the defined output directory.
+    for filename in os.listdir(work_directory):
+        shutil.move(os.path.join(work_directory, filename), target_directory)
+    remove_directory(work_directory)
+    # Populate the data table; there should be a single entry in target_directory.
+    for file_path in os.listdir(target_directory):
+        entry_name = os.path.basename(file_path)
+        for data_table_name in data_table_names:
+            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, dict(value=entry_name, name=entry_name, path=file_path, description=description))
+    return data_manager_dict
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--data_source', dest='data_source', help='Data source')
+parser.add_argument('--description', dest='description', default=None, help='Description')
+parser.add_argument('--create_symlink', dest='create_symlink', default=None, help='Link files instead of copying')
+parser.add_argument('--file_system_paths', dest='file_system_paths', default=None, help='File system paths')
+parser.add_argument('--md5_checksum', dest='md5_checksum', default=None, help='MD5 checksum')
+parser.add_argument('--name', dest='name', help='Data table entry unique ID')
+parser.add_argument('--out_file', dest='out_file', help='JSON output file')
+parser.add_argument('--web_url', dest='web_url', default=None, help='Web URL')
+
+args = parser.parse_args()
+
+# For tools of type "manage_data", Galaxy pre-populates the output file
+# with JSON that allows us to define the target directory.
+params = json.loads(open(args.out_file).read())
+target_directory = params['output_data'][0]['extra_files_path']
+make_directory(target_directory)
+
+data_table_names = DEFAULT_DATA_TABLE_NAMES
+if args.description is None:
+    description = ''
+else:
+    description = args.description.strip()
+
+# Initialize the data table.
+data_manager_dict = {}
+data_manager_dict['value'] = args.name.lower()
+data_manager_dict['name'] = args.name
+data_manager_dict['path'] = '.'
+data_manager_dict['description'] = description
+
+# Get the scaffolds data.
+if args.data_source == 'web_url':
+    data_manager_dict = url_download(data_manager_dict, target_directory, args.web_url, description, data_table_names=data_table_names)
+else:
+    data_manager_dict = import_from_server(data_manager_dict, target_directory, args.file_system_paths, description, create_symlink=(args.create_symlink == 'create_symlink'), data_table_names=data_table_names)

+# Write the JSON output dataset.
+open(args.out_file, 'w').write(json.dumps(data_manager_dict))
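The comment above about "manage_data" tools deserves a concrete illustration: Galaxy pre-populates the tool's output file with JSON naming the directory where extra files may be written, and the script replaces that file with the data table entries it built. A minimal sketch of both payloads; the extra_files_path and entry values below are illustrative, not produced by a real run:

    import json

    # Shape of the JSON Galaxy writes into out_file before the script runs
    # (the path shown here is hypothetical).
    params = {
        'output_data': [
            {'extra_files_path': '/galaxy/jobs/000/42/dataset_42_files'}
        ]
    }

    # Shape of the JSON the script writes back after downloading 22Gv1.0.
    data_manager_dict = {
        'data_tables': {
            'plant_tribes_scaffolds': [
                {'value': '22Gv1.0', 'name': '22Gv1.0', 'path': '22Gv1.0',
                 'description': '22 plant genomes (Angiosperms clusters, version 1.0)'}
            ]
        }
    }
    print json.dumps(data_manager_dict)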
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_plant_tribes_scaffolds_download.xml	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,80 @@
+<tool id="data_manager_plant_tribes_scaffolds_download" name="Plant Tribes Scaffolds Download" version="1.0.0" tool_type="manage_data">
+    <description></description>
+    <stdio>
+        <exit_code range=":-1" level="fatal" description="Error: Cannot open file" />
+        <exit_code range="1:" level="fatal" description="Error" />
+    </stdio>
+    <command>
+        <![CDATA[
+        python $__tool_directory__/data_manager_plant_tribes_scaffolds_download.py
+        --data_source=$data_source_cond.data_source
+        --name "$name"
+        #if str($data_source_cond.data_source) == "web_url":
+            --web_url "$data_source_cond.web_url"
+            --md5_checksum "$data_source_cond.md5_checksum"
+        #else:
+            --file_system_paths "$data_source_cond.file_system_paths"
+            --create_symlink $data_source_cond.create_symlink
+        #end if
+        --description "$description"
+        --out_file "$out_file"
+        ]]>
+    </command>
+    <inputs>
+        <param name="name" type="text" value="" label="Data table entry unique ID"/>
+        <param name="description" type="text" value="" label="Description of the data" help="Value is optional"/>
+        <conditional name="data_source_cond">
+            <param name="data_source" type="select" label="Choose the source for the Plant Tribes scaffolds files">
+                <option value="web_url" selected="true">Web URL</option>
+                <option value="file_system">File system paths</option>
+            </param>
+            <when value="web_url">
+                <param name="web_url" type="text" value="" label="Web URL" optional="False" />
+                <param name="md5_checksum" type="text" value="" label="MD5 checksum" help="Value is optional"/>
+            </when>
+            <when value="file_system">
+                <param name="file_system_paths" type="text" value="" area="True" label="File system paths" optional="False" help="All files within the given folders and their sub-folders will be installed." />
+                <param name="create_symlink" type="boolean" truevalue="create_symlink" falsevalue="copy_file" label="Link files instead of copying" checked="false" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" label="${tool.name}"/>
+    </outputs>
+    <tests>
+    </tests>
+    <help>
+.. class:: infomark
+
+**What it does**
+
+This tool fetches scaffolds data used by the Plant Tribes Galaxy tools and populates the plant_tribes_scaffolds data table.
+
+The scaffolds data can be imported from a URL (with an optional MD5 checksum) or from files stored in a directory that is
+accessible to the Galaxy server. An optional description can be provided that will appear next to the scaffolds file name
+in the data table entry.
+
+------
+
+**Importing from a URL**
+
+Scaffolds data provided by the Floral Genome Project can be downloaded here:
+
+ * 22 plant genomes (Angiosperms clusters, version 1.0): http://fgp.huck.psu.edu/planttribes_data/22Gv1.0.tar.bz2
+ * 22 plant genomes (Angiosperms clusters, version 1.1): http://fgp.huck.psu.edu/planttribes_data/22Gv1.1.tar.bz2
+
+**Importing from file system paths**
+
+Use this option to import Plant Tribes scaffolds data that is available on the Galaxy server's file system. Each entry in
+**File system paths** must be the root directory of the uncompressed and extracted scaffolds data.
+    </help>
+    <citations/>
+</tool>
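The wrapper above collects an optional MD5 checksum, which the Python script accepts but does not yet verify. A minimal verification sketch using only the standard library; the md5_matches helper is hypothetical and not part of this changeset:

    import hashlib

    def md5_matches(file_path, expected_md5):
        # Hash the downloaded archive in chunks so large tarballs need
        # not fit in memory.
        md5 = hashlib.md5()
        with open(file_path, 'rb') as fh:
            for chunk in iter(lambda: fh.read(2 ** 20), b''):
                md5.update(chunk)
        return md5.hexdigest() == expected_md5.strip().lower()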
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,20 @@
+<?xml version="1.0"?>
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_plant_tribes_scaffolds_download.xml" id="data_manager_plant_tribes_scaffolds_download">
+        <data_table name="plant_tribes_scaffolds">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="path" output_ref="out_file">
+                    <move type="file">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">plant_tribes/scaffolds/${value}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/plant_tribes/scaffolds/${value}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+                <column name="description" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
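To trace the path column handling above with the 22Gv1.0 entry: the move element relocates the path produced by the script under ${GALAXY_DATA_MANAGER_DATA_PATH}, and the value_translation rewrites the loc file's path column to that final location. A sketch of the substitution, assuming an illustrative base path:

    import os.path
    from string import Template

    base = '/galaxy/tool-data'  # hypothetical GALAXY_DATA_MANAGER_DATA_PATH
    entry = dict(value='22Gv1.0')

    # <target base=...>plant_tribes/scaffolds/${value}</target>
    moved_to = os.path.join(base, Template('plant_tribes/scaffolds/${value}').substitute(entry))

    # The <value_translation> records the same final location in the loc file.
    print moved_to  # /galaxy/tool-data/plant_tribes/scaffolds/22Gv1.0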
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/plant_tribes_scaffolds.loc.sample	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,4 @@
+## Plant Tribes scaffolds
+#Value     Name      Path                              Description
+#22Gv1.0   22Gv1.0   /plant_tribes/scaffolds/22Gv1.0   22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0)
+#22Gv1.1   22Gv1.1   /plant_tribes/scaffolds/22Gv1.1   22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Nov 22 14:09:04 2016 -0500
@@ -0,0 +1,6 @@
+<tables>
+    <table name="plant_tribes_scaffolds" comment_char="#">
+        <columns>value, name, path, description</columns>
+        <file path="tool-data/plant_tribes_scaffolds.loc" />
+    </table>
+</tables>
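For reference, each uncommented line of plant_tribes_scaffolds.loc is expected to hold the four declared columns separated by tabs. A minimal sketch of how a consuming tool could split such a row; the row itself is illustrative:

    # Parse one loc file row into the four columns declared above.
    row = '22Gv1.0\t22Gv1.0\t/plant_tribes/scaffolds/22Gv1.0\t22 plant genomes'
    value, name, path, description = row.split('\t')
    print path  # /plant_tribes/scaffolds/22Gv1.0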