changeset 6:a720a88b29db draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_qiime_database_downloader commit 9a8e5333c047c6a5c18ca8f139a979704644ad87-dirty
author      bebatut
date        Wed, 03 May 2017 12:14:01 -0400
parents     6c2db7877763
children    10ac4f7824fa
files       data_manager/data_manager_qiime_download.py
            data_manager/data_manager_qiime_download.xml
            tool-data/qiime_rep_set.loc.sample
            tool-data/qiime_rep_set_aligned.loc.sample
            tool-data/qiime_taxonomy.loc.sample
            tool-data/qiime_trees.loc.sample
            tool_data_table_conf.xml.sample
diffstat    7 files changed, 0 insertions(+), 506 deletions(-)
--- a/data_manager/data_manager_qiime_download.py	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,368 +0,0 @@
-#!/usr/bin/env python
-#
-# Data manager for reference data for the QIIME Galaxy tools
-import argparse
-import ftplib
-import json
-import os
-import tarfile
-import zipfile
-
-import requests
-
-
-protocol = {
-    "unite": "http",
-    "greengenes": "ftp",
-    "silva": "http",
-    "img": "ftp"
-}
-baseUrl = {
-    "unite": "http://unite.ut.ee/sh_files/sh_qiime_release_",
-    "greengenes": "greengenes.microbio.me",
-    "silva": "http://www.arb-silva.de/fileadmin/silva_databases/qiime/Silva_",
-    "img": "ftp.microbio.me"
-}
-ftp_dir = {
-    "greengenes": "/greengenes_release/gg_",
-    "img": ""
-}
-ftp_file_prefix = {
-    "greengenes": "gg_",
-    "img": ""
-}
-ftp_file_suffix = {
-    "greengenes": "_otus",
-    "img": ""
-}
-extension = {
-    "unite": "zip",
-    "greengenes": "tar.gz",
-    "silva": {
-        "104_release": "tgz",
-        "108_release": "tgz",
-        "108_release_curated": "tgz",
-        "111_release": "tgz",
-        "119_consensus_majority_taxonomy": "zip",
-        "119_release": "zip",
-        "119_release_aligned_rep_files": "tar.gz",
-        "123_release": "zip",
-        "128_release": "tgz"},
-    "img": "tgz"
-}
-filetypes = ["rep_set", "rep_set_aligned", "taxonomy", "trees"]
-
-
-# Utility functions for interacting with Galaxy JSON
-def read_input_json(jsonfile):
-    """Read the JSON supplied from the data manager tool
-
-    Returns a tuple (param_dict, extra_files_path).
-
-    'param_dict' is an arbitrary dictionary of parameters
-    input into the tool; 'extra_files_path' is the path
-    to a directory where output files must be put for the
-    receiving data manager to pick them up.
-
-    NB the directory pointed to by 'extra_files_path'
-    doesn't exist initially; it is the job of the script
-    to create it if necessary.
-
-    """
-    with open(jsonfile) as fh:
-        params = json.load(fh)
-    return (params['param_dict'],
-            params['output_data'][0]['extra_files_path'])
-
-
-# Utility functions for creating data table dictionaries
-#
-# Example usage:
-# >>> d = create_data_tables_dict()
-# >>> add_data_table(d, 'my_data')
-# >>> add_data_table_entry(d, 'my_data', dict(dbkey='hg19', value='human'))
-# >>> add_data_table_entry(d, 'my_data', dict(dbkey='mm9', value='mouse'))
-# >>> print(json.dumps(d))
-def create_data_tables_dict():
-    """Return a dictionary for storing data table information
-
-    Returns a dictionary that can be used with 'add_data_table'
-    and 'add_data_table_entry' to store information about a
-    data table. It can be converted to JSON to be sent back to
-    the data manager.
-
-    """
-    d = {}
-    d['data_tables'] = {}
-    return d
-
-
-def add_data_table(d, table):
-    """Add a data table to the data tables dictionary
-
-    Creates a placeholder for a data table called 'table'.
-
-    """
-    d['data_tables'][table] = []
-
-
-def add_data_table_entry(d, table, entry):
-    """Add an entry to a data table
-
-    Appends an entry to the data table 'table'. 'entry'
-    should be a dictionary where the keys are the names of
-    columns in the data table.
-
-    Raises an exception if the named data table doesn't
-    exist.
-
-    """
-    try:
-        d['data_tables'][table].append(entry)
-    except KeyError:
-        raise Exception("add_data_table_entry: no table '%s'" % table)
-
-
-def get_ftp_file(ftp, filename):
-    """Retrieve 'filename' over an open FTP connection to a local file"""
-    try:
-        ftp.retrbinary("RETR " + filename, open(filename, 'wb').write)
-    except ftplib.all_errors as e:
-        print("Error: %s" % e)
-
-
-def download_archive(db, version):
-    """Download the archive for a database version; return the local path"""
-    filepath = "archive"
-    if protocol[db] == "http":
-        url = "%s%s.%s" % (baseUrl[db], version, extension[db])
-        r = requests.get(url, stream=True)
-        r.raise_for_status()
-        with open(filepath, "wb") as fd:
-            for chunk in r.iter_content(chunk_size=128):
-                fd.write(chunk)
-    elif protocol[db] == "ftp":
-        ftp = ftplib.FTP(baseUrl[db])
-        ftp.login("anonymous", "ftplib-example-1")
-        ftp.cwd("%s%s" % (ftp_dir[db], version))
-        filepath = "%s%s%s.%s" % (
-            ftp_file_prefix[db],
-            version,
-            ftp_file_suffix[db],
-            extension[db])
-        get_ftp_file(ftp, filepath)
-        ftp.quit()
-    return filepath
-
-
-def extract_archive(filepath, ext):
-    """Extract the archive and return the path to its content"""
-    archive_content_path = "tmp"
-    if ext == "tar.gz" or ext == "tgz":
-        tar = tarfile.open(filepath)
-        tar.extractall(path=archive_content_path)
-        tar.close()
-        content = os.listdir(archive_content_path)
-        archive_content = []
-        for x in content:
-            if not x.startswith("."):
-                archive_content.append(x)
-        if len(archive_content) == 1:
-            archive_content_path = os.path.join(
-                archive_content_path,
-                archive_content[0])
-    elif ext == "zip":
-        zip_ref = zipfile.ZipFile(filepath, 'r')
-        zip_ref.extractall(archive_content_path)
-        zip_ref.close()
-    return archive_content_path
-
-
-def move_unite_files(archive_content_path, filename_prefix,
-                     name_prefix, data_tables, target_dir):
-    """Move UNITE reference ('refs') and taxonomy files and register them"""
-    archive_content = os.listdir(archive_content_path)
-    for content in archive_content:
-        content_filepath = os.path.join(archive_content_path, content)
-        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
-        content_filename_prefix = "%s_%s" % (filename_prefix, content)
-        if content.find("refs") != -1:
-            move_file(
-                content_filepath,
-                content_filename_prefix,
-                content_name_prefix,
-                data_tables,
-                os.path.join(target_dir, "rep_set"),
-                "rep_set")
-        elif content.find("taxonomy") != -1:
-            move_file(
-                content_filepath,
-                content_filename_prefix,
-                content_name_prefix,
-                data_tables,
-                os.path.join(target_dir, "taxonomy"),
-                "taxonomy")
-
-
-def move_file(input_filepath, filename, name, data_tables, target_dir,
-              filetype):
-    """Move a file into 'target_dir' and register it in a data table"""
-    output_filepath = os.path.join(target_dir, filename)
-    os.rename(input_filepath, output_filepath)
-    add_data_table_entry(
-        data_tables,
-        "qiime_%s" % (filetype),
-        dict(
-            dbkey=filename.split(".")[0],
-            value="1.0",
-            name=name,
-            path=output_filepath))
-
-
-def move_dir_content(input_path, filename_prefix, name_prefix, data_tables,
-                     target_dir, filetype):
-    """Recursively move the content of 'input_path' into 'target_dir'"""
-    for content in os.listdir(input_path):
-        if content.startswith("."):
-            continue
-        content_path = os.path.join(input_path, content)
-        content_name_prefix = "%s - %s" % (name_prefix, content.split(".")[0])
-        content_filename_prefix = "%s_%s" % (filename_prefix, content)
-        if os.path.isdir(content_path):
-            move_dir_content(
-                content_path,
-                content_filename_prefix,
-                content_name_prefix,
-                data_tables,
-                target_dir,
-                filetype)
-        else:
-            move_file(
-                content_path,
-                content_filename_prefix,
-                content_name_prefix,
-                data_tables,
-                target_dir,
-                filetype)
-
-
-def move_files(archive_content_path, filename_prefix,
-               name_prefix, data_tables, target_dir, db):
-    """Move each filetype subdirectory into its target directory"""
-    for filetype in filetypes:
-        filetype_target_dir = os.path.join(
-            target_dir,
-            filetype)
-        filetype_path = os.path.join(
-            archive_content_path,
-            filetype)
-        move_dir_content(
-            filetype_path,
-            filename_prefix,
-            name_prefix,
-            data_tables,
-            filetype_target_dir,
-            filetype)
-
-
-def download_db(data_tables, db, version, target_dir):
-    """Download QIIME database
-
-    Creates references to the specified file(s) on the Galaxy
-    server in the appropriate data table (determined from the
-    file type).
-
-    The 'data_tables' dictionary should have been created using
-    the 'create_data_tables_dict' and 'add_data_table' functions.
-
-    Arguments:
-        data_tables: a dictionary containing the data table info
-        db: name of the database
-        version: version of the database
-        target_dir: directory to put copy or link to the data file
-
-    """
-    ext = extension[db]
-    if db == "silva":
-        ext = ext[version]
-
-    print("Download archive")
-    filepath = download_archive(db, version)
-
-    print("Extract archive %s" % filepath)
-    archive_content_path = extract_archive(filepath, ext)
-
-    print("Moving files from %s" % archive_content_path)
-    filename_prefix = "%s_%s" % (db, version)
-    name_prefix = "%s (%s)" % (db, version)
-    if db == "greengenes" or db == "silva":
-        move_files(
-            archive_content_path,
-            filename_prefix,
-            name_prefix,
-            data_tables,
-            target_dir,
-            db)
-    elif db == "unite":
-        move_unite_files(
-            archive_content_path,
-            filename_prefix,
-            name_prefix,
-            data_tables,
-            target_dir)
-
-
-if __name__ == "__main__":
-    print("Starting...")
-
-    # Read command line
-    parser = argparse.ArgumentParser(
-        description='Download QIIME reference database')
-    parser.add_argument('--database', help="Database name")
-    parser.add_argument('--version', help="Database version")
-    parser.add_argument('--jsonfile', help="Output JSON file")
-    args = parser.parse_args()
-
-    jsonfile = args.jsonfile
-
-    # Read the input JSON
-    params, target_dir = read_input_json(jsonfile)
-
-    # Make the target directory
-    print("Making %s" % target_dir)
-    os.mkdir(target_dir)
-    os.mkdir(os.path.join(target_dir, "rep_set"))
-    os.mkdir(os.path.join(target_dir, "rep_set_aligned"))
-    os.mkdir(os.path.join(target_dir, "taxonomy"))
-    os.mkdir(os.path.join(target_dir, "trees"))
-
-    # Set up data tables dictionary
-    data_tables = create_data_tables_dict()
-    add_data_table(data_tables, "qiime_rep_set")
-    add_data_table(data_tables, "qiime_rep_set_aligned")
-    add_data_table(data_tables, "qiime_taxonomy")
-    add_data_table(data_tables, "qiime_trees")
-
-    # Fetch data from specified data sources
-    download_db(
-        data_tables,
-        args.database,
-        args.version,
-        target_dir)
-
-    # Write output JSON
-    print("Outputting JSON")
-    print(json.dumps(data_tables))
-    with open(jsonfile, 'w') as out:
-        json.dump(data_tables, out)
-    print("Done.")
\ No newline at end of file
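For orientation, here is a minimal sketch of the JSON contract the deleted script relied on. The shapes follow the Galaxy data manager convention as used in the code above (read_input_json and the data_tables dictionary); the concrete values and paths below are illustrative, not taken from a real run.

    import json

    # Input: what Galaxy hands the script via --jsonfile (abridged).
    input_json = {
        "param_dict": {"database": "greengenes", "version": "13_8"},
        "output_data": [{"extra_files_path": "/tmp/dataset_1_files"}],
    }

    # Output: what the script writes back to the same file, one entry
    # per downloaded file, keyed by data table name (see move_file above).
    output_json = {
        "data_tables": {
            "qiime_rep_set": [
                {
                    "value": "1.0",
                    "name": "greengenes (13_8) - 97_otus",
                    "dbkey": "greengenes_13_8_97_otus",
                    "path": "/tmp/dataset_1_files/rep_set/greengenes_13_8_97_otus.fasta",
                },
            ],
        },
    }

    print(json.dumps(output_json, indent=2))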
--- a/data_manager/data_manager_qiime_download.xml	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,96 +0,0 @@
-<tool id="data_manager_qiime_download" name="Download QIIME" version="1.9.1" tool_type="manage_data">
-    <description>reference databases</description>
-    <requirements>
-        <requirement type="package" version="2.13.0">requests</requirement>
-    </requirements>
-    <stdio>
-        <exit_code range=":-1" level="fatal" description="Error: Cannot open file" />
-        <exit_code range="1:" level="fatal" description="Error" />
-    </stdio>
-    <command><![CDATA[
-        python '$__tool_directory__/data_manager_qiime_download.py'
-            --database '$db.database'
-            --version '$db.version'
-            --jsonfile '${out_file}'
-    ]]></command>
-    <inputs>
-        <conditional name="db">
-            <param name="database" type="select" label="Database to download">
-                <option value="greengenes" selected="true">Greengenes OTUs</option>
-                <option value="silva">SILVA OTUs (16S/18S)</option>
-                <option value="unite">UNITE OTUs (ITS)</option>
-                <!--<option value="img">IMG/QIIME reference protein sequences</option>-->
-            </param>
-            <when value="greengenes">
-                <param name="version" type="select" label="Version of Greengenes">
-                    <option value="13_8" selected="true">13.8</option>
-                    <option value="13_5">13.5</option>
-                    <option value="12_10">12.10</option>
-                </param>
-            </when>
-            <when value="silva">
-                <param name="version" type="select" label="Version of SILVA OTUs">
-                    <option value="128_release" selected="true">128</option>
-                    <option value="123_release">123</option>
-                    <option value="119_release_aligned_rep_files">119 (aligned rep)</option>
-                    <option value="119_release">119</option>
-                    <option value="119_consensus_majority_taxonomy">119 (consensus majority taxonomy)</option>
-                    <option value="111_release">111</option>
-                    <option value="108_release">108</option>
-                    <option value="108_release_curated">108 (curated)</option>
-                    <option value="104_release">104</option>
-                </param>
-            </when>
-            <when value="unite">
-                <param name="version" type="select" label="Version of UNITE OTUs">
-                    <option value="20.11.2016">7.1 (2016-11-20, with singletons set as RefS)</option>
-                    <option value="s_20.11.2016">7.1 (2016-11-20, with global and 97% singletons)</option>
-                    <option value="22.08.2016">7.1 (2016-08-22, with singletons set as RefS)</option>
-                    <option value="s_22.08.2016">7.1 (2016-08-22, with global and 97% singletons)</option>
-                    <option value="31.01.2016">7.0 (2016-01-31, with singletons set as RefS)</option>
-                    <option value="s_31.01.2016">7.0 (2016-01-31, with global and 97% singletons)</option>
-                    <option value="01.08.2015">7.0 (2015-08-01, with singletons set as RefS)</option>
-                    <option value="s_01.08.2015">7.0 (2015-08-01, with global and 97% singletons)</option>
-                    <option value="02.03.2015">7.0 (2015-03-02, with singletons set as RefS)</option>
-                    <option value="s_02.03.2015">7.0 (2015-03-02, with global and 97% singletons)</option>
-                    <option value="30.12.2014">6.0 (2014-12-30, with singletons set as RefS)</option>
-                    <option value="s_30.12.2014">6.0 (2014-12-30, with global and 97% singletons)</option>
-                    <option value="10.09.2014">6.0 (2014-09-10, with singletons set as RefS)</option>
-                    <option value="s_10.09.2014">6.0 (2014-09-10, with global and 97% singletons)</option>
-                    <option value="04.07.2014">6.0 (2014-07-04, with singletons set as RefS)</option>
-                    <option value="s_04.07.2014">6.0 (2014-07-04, with global and 97% singletons)</option>
-                    <option value="13.05.2014">6.0 (2014-05-13, with singletons set as RefS)</option>
-                    <option value="s_13.05.2014">6.0 (2014-05-13, with global and 97% singletons)</option>
-                    <option value="09.02.2014">6.0 (2014-02-09, with singletons set as RefS)</option>
-                    <option value="s_09.02.2014">6.0 (2014-02-09, with global and 97% singletons)</option>
-                    <option value="15.01.2014">6.0 (2014-01-15, with singletons set as RefS)</option>
-                    <option value="s_15.01.2014">6.0 (2014-01-15, with global and 97% singletons)</option>
-                    <option value="19.12.2013">6.0 (2013-12-19, with singletons set as RefS)</option>
-                    <option value="s_19.12.2013">6.0 (2013-12-19, with global and 97% singletons)</option>
-                    <option value="08.12.2013">6.0 (2013-12-08, with singletons set as RefS)</option>
-                    <option value="s_08.12.2013">6.0 (2013-12-08, with global and 97% singletons)</option>
-                    <option value="15.10.2013">5.0 (2013-10-15, with singletons set as RefS)</option>
-                    <option value="s_15.10.2013">5.0 (2013-10-15, with global and 97% singletons)</option>
-                </param>
-            </when>
-            <!--<when value="img">
-                <param name="version" type="select" label="Version of IMG/QIIME reference protein sequences">
-                    <option value="img-qiime-25oct2012" selected="true">img-qiime-25oct2012</option>
-                </param>
-            </when>-->
-        </conditional>
-    </inputs>
-    <outputs>
-        <data name="out_file" format="data_manager_json" label="${tool.name}"/>
-    </outputs>
-    <tests>
-    </tests>
-    <help><![CDATA[
-This tool downloads the reference databases for QIIME.
-    ]]></help>
-    <citations>
-        <citation type="doi"></citation>
-        <yield />
-    </citations>
-</tool>
\ No newline at end of file
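To make the select options above concrete, here is a short sketch of how one choice maps to a download location, recomputed from the lookup tables in the deleted Python script; the host, directory, and archive names are derived from those tables rather than verified against the live servers.

    # Greengenes goes over FTP: host, directory and archive name are
    # assembled from baseUrl, ftp_dir, ftp_file_prefix/suffix and extension.
    version = "13_8"
    host = "greengenes.microbio.me"
    directory = "/greengenes_release/gg_" + version     # ftp_dir + version
    archive = "gg_" + version + "_otus.tar.gz"          # prefix + version + suffix + ext

    # SILVA and UNITE go over HTTP as a single URL, e.g. for 128_release:
    silva_url = ("http://www.arb-silva.de/fileadmin/silva_databases/qiime/"
                 "Silva_128_release.tgz")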
--- a/tool-data/qiime_rep_set.loc.sample	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-#<unique_id>	<name>	<database_caption>	<fasta_file_path>
-#
-#For each reference database, you need to download the FASTA file into the QIIME path
-#
-#List of useful databases for QIIME: http://qiime.org/home_static/dataFiles.html
-#
\ No newline at end of file
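Note that the header names above (unique_id, name, database_caption, fasta_file_path) differ from the value, name, dbkey, path columns declared for these tables in tool_data_table_conf.xml.sample below. A filled-in row in that declared order, with a hypothetical path, might look like:

    1.0	greengenes (13_8) - 97_otus	greengenes_13_8_97_otus	/data/qiime/rep_set/greengenes_13_8_97_otus.fasta

(The literal value "1.0" matches what the deleted script wrote for every entry.)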
--- a/tool-data/qiime_rep_set_aligned.loc.sample	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-#<unique_id>	<name>	<database_caption>	<fasta_file_path>
-#
-#For each reference database, you need to download the FASTA file into the QIIME path
-#
-#List of useful databases for QIIME: http://qiime.org/home_static/dataFiles.html
-#
\ No newline at end of file
--- a/tool-data/qiime_taxonomy.loc.sample	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-#<unique_id>	<name>	<database_caption>	<fasta_file_path>
-#
-#For each reference database, you need to download the FASTA file into the QIIME path
-#
-#List of useful databases for QIIME: http://qiime.org/home_static/dataFiles.html
-#
\ No newline at end of file
--- a/tool-data/qiime_trees.loc.sample	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-#<unique_id>	<name>	<database_caption>	<fasta_file_path>
-#
-#For each reference database, you need to download the FASTA file into the QIIME path
-#
-#List of useful databases for QIIME: http://qiime.org/home_static/dataFiles.html
-#
\ No newline at end of file
--- a/tool_data_table_conf.xml.sample	Wed May 03 12:12:47 2017 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,18 +0,0 @@
-<tables>
-    <table name="qiime_rep_set" comment_char="#">
-        <columns>value, name, dbkey, path</columns>
-        <file path="tool-data/qiime_rep_set.loc" />
-    </table>
-    <table name="qiime_rep_set_aligned" comment_char="#">
-        <columns>value, name, dbkey, path</columns>
-        <file path="tool-data/qiime_rep_set_aligned.loc" />
-    </table>
-    <table name="qiime_taxonomy" comment_char="#">
-        <columns>value, name, dbkey, path</columns>
-        <file path="tool-data/qiime_taxonomy.loc" />
-    </table>
-    <table name="qiime_trees" comment_char="#">
-        <columns>value, name, dbkey, path</columns>
-        <file path="tool-data/qiime_trees.loc" />
-    </table>
-</tables>
\ No newline at end of file
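As a closing illustration, this small sketch shows how one entry produced by the data manager script would be serialized into a .loc row under the column order declared above (value, name, dbkey, path); the entry values and path are illustrative.

    # Serialize one data-table entry into a tab-separated .loc row.
    entry = {
        "value": "1.0",
        "name": "greengenes (13_8) - 97_otus",
        "dbkey": "greengenes_13_8_97_otus",
        "path": "/data/qiime/rep_set/greengenes_13_8_97_otus.fasta",
    }
    columns = ["value", "name", "dbkey", "path"]
    print("\t".join(entry[c] for c in columns))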