Mercurial > repos > pimarin > data_manager_fetch_plasmidfinder
changeset 0:8ad77a9dc91d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_plasmidfinder commit f3383fdb9a17d1b69d05547cdb96534a5f762bec-dirty
author | pimarin |
---|---|
date | Tue, 14 Feb 2023 14:25:43 +0000 |
parents | |
children | 9ef108e34063 |
files | data_manager/macro.xml data_manager/plasmidfinder_fetch_database.py data_manager/plasmidfinder_fetch_database.xml data_manager_conf.xml plasmidfinder_database.loc test-data/plasmidfinder.loc.test test-data/plasmidfinder_test_data_manager_2.1.json tool-data/plasmidfinder_database.loc tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 10 files changed, 337 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/macro.xml Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,13 @@ +<macros> + <token name="@TOOL_VERSION@">2.1.6</token> + <token name="@REQUESTS_VERSION@">2.27.1</token> + <token name="@PYTHON_VERSION@">3.8</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.05</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@PYTHON_VERSION@">python</requirement> + <requirement type="package" version="@REQUESTS_VERSION@">requests</requirement> + </requirements> + </xml> +</macros>
# data_manager/plasmidfinder_fetch_database.py
"""Galaxy data manager script that downloads the PlasmidFinder database.

The script reads the JSON parameter file handed over by Galaxy, downloads the
requested database archive, keeps the ``*fsa`` files and overwrites the JSON
file with the resulting data table entry.
"""
import argparse
import json
import os
import sys
import tarfile
from datetime import datetime
from pathlib import Path


class GetPlasmidfinderDataManager:
    """
    Create the json file with database information for galaxy data manager
    """

    def __init__(self,
                 plasmidfinder_database="plasmidfinder_database",
                 db_name="plasmidfinder-db",
                 plasmidfinder_version="latest"):
        """
        @plasmidfinder_database: name of the data table to fill
        @db_name: value stored in the "path" column of the data table
        @plasmidfinder_version: requested version ("latest" or a version number)
        """
        self.data_table_name = plasmidfinder_database
        self._db_name = db_name
        self._plasmidfinder_version = plasmidfinder_version
        self._plasmidfinder_date_version = None
        self.data_table_entry = None
        self.plasmidfinder_table_list = None
        self._commit_number = None

    def get_data_table_format(self):
        """
        Skeleton of a data_table format
        return: a data table formatted for json output
        """
        self.data_table_entry = {
            "data_tables": {
                self.data_table_name: {}
            }
        }
        return self.data_table_entry

    def get_data_manager(self):
        """
        Create the empty data table format and add all the information into
        Commit number is added if latest is required instead of version number
        return: The data table with database information
        """
        self.plasmidfinder_table_list = self.get_data_table_format()
        if self._plasmidfinder_version == "latest":
            version_value = self._commit_number
        else:
            version_value = self._plasmidfinder_version
        # "value" always carries the commit; "name" carries the version
        # number when one was explicitly requested.
        plasmidfinder_value = f"plasmidfinder_{self._commit_number}" \
                              f"_{self._plasmidfinder_date_version}"
        plasmidfinder_name = f"{version_value}" \
                             f"_{self._plasmidfinder_date_version}"
        data_info = dict(value=plasmidfinder_value,
                         name=plasmidfinder_name,
                         date=self._plasmidfinder_date_version,
                         path=self._db_name)
        self.plasmidfinder_table_list["data_tables"][self.data_table_name] = [data_info]
        return self.plasmidfinder_table_list


class DownloadPlasmidfinderDatabase(GetPlasmidfinderDataManager):
    """
    Download the plasmidfinder database from the bitbucket repository.
    Build the data manager info for galaxy
    """

    # Archive name appended to the base url for every supported version.
    # Supporting a new version only requires a new entry here.
    DB_VERSIONS = {
        "latest": "master.gz",
        "2.1": "1307168.gz",
    }
    # Seconds given to the server to answer before the download is aborted.
    REQUEST_TIMEOUT = 300

    def __init__(self,
                 output_dir=None,
                 plasmidfinder_url="https://bitbucket.org/genomicepidemiology/plasmidfinder_db/get/",
                 db_name="plasmidfinder-db",
                 db_tmp="tmp_database",
                 plasmidfinder_version="latest",
                 json_file_path=None,
                 date_version=None):
        """
        @output_dir: folder receiving the database (default: the working
                     directory at the time the object is created)
        @plasmidfinder_url: base url the archive name is appended to
        @db_name: name of the final database folder
        @db_tmp: name of the temporary download folder
        @plasmidfinder_version: "latest" or a version number (None is
                                treated as "latest")
        @json_file_path: json file provided by galaxy
        @date_version: date stored in the data table (default: today,
                       formatted as YYYY-MM-DD)
        """
        if plasmidfinder_version is None:
            plasmidfinder_version = "latest"
        super().__init__(db_name=db_name,
                         plasmidfinder_version=plasmidfinder_version)
        # Defaults depending on the process state are resolved here and not
        # in the signature, where they would be frozen at import time.
        if output_dir is None:
            output_dir = Path.cwd()
        if date_version is None:
            date_version = datetime.now().strftime("%Y-%m-%d")
        self.json_file_path = json_file_path
        self._output_dir = output_dir
        # The base url is kept apart so that choose_db_version() can be
        # called several times without stacking archive names.
        self._plasmidfinder_base_url = plasmidfinder_url
        self._plasmidfinder_url = plasmidfinder_url
        self._temporary_folder = db_tmp
        self._db_name_tar = f'{db_name}.gz'
        self._plasmidfinder_date_version = date_version
        self._commit_number = None

    def extract_db_commit(self, request_header, title_name="content-disposition"):
        """
        Extract the commit id to add the information as identifier of the download
        @request_header: a request object obtained from requests.get()
        @title_name: the tag to search in the header of the requests object
        return: the value of the commit
        """
        db_info = request_header.headers[title_name]
        # The commit is the third "-" separated field, without its extension.
        commit_number = db_info.split("-")[2].split(".")[0]
        return commit_number

    def untar_files(self, file_path: Path, extracted_path_output: Path):
        """
        untar the download archive
        The script exits with an error message if the archive is unreadable
        @file_path: input path of the tar.gz file
        @extracted_path_output: output path of the extract folder
        return: the path of the output
        """
        try:
            with Path(file_path).open('rb') as fh_in, \
                    tarfile.open(fileobj=fh_in, mode='r:gz') as tar_file:
                # The archive comes from the network: use the "data"
                # extraction filter whenever the interpreter provides it.
                if hasattr(tarfile, "data_filter"):
                    tar_file.extractall(path=extracted_path_output, filter="data")
                else:
                    tar_file.extractall(path=extracted_path_output)
        except (OSError, tarfile.TarError):
            # A corrupted archive raises tarfile.ReadError, not OSError.
            sys.exit(f'ERROR: Could not extract {file_path}')
        print(f'Untar the database in {extracted_path_output}')
        return extracted_path_output

    def choose_db_version(self):
        """
        Update the url link depending on the version choosen by user.
        This method could be upgraded simply by adding the new versions
        raise: ValueError if the requested version is not supported
        """
        try:
            archive_name = self.DB_VERSIONS[self._plasmidfinder_version]
        except KeyError:
            supported = ", ".join(sorted(self.DB_VERSIONS))
            raise ValueError(
                f"Unsupported plasmidfinder database version "
                f"{self._plasmidfinder_version!r} (supported: {supported})"
            ) from None
        self._plasmidfinder_url = f"{self._plasmidfinder_base_url}{archive_name}"

    def download_database(self):
        """
        Download the plasmidfinder database using requests lib
        Make the directory and temporary directory for download
        Untar the download files
        The script exits with an error message if the download fails, so
        that no data table entry is written for a missing database
        """
        # Imported here so the module can be imported without requests.
        import requests

        self._output_dir = Path(self._output_dir)
        self.choose_db_version()
        try:
            request_info = requests.get(self._plasmidfinder_url,
                                        timeout=self.REQUEST_TIMEOUT)
            request_info.raise_for_status()
        except requests.exceptions.RequestException as request_error:
            print(f"Requests Error: {request_error}")
            sys.exit(f"Fail to import Plasmidfinder database from {self._plasmidfinder_url}")

        self._commit_number = self.extract_db_commit(request_info)
        output_tar_path = self._output_dir.joinpath(self._temporary_folder)
        output_tar_path_file = output_tar_path.joinpath(self._db_name_tar)
        output_path = self._output_dir.joinpath(self._db_name)
        os.makedirs(output_tar_path, exist_ok=True)
        os.makedirs(output_path, exist_ok=True)
        with open(output_tar_path_file, 'wb') as archive_file:
            archive_file.write(request_info.content)
        untar_output = self.untar_files(
            file_path=output_tar_path_file,
            extracted_path_output=output_tar_path.joinpath(self._db_name))

        self.moove_download_files(older_path=untar_output, new_path=output_path)

    def moove_download_files(self, older_path, new_path, expression_search="*fsa"):
        """
        Clean downloaded data by moving fasta files in the final folder
        @older_path: previous path where the files are located
        @new_path: final path where files will be moved
        @expression_search: keep only file with this expression
        """
        # The matches are listed before any file is moved.
        file_list_paths = [file for file in Path(older_path).rglob(expression_search)
                           if file.is_file()]
        for path in file_list_paths:
            self.keep_filename(pathname=path, output_path=new_path)

    def keep_filename(self, pathname, output_path):
        """
        Move a file in a folder, keeping its file name
        @pathname: previous path
        @output_path: final path
        """
        pathname = Path(pathname)
        pathname.replace(Path(output_path).joinpath(pathname.name))

    def read_json_input_file(self):
        """
        Import the json file
        The "extra_files_path" of the first output is created if needed and
        used as output directory
        """
        with open(self.json_file_path) as fh:
            params = json.load(fh)
        target_dir = params['output_data'][0]['extra_files_path']
        os.makedirs(target_dir, exist_ok=True)
        self._output_dir = target_dir

    def write_json_infos(self):
        """
        Write in the imported json file
        """
        with open(self.json_file_path, 'w') as fh:
            json.dump(self.get_data_manager(), fh, sort_keys=True)


def parse_arguments(argv=None):
    """
    List of arguments provided by the user
    @argv: list of arguments to parse (default: the command line)
    return: parsed arguments
    """
    # parse options and arguments
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("data_manager_json",
                            help="json file from galaxy")
    arg_parser.add_argument("-v", "--db_version",
                            default="latest",
                            choices=sorted(DownloadPlasmidfinderDatabase.DB_VERSIONS),
                            help="version of the plasmidfinder (latest or 2.1)")
    return arg_parser.parse_args(argv)


def main():
    """
    Read the galaxy json file, download the database and write the data
    table entry back in the json file
    """
    all_args = parse_arguments()
    plasmidfinder_download = DownloadPlasmidfinderDatabase(
        json_file_path=all_args.data_manager_json,
        plasmidfinder_version=all_args.db_version)
    plasmidfinder_download.read_json_input_file()
    plasmidfinder_download.download_database()
    plasmidfinder_download.write_json_infos()


if __name__ == '__main__':
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/plasmidfinder_fetch_database.xml Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,48 @@ +<tool id="data_manager_fetch_plasmidfinder" name="plasmidfinder_datamanager" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>plasmidfinder database builder</description> + <macros> + <import>macro.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"> + <![CDATA[ + python '$__tool_directory__/plasmidfinder_fetch_database.py' + --db_version '$database_select' + '$output_file' + ]]></command> + <inputs> + <param name="database_select" type="select" label="Database version" help="Choose a database version to download (default latest version)"> + <option value="latest" selected="true">Latest available version</option> + <option value="2.1">V2.1_2019-08-28</option> + </param> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <!-- Test_1 DB latest --> + <test expect_num_outputs="1"> + <param name="database_select" value="latest"/> + <output name="output_file" value="plasmidfinder_test_data_manager_latest.json"> + <assert_contents> + <has_text_matching expression='\{"data_tables": \{"plasmidfinder_database": \[\{"date": "\d\d\d\d-\d\d-\d\d", "name": "b1c9ddac0e61_\d\d\d\d-\d\d-\d\d", "path": "plasmidfinder-db", "value": "plasmidfinder_b1c9ddac0e61_\d\d\d\d-\d\d-\d\d"' /> + </assert_contents> + </output> + </test> + <!-- Test_2 DB 2.1 --> + <test expect_num_outputs="1"> + <param name="database_select" value="2.1"/> + <output name="output_file" value="plasmidfinder_test_data_manager_2.1.json"> + <assert_contents> + <has_text_matching expression='"data_tables": \{"plasmidfinder_database": \[\{"date": "\d\d\d\d-\d\d-\d\d", "name": "2\.1_\d\d\d\d-\d\d-\d\d", "path": "plasmidfinder-db", "value": "plasmidfinder_1307168b1ce7_\d\d\d\d-\d\d-\d\d"' /> + </assert_contents> + </output> 
+ </test> + </tests> + <help><![CDATA[ + Download plasmidfinder database from the bitbucket repository + ]]></help> + <citations> + <citation type="doi">10.1007/978-1-4939-9877-7_20</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/plasmidfinder_fetch_database.xml" id="plasmidfinder_fetch_database" version="@TOOL_VERSION@"> + <data_table name="plasmidfinder_database"> + <output> + <column name="value" /> + <column name="name" /> + <column name="date" /> + <column name="path" output_ref="output_file"> + <move type="directory" relativize_symlinks="True"> + <source>${path}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">plasmidfinder-db/${name}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/plasmidfinder-db/${name}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/plasmidfinder_database.loc Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,8 @@ +# this is a tab separated file describing the location of plasmidfinder database +# The name was obtained by merging date of download and commit number +# the columns are: +# value, name, date, path +# +# for example +#plasmidfinder_9002e7282dd0_2022-12-20 9002e7282dd0_2022-12-20 2022-12-20 plasmidfinder-db +#plasmidfinder_1307168b1ce7_2022-12-20 2.1_2022-12-20 2022-12-20 plasmidfinder-db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/plasmidfinder.loc.test Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,12 @@ +plasmidfinder_9002e7282dd0_2022-12-20 9002e7282dd0_2022-12-20 2022-12-20 plasmidfinder-db +plasmidfinder_1307168b1ce7_2022-12-20 2.1_2022-12-20 2022-12-20 plasmidfinder-db +plasmidfinder_c18e08c17a59_2023-01-27 c18e08c17a59_2023-01-27 2023-01-27 /tmp/tmp9g7s9mjt/galaxy-dev/tool-data/plasmidfinder-db/c18e08c17a59_2023-01-27 +plasmidfinder_1307168b1ce7_2023-01-27 2.1_2023-01-27 2023-01-27 /tmp/tmp9g7s9mjt/galaxy-dev/tool-data/plasmidfinder-db/2.1_2023-01-27 +plasmidfinder_c18e08c17a59_2023-01-27 c18e08c17a59_2023-01-27 2023-01-27 /tmp/tmpdy82xyjr/galaxy-dev/tool-data/plasmidfinder-db/c18e08c17a59_2023-01-27 +plasmidfinder_1307168b1ce7_2023-01-27 2.1_2023-01-27 2023-01-27 /tmp/tmpdy82xyjr/galaxy-dev/tool-data/plasmidfinder-db/2.1_2023-01-27 +plasmidfinder_c18e08c17a59_2023-01-27 c18e08c17a59_2023-01-27 2023-01-27 /tmp/tmp6_mld4pi/galaxy-dev/tool-data/plasmidfinder-db/c18e08c17a59_2023-01-27 +plasmidfinder_1307168b1ce7_2023-01-27 2.1_2023-01-27 2023-01-27 /tmp/tmp6_mld4pi/galaxy-dev/tool-data/plasmidfinder-db/2.1_2023-01-27 +plasmidfinder_c18e08c17a59_2023-02-07 c18e08c17a59_2023-02-07 2023-02-07 /tmp/tmp70ecz0oo/galaxy-dev/tool-data/plasmidfinder-db/c18e08c17a59_2023-02-07 +plasmidfinder_1307168b1ce7_2023-02-07 2.1_2023-02-07 2023-02-07 /tmp/tmp70ecz0oo/galaxy-dev/tool-data/plasmidfinder-db/2.1_2023-02-07 +plasmidfinder_c18e08c17a59_2023-02-14 c18e08c17a59_2023-02-14 2023-02-14 /tmp/tmpvbz0fwsy/galaxy-dev/tool-data/plasmidfinder-db/c18e08c17a59_2023-02-14 +plasmidfinder_1307168b1ce7_2023-02-14 2.1_2023-02-14 2023-02-14 /tmp/tmpvbz0fwsy/galaxy-dev/tool-data/plasmidfinder-db/2.1_2023-02-14
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/plasmidfinder_test_data_manager_2.1.json Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"plasmidfinder_database": [{"date": "2023-01-27", "name": "2.1_2023-01-27", "path": "plasmidfinder-db", "value": "plasmidfinder_1307168b1ce7_2023-01-27"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/plasmidfinder_database.loc Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,8 @@ +# this is a tab separated file describing the location of plasmidfinder database +# The name was obtained by merging date of download and commit number +# the columns are: +# value, name, date, path +# +# for example +#plasmidfinder_9002e7282dd0_2022-12-20 9002e7282dd0_2022-12-20 2022-12-20 plasmidfinder-db +#plasmidfinder_1307168b1ce7_2022-12-20 2.1_2022-12-20 2022-12-20 plasmidfinder-db
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,8 @@ + +<tables> + <!-- Locations of plasmidfinder database in the required format --> + <table name="plasmidfinder_database" comment_char="#"> + <columns>value, name, date, path</columns> + <file path="tool-data/plasmidfinder_database.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Feb 14 14:25:43 2023 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of plasmidfinder database in the required format --> + <table name="plasmidfinder_database" comment_char="#"> + <columns>value, name, date, path</columns> + <file path="${__HERE__}/test-data/plasmidfinder.loc.test"/> + </table> +</tables>