Mercurial > repos > iuc > data_manager_ncbi_fcs_gx_database_downloader
changeset 0:668bc86eb2ca draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 25c9d8d297d0e10f92e373f6a959274dedc10433
| author | iuc |
|---|---|
| date | Wed, 09 Oct 2024 08:52:45 +0000 |
| parents | |
| children | |
| files | data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases_ext.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases_ext.loc.sample tool-data/ncbi_fcs_gx_divisions.tsv.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
| diffstat | 11 files changed, 453 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Source: data_manager/data_manager_ncbi_fcs_gx_database_downloader.py
# (new file in this changeset; shown here without the diff "+" markers)
"""Galaxy data manager script for the NCBI FCS GX database.

The script downloads the database files listed in a manifest by running the
external ``sync_files.py`` helper, derives the set of GX divisions from the
downloaded ``<name>.taxa.tsv`` file, and writes both results as a data manager
JSON document with two data tables: ``ncbi_fcs_gx_databases_ext`` and
``ncbi_fcs_gx_divisions``.
"""

import argparse
import json
import os
import subprocess
import sys
import typing

# Payload for one data table: {"add": [<row dict>, ...]}.
DataTableEntries = typing.Dict[str, typing.List[typing.Dict[str, str]]]

MANIFEST_SUFFIX = ".manifest"


def main() -> None:
    """Parse the command line, run the download and write the JSON result."""
    opts = parse_args()

    # The download must run first: get_divisions() reads a file it produces.
    databases = sync_files(opts)
    divisions = get_divisions(opts)

    output_dict = {
        "data_tables": {
            "ncbi_fcs_gx_databases_ext": databases,
            "ncbi_fcs_gx_divisions": divisions,
        }
    }

    with open(opts.output_file, "w") as f:
        print(json.dumps(output_dict, sort_keys=True, indent=2), file=f)


def parse_args() -> argparse.Namespace:
    """Build the argument parser and parse ``sys.argv``.

    Returns:
        The parsed options. ``use_source_manifest`` and ``phone_home`` are
        booleans; every other option is a string.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--tag", required=True, help="Unique identifier for this database")
    parser.add_argument("--description", required=True, help="Description for this database")
    parser.add_argument("--source_manifest", required=True, help="Manifest file for this database")
    parser.add_argument("--use_source_manifest", action="store_true", help="Should the tool use the source manifest")
    parser.add_argument("--phone_home", action="store_true", help="Should phone home be enabled")
    parser.add_argument("--phone_home_label", default="", help="Phone home label")
    # Accepted for command-line compatibility; not read anywhere in this script.
    parser.add_argument("--node_cache_dir", required=True, help="Directory to copy database to local node")
    parser.add_argument("--output_file", required=True)
    parser.add_argument("--output_dir", required=True)

    return parser.parse_args()


def sync_files(opts: argparse.Namespace) -> DataTableEntries:
    """Download the database into ``opts.output_dir`` and describe it.

    Runs ``sync_files.py --mft <source_manifest> --dir <output_dir> get`` and
    returns the single row to add to the ``ncbi_fcs_gx_databases_ext`` table.

    Raises:
        subprocess.CalledProcessError: if ``sync_files.py`` exits non-zero.
            Its captured output is forwarded to this process's stdout/stderr
            before the exception propagates.
    """
    os.makedirs(opts.output_dir, exist_ok=True)

    args = [
        "sync_files.py",
        "--mft",
        opts.source_manifest,
        "--dir",
        opts.output_dir,
        "get",
    ]

    try:
        subprocess.run(args, capture_output=True, check=True)
    except subprocess.CalledProcessError as e:
        # The output was captured, so without this the reason for the failure
        # would never reach the job's logs.
        if e.stdout:
            sys.stdout.write(e.stdout.decode(errors="replace"))
        if e.stderr:
            sys.stderr.write(e.stderr.decode(errors="replace"))
        raise

    # NOTE(review): the data table configuration in this changeset declares a
    # "name" column, while this row carries "description" -- confirm the
    # description actually ends up in the table.
    entries_dict = {
        "add": [
            {
                "value": opts.tag,
                "description": opts.description,
                "source_manifest": opts.source_manifest,
                "use_source_manifest": "1" if opts.use_source_manifest else "0",
                "phone_home": "1" if opts.phone_home else "0",
                "phone_home_label": opts.phone_home_label,
                "local_manifest": opts.output_dir,
            }
        ]
    }

    return entries_dict


def get_divisions(opts: argparse.Namespace) -> DataTableEntries:
    """Collect the GX divisions present in the downloaded taxa file.

    The taxa file is ``<output_dir>/<manifest name without .manifest>.taxa.tsv``.
    Each data line has five tab-separated fields; the last one is the division
    in ``<top-level>:<name>`` form.

    Returns:
        Rows for the ``ncbi_fcs_gx_divisions`` table, sorted by description,
        including one extra row for unknown/unclassified samples.

    Raises:
        ValueError: if ``opts.source_manifest`` does not end with ".manifest".
        KeyError: if a division uses a top-level code without a description.
    """
    # descriptions for the top-level gx divisions
    top_level_description = {
        "anml": "Animals (Metazoa)",
        "arch": "Archaea",
        "fung": "Fungi",
        "plnt": "Plants (Viridiplantae)",
        "prok": "Bacteria",
        "prst": "Protists (other Eukaryota)",
        "synt": "Synthetic",
        "virs": "Virus",
    }

    # get the pathname for the taxa file
    manifest_filename = os.path.basename(opts.source_manifest)
    # Raise instead of assert: assertions are stripped when Python runs with -O.
    if not manifest_filename.lower().endswith(MANIFEST_SUFFIX):
        raise ValueError('source_manifest does not end with ".manifest"')
    manifest_tag = manifest_filename[: -len(MANIFEST_SUFFIX)]
    taxa_pathname = os.path.join(opts.output_dir, f"{manifest_tag}.taxa.tsv")

    gx_divisions = set()
    with open(taxa_pathname, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            # Skip comment/header lines and blank lines (e.g. a trailing one).
            if not line or line.startswith("#"):
                continue
            _tax_id, _species, _common_name, _blast_div, div = line.split("\t", 4)
            gx_divisions.add(div)

    elements = []
    for division in gx_divisions:
        top, bottom = division.split(":", 1)
        description = f"{top_level_description[top]} - {bottom}"
        elements.append((description, division))

    # add an element to support unknown/unclassified samples
    elements.append(("Unknown / Unclassified", "unkn:unknown"))

    # Sorting the (description, division) tuples orders rows by description.
    entries_dict: DataTableEntries = {
        "add": [
            {"value": gx_div, "tag": opts.tag, "description": name}
            for name, gx_div in sorted(elements)
        ]
    }

    return entries_dict


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,135 @@ +<tool id="data_manager_ncbi_fcs_gx_database_downloader" name="NCBI FCS GX database downloader" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" tool_type="manage_data" profile="@PROFILE@"> + <description>Download the NCBI FCS GX database</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="edam_ontology"/> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py' + --tag '$tag' + --description '$description' + --source_manifest '$source_manifest' +#if str($use_source_manifest) == "true" + --use_source_manifest +#end if +#if str($phone_home) == "true" + --phone_home + --phone_home_label '$phone_home_label' +#end if + --node_cache_dir '\${TMPDIR}' + --output_file '$output_file' + --output_dir '$output_file.extra_files_path' + ]]></command> + <inputs> + <param name="tag" type="text" optional="false" label="Unique identifier for this database"/> + <param name="description" type="text" optional="false" label="Description for this database"/> + <param name="source_manifest" type="text" optional="false" label="Manifest file for this database"/> + <param name="use_source_manifest" type="boolean" label="Should the tool use the source manifest"/> + <param name="phone_home" type="boolean" label="Should phone home be enabled"/> + <param name="phone_home_label" type="text" label="Phone home label"/> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <test> + <param name="tag" value="test"/> + <param name="description" value="Test Database"/> + <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/> + <param name="use_source_manifest" value="true"/> + <param name="phone_home" 
value="false"/> + <output name="output_file" file="test.json" compare="re_match"/> + </test> + </tests> + <help><![CDATA[ + +Overview +======== + +The NCBI FCS GX tool requires a curated reference database as described in the paper, `Rapid and sensitive detection of genome contamination at scale with FCS‑GX <https://doi.org/10.1186/s13059-024-03198-7>`_. The current database is about 470 GiB in total. Each database includes a json-formatted manifest file which contains details about each database file. A sample manifest file can be found below. + +The data manager downloads the GX database given a manifest file. It takes six inputs: + +1. **tag** - unique identifier for this database chosen by the Galaxy Admin +2. **description** - description for this database seen and selectable by the user when running the NCBI FCS GX tool +3. **source_manifest** - manifest file for this database (url or filesystem path) +4. **use_source_manifest** - when true, the compute node will download the GX database itself instead of using the local copy +5. **phone_home** - when true, the NCBI FCS GX tool will send analytics to NCBI about the run. The code for this can be seen `here <https://github.com/ncbi/fcs-gx/blob/release/scripts/run_gx.py#L79-L115>`_. It sends the following information: + + 1. version of the gx executable + 2. build date of the GX database + 3. the platform the software is running on + 4. the version of the Python interpreter + 5. the size of physical memory in GiB + 6. the duration of the run + 7. the run’s exit status (0 for success, otherwise 1) + 8. **phone_home_label** + +6. **phone_home_label** - arbitrary string set by the Galaxy Admin to identify the analytics data sent to NCBI + +The data manager also creates a lookup table for the NCBI FCS GX tool based on the `taxa.tsv <https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.taxa.tsv>`_ file in the database. + +Sample Manifest File +==================== + +.. 
code-block:: JSON + + { + "version": 1, + "totalFiles": 8, + "timeStamp": "2023-01-24T16:18:22.220812", + "fileDetails": [ + { + "fileName": "all.blast_div.tsv.gz", + "fileSize": 8241107, + "hashAlgorithm": "md5", + "hashValue": "a6b08c85c46da76548fff6ed220f8f9d" + }, + { + "fileName": "all.assemblies.tsv", + "fileSize": 8887448, + "hashAlgorithm": "md5", + "hashValue": "441beceb8c467593fa6b87a071c5ec6b" + }, + { + "fileName": "all.taxa.tsv", + "fileSize": 6385518, + "hashAlgorithm": "md5", + "hashValue": "c94d1fc80f81dbbf30b114d4cdaf29ad" + }, + { + "fileName": "all.gxs", + "fileSize": 177317125807, + "hashAlgorithm": "md5", + "hashValue": "da205626565a61be6dfd8c9b5ed1a9b7" + }, + { + "fileName": "all.meta.jsonl", + "fileSize": 59, + "hashAlgorithm": "md5", + "hashValue": "c2096cdb8106d44a310052b06a23836c" + }, + { + "fileName": "all.gxi", + "fileSize": 321216733352, + "hashAlgorithm": "md5", + "hashValue": "36bf346693e2b9de693de38efe219aa7" + }, + { + "fileName": "all.seq_info.tsv.gz", + "fileSize": 22549956, + "hashAlgorithm": "md5", + "hashValue": "6a760eed5a94aaf46d4dd8c75f370875" + }, + { + "fileName": "all.README.txt", + "fileSize": 187, + "hashAlgorithm": "md5", + "hashValue": "7deb2d4fa5241f95a25073fb43147cb1" + } + ] + } + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/macros.xml Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,20 @@ +<macros> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement> + </requirements> + </xml> + <token name="@TOOL_VERSION@">0.5.4</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">21.05</token> + <xml name="edam_ontology"> + <edam_operations> + <edam_operation>operation_3187</edam_operation> + </edam_operations> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1101/2023.06.02.543519</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,33 @@ +<data_managers> + <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader"> + <data_table name="ncbi_fcs_gx_databases_ext"> + <output> + <column name="value" /> + <column name="name" /> + <column name="source_manifest" /> + <column name="use_source_manifest" /> + <column name="phone_home" /> + <column name="phone_home_label" /> + <column name="local_manifest" output_ref="output_file"> + <move type="directory"> + <source>${local_manifest}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases_ext/${value}</target> + </move> + <value_translation><![CDATA[ +#import os +#set manifest_filename = os.path.basename($source_manifest) +$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases_ext/$value/$manifest_filename + ]]></value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + <data_table name="ncbi_fcs_gx_divisions"> + <output> + <column name="value" /> + <column name="tag" /> + <column name="name" /> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_databases_ext.loc Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,2 @@ +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +test https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest 1 0 /scratch/rico/galaxy/tool-data/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/ncbi_fcs_gx_divisions.tsv Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,3 @@ +#tag gx_div description +prok:CFB group bacteria test +unkn:unknown test
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test.json Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,31 @@ +\{ + "data_tables": \{ + "ncbi_fcs_gx_databases_ext": \{ + "add": \[ + \{ + "description": "Test Database", + "local_manifest": ".+", + "phone_home": "0", + "phone_home_label": "", + "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest", + "use_source_manifest": "1", + "value": "test" + \} + \] + \}, + "ncbi_fcs_gx_divisions": \{ + "add": \[ + \{ + "description": "Bacteria - CFB group bacteria", + "tag": "test", + "value": "prok:CFB group bacteria" + \}, + \{ + "description": "Unknown / Unclassified", + "tag": "test", + "value": "unkn:unknown" + \} + \] + \} + \} +\}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_databases_ext.loc.sample Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,19 @@ +# When phone_home is set to "1", the NCBI FCS GX tool will send analytics +# to NCBI about the run. The following information is sent: +# +# 1. version of the gx executable +# 2. build date of the GX database +# 3. the platform the software is running on +# 4. the version of the Python interpreter +# 5. the size of physical memory in GiB +# 6. the duration of the run +# 7. the run’s exit status (0 for success, otherwise 1) +# 8. phone_home_label +# +# The phone_home_label is an arbitrary string sent to NCBI to identify +# data. For instance, all NCBI FCS GX runs on usegalaxy.org use the +# phone_home_label "usegalaxy.org" +# +#tag description source_manifest use_source_manifest phone_home phone_home_label local_manifest +#latest Full GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/latest/all.manifest +#test Test GX Database https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest 0 1 usegalaxy.org /big/data/dir/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/ncbi_fcs_gx_divisions.tsv.sample Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,64 @@ +#gx_div tag description +#anml:amphibians latest Animals (Metazoa) - amphibians +#anml:basal metazoans latest Animals (Metazoa) - basal metazoans +#anml:birds latest Animals (Metazoa) - birds +#anml:brachiopods latest Animals (Metazoa) - brachiopods +#anml:crustaceans latest Animals (Metazoa) - crustaceans +#anml:echinoderms latest Animals (Metazoa) - echinoderms +#anml:fishes latest Animals (Metazoa) - fishes +#anml:insects latest Animals (Metazoa) - insects +#anml:mammals latest Animals (Metazoa) - mammals +#anml:marsupials latest Animals (Metazoa) - marsupials +#anml:molluscs latest Animals (Metazoa) - molluscs +#anml:nematodes latest Animals (Metazoa) - nematodes +#anml:primates latest Animals (Metazoa) - primates +#anml:reptiles latest Animals (Metazoa) - reptiles +#anml:rodents latest Animals (Metazoa) - rodents +#anml:rotifers latest Animals (Metazoa) - rotifers +#anml:tardigrades latest Animals (Metazoa) - tardigrades +#anml:worms latest Animals (Metazoa) - worms +#arch:archaea latest Archaea - archaea +#prok:CFB group bacteria latest Bacteria - CFB group bacteria +#prok:GNS bacteria latest Bacteria - GNS bacteria +#prok:a-proteobacteria latest Bacteria - a-proteobacteria +#prok:actinobacteria latest Bacteria - actinobacteria +#prok:aquificales latest Bacteria - aquificales +#prok:b-proteobacteria latest Bacteria - b-proteobacteria +#prok:bacteria latest Bacteria - bacteria +#prok:chlamydias latest Bacteria - chlamydias +#prok:cyanobacteria latest Bacteria - cyanobacteria +#prok:d-proteobacteria latest Bacteria - d-proteobacteria +#prok:firmicutes latest Bacteria - firmicutes +#prok:fusobacteria latest Bacteria - fusobacteria +#prok:g-proteobacteria latest Bacteria - g-proteobacteria +#prok:green sulfur bacteria latest Bacteria - green sulfur bacteria +#prok:high GC Gram+ latest Bacteria - high GC Gram+ 
+#prok:mycoplasmas latest Bacteria - mycoplasmas +#prok:planctomycetes latest Bacteria - planctomycetes +#prok:proteobacteria latest Bacteria - proteobacteria +#prok:spirochetes latest Bacteria - spirochetes +#prok:thermotogales latest Bacteria - thermotogales +#prok:verrucomicrobia latest Bacteria - verrucomicrobia +#fung:ascomycetes latest Fungi - ascomycetes +#fung:basidiomycetes latest Fungi - basidiomycetes +#fung:budding yeasts latest Fungi - budding yeasts +#fung:chytrids latest Fungi - chytrids +#fung:fungi latest Fungi - fungi +#fung:microsporidians latest Fungi - microsporidians +#plnt:green algae latest Plants (Viridiplantae) - green algae +#plnt:mosses latest Plants (Viridiplantae) - mosses +#plnt:plants latest Plants (Viridiplantae) - plants +#prst:algae latest Protists (other Eukaryota) - algae +#prst:alveolates latest Protists (other Eukaryota) - alveolates +#prst:cellular slime molds latest Protists (other Eukaryota) - cellular slime molds +#prst:cercozoans latest Protists (other Eukaryota) - cercozoans +#prst:choanoflagellates latest Protists (other Eukaryota) - choanoflagellates +#prst:euglenoids latest Protists (other Eukaryota) - euglenoids +#prst:monads latest Protists (other Eukaryota) - monads +#prst:protists latest Protists (other Eukaryota) - protists +#prst:slime nets latest Protists (other Eukaryota) - slime nets +#synt:synthetic latest Synthetic - synthetic +#unkn:unknown latest Unknown / Unclassified +#virs:eukaryotic viruses latest Virus - eukaryotic viruses +#virs:prokaryotic viruses latest Virus - prokaryotic viruses +#virs:viruses latest Virus - viruses
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Locations of NCBI FCS GX databases --> + <table name="ncbi_fcs_gx_databases_ext" comment_char="#"> + <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns> + <file path="tool-data/ncbi_fcs_gx_databases_ext.loc" /> + </table> + <!-- NCBI FCS GX divisions --> + <table name="ncbi_fcs_gx_divisions" comment_char="#"> + <columns>value, tag, name</columns> + <file path="tool-data/ncbi_fcs_gx_divisions.tsv" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Wed Oct 09 08:52:45 2024 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Locations of NCBI FCS GX databases --> + <table name="ncbi_fcs_gx_databases_ext" comment_char="#"> + <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns> + <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases_ext.loc" /> + </table> + <!-- NCBI FCS GX divisions --> + <table name="ncbi_fcs_gx_divisions" comment_char="#"> + <columns>value, tag, name</columns> + <file path="${__HERE__}/test-data/ncbi_fcs_gx_divisions.tsv" /> + </table> +</tables>
