changeset 0:668bc86eb2ca draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/data_managers/data_manager_ncbi_fcs_gx_database_downloader commit 25c9d8d297d0e10f92e373f6a959274dedc10433
author iuc
date Wed, 09 Oct 2024 08:52:45 +0000
parents
children
files data_manager/data_manager_ncbi_fcs_gx_database_downloader.py data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml data_manager/macros.xml data_manager_conf.xml test-data/ncbi_fcs_gx_databases_ext.loc test-data/ncbi_fcs_gx_divisions.tsv test-data/test.json tool-data/ncbi_fcs_gx_databases_ext.loc.sample tool-data/ncbi_fcs_gx_divisions.tsv.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test
diffstat 11 files changed, 453 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.py	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+import argparse
+import json
+import os
+import subprocess
+import typing
+
+
+def main() -> None:
+    opts = parse_args()
+
+    output_dict = {
+        "data_tables": {
+            "ncbi_fcs_gx_databases_ext": sync_files(opts),
+            "ncbi_fcs_gx_divisions": get_divisions(opts),
+        }
+    }
+
+    with open(opts.output_file, "w") as f:
+        print(json.dumps(output_dict, sort_keys=True, indent=2), file=f)
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--tag", required=True, help="Unique identifier for this database")
+    parser.add_argument("--description", required=True, help="Description for this database")
+    parser.add_argument("--source_manifest", required=True, help="Manifest file for this database")
+    parser.add_argument("--use_source_manifest", action="store_true", help="Should the tool use the source manifest")
+    parser.add_argument("--phone_home", action="store_true", help="Should phone home be enabled")
+    parser.add_argument("--phone_home_label", default="", help="Phone home label")
+    parser.add_argument("--node_cache_dir", required=True, help="Directory to copy database to local node")
+    parser.add_argument("--output_file", required=True)
+    parser.add_argument("--output_dir", required=True)
+
+    return parser.parse_args()
+
+
+def sync_files(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]:
+    os.makedirs(opts.output_dir, exist_ok=True)
+
+    args = [
+        "sync_files.py",
+        "--mft",
+        opts.source_manifest,
+        "--dir",
+        opts.output_dir,
+        "get",
+    ]
+
+    try:
+        subprocess.run(args, capture_output=True, check=True)
+    except subprocess.CalledProcessError:
+        raise
+
+    entries_dict = {
+        "add": [
+            {
+                "value": opts.tag,
+                "description": opts.description,
+                "source_manifest": opts.source_manifest,
+                "use_source_manifest": "1" if opts.use_source_manifest else "0",
+                "phone_home": "1" if opts.phone_home else "0",
+                "phone_home_label": opts.phone_home_label,
+                "local_manifest": opts.output_dir,
+            }
+        ]
+    }
+
+    return entries_dict
+
+
+def get_divisions(opts: argparse.Namespace) -> typing.Dict[str, typing.List[typing.Dict[str, str]]]:
+    # descriptions for the top-level gx divisions
+    top_level_description = {
+        "anml": "Animals (Metazoa)",
+        "arch": "Archaea",
+        "fung": "Fungi",
+        "plnt": "Plants (Viridiplantae)",
+        "prok": "Bacteria",
+        "prst": "Protists (other Eukaryota)",
+        "synt": "Synthetic",
+        "virs": "Virus",
+    }
+
+    # get the pathname for the taxa file
+    manifest_filename = os.path.basename(opts.source_manifest)
+    assert manifest_filename.lower().endswith(
+        ".manifest"
+    ), 'source_manifest does not end with ".manifest"'
+    manifest_tag = manifest_filename[:-9]
+    taxa_pathname = os.path.join(opts.output_dir, f"{manifest_tag}.taxa.tsv")
+
+    gx_divisions = set()
+    with open(taxa_pathname) as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            line = line.rstrip("\n")
+            tax_id, species, common_name, blast_div, div = line.split("\t", 4)
+            gx_divisions.add(div)
+
+    elements = []
+    for division in gx_divisions:
+        top, bottom = division.split(":", 1)
+        description = f"{top_level_description[top]} - {bottom}"
+        elements.append((description, division))
+
+    # add an element to support unknown/unclassified samples
+    elements.append(("Unknown / Unclassified", "unkn:unknown"))
+
+    entries_dict: typing.Dict[str, typing.List[typing.Dict[str, str]]] = {"add": []}
+
+    for name, gx_div in sorted(elements):
+        entries_dict["add"].append({"value": gx_div, "tag": opts.tag, "description": name})
+
+    return entries_dict
+
+
+if __name__ == "__main__":
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,135 @@
+<tool id="data_manager_ncbi_fcs_gx_database_downloader" name="NCBI FCS GX database downloader" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" tool_type="manage_data" profile="@PROFILE@">
+    <description>Download the NCBI FCS GX database</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="edam_ontology"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+python '$__tool_directory__/data_manager_ncbi_fcs_gx_database_downloader.py'
+    --tag '$tag'
+    --description '$description'
+    --source_manifest '$source_manifest'
+#if str($use_source_manifest) == "true"
+    --use_source_manifest
+#end if
+#if str($phone_home) == "true"
+    --phone_home
+    --phone_home_label '$phone_home_label'
+#end if
+    --node_cache_dir '\${TMPDIR}'
+    --output_file '$output_file'
+    --output_dir '$output_file.extra_files_path'
+    ]]></command>
+    <inputs>
+        <param name="tag" type="text" optional="false" label="Unique identifier for this database"/>
+        <param name="description" type="text" optional="false" label="Description for this database"/>
+        <param name="source_manifest" type="text" optional="false" label="Manifest file for this database"/>
+        <param name="use_source_manifest" type="boolean" label="Should the tool use the source manifest"/>
+        <param name="phone_home" type="boolean" label="Should phone home be enabled"/>
+        <param name="phone_home_label" type="text" label="Phone home label"/>
+    </inputs>
+    <outputs>
+        <data name="output_file" format="data_manager_json"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="tag" value="test"/>
+            <param name="description" value="Test Database"/>
+            <param name="source_manifest" value="https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest"/>
+            <param name="use_source_manifest" value="true"/>
+            <param name="phone_home" value="false"/>
+            <output name="output_file" file="test.json" compare="re_match"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+Overview
+========
+
+The NCBI FCS GX tool requires a curated reference database as described in the paper, `Rapid and sensitive detection of genome contamination at scale with FCS‑GX <https://doi.org/10.1186/s13059-024-03198-7>`_. The current database is about 470 GiB in total. Each database includes a JSON-formatted manifest file which contains details about each database file.  A sample manifest file can be found below.
+
+The data manager downloads the GX database given a manifest file.  It takes six inputs:
+
+1. **tag** - unique identifier for this database chosen by the Galaxy Admin
+2. **description** - description for this database, seen and selectable by the user when running the NCBI FCS GX tool
+3. **source_manifest** - manifest file for this database (url or filesystem path)
+4. **use_source_manifest** - when true, the compute node will download the GX database itself instead of using the local copy
+5. **phone_home** - when true, the NCBI FCS GX tool will send analytics to NCBI about the run. The code for this can be seen `here <https://github.com/ncbi/fcs-gx/blob/release/scripts/run_gx.py#L79-L115>`_. It sends the following information:
+
+   1. version of the gx executable
+   2. build date of the GX database
+   3. the platform the software is running on
+   4. the version of the Python interpreter
+   5. the size of physical memory in GiB
+   6. the duration of the run
+   7. the run’s exit status (0 for success, otherwise 1)
+   8. **phone_home_label**
+
+6. **phone_home_label** - arbitrary string set by the Galaxy Admin to identify the analytics data sent to NCBI
+
+The data manager also creates a lookup table for the NCBI FCS GX tool based on the `taxa.tsv <https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.taxa.tsv>`_ file in the database.
+
+Sample Manifest File
+====================
+
+.. code-block:: JSON
+
+   {
+     "version": 1,
+     "totalFiles": 8,
+     "timeStamp": "2023-01-24T16:18:22.220812",
+     "fileDetails": [
+       {
+         "fileName": "all.blast_div.tsv.gz",
+         "fileSize": 8241107,
+         "hashAlgorithm": "md5",
+         "hashValue": "a6b08c85c46da76548fff6ed220f8f9d"
+       },
+       {
+         "fileName": "all.assemblies.tsv",
+         "fileSize": 8887448,
+         "hashAlgorithm": "md5",
+         "hashValue": "441beceb8c467593fa6b87a071c5ec6b"
+       },
+       {
+         "fileName": "all.taxa.tsv",
+         "fileSize": 6385518,
+         "hashAlgorithm": "md5",
+         "hashValue": "c94d1fc80f81dbbf30b114d4cdaf29ad"
+       },
+       {
+         "fileName": "all.gxs",
+         "fileSize": 177317125807,
+         "hashAlgorithm": "md5",
+         "hashValue": "da205626565a61be6dfd8c9b5ed1a9b7"
+       },
+       {
+         "fileName": "all.meta.jsonl",
+         "fileSize": 59,
+         "hashAlgorithm": "md5",
+         "hashValue": "c2096cdb8106d44a310052b06a23836c"
+       },
+       {
+         "fileName": "all.gxi",
+         "fileSize": 321216733352,
+         "hashAlgorithm": "md5",
+         "hashValue": "36bf346693e2b9de693de38efe219aa7"
+       },
+       {
+         "fileName": "all.seq_info.tsv.gz",
+         "fileSize": 22549956,
+         "hashAlgorithm": "md5",
+         "hashValue": "6a760eed5a94aaf46d4dd8c75f370875"
+       },
+       {
+         "fileName": "all.README.txt",
+         "fileSize": 187,
+         "hashAlgorithm": "md5",
+         "hashValue": "7deb2d4fa5241f95a25073fb43147cb1"
+       }
+     ]
+   }
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager/macros.xml	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,20 @@
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">ncbi-fcs-gx</requirement>
+        </requirements>
+    </xml>
+    <token name="@TOOL_VERSION@">0.5.4</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">21.05</token>
+    <xml name="edam_ontology">
+        <edam_operations>
+            <edam_operation>operation_3187</edam_operation>
+        </edam_operations>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1101/2023.06.02.543519</citation>
+        </citations>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_conf.xml	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,33 @@
+<data_managers>
+    <data_manager tool_file="data_manager/data_manager_ncbi_fcs_gx_database_downloader.xml" id="data_manager_ncbi_fcs_gx_database_downloader">
+        <data_table name="ncbi_fcs_gx_databases_ext">
+            <output>
+                <column name="value" />
+                <column name="name" />
+                <column name="source_manifest" />
+                <column name="use_source_manifest" />
+                <column name="phone_home" />
+                <column name="phone_home_label" />
+                <column name="local_manifest" output_ref="output_file">
+                    <move type="directory">
+                        <source>${local_manifest}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">ncbi_fcs_gx_databases_ext/${value}</target>
+                    </move>
+                    <value_translation><![CDATA[
+#import os
+#set manifest_filename = os.path.basename($source_manifest)
+$GALAXY_DATA_MANAGER_DATA_PATH/ncbi_fcs_gx_databases_ext/$value/$manifest_filename
+                    ]]></value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+        <data_table name="ncbi_fcs_gx_divisions">
+            <output>
+                <column name="value" />
+                <column name="tag" />
+                <column name="name" />
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_fcs_gx_databases_ext.loc	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,2 @@
+#tag	description	source_manifest	use_source_manifest	phone_home	phone_home_label	local_manifest
+test		https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest	1	0		/scratch/rico/galaxy/tool-data/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/ncbi_fcs_gx_divisions.tsv	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,3 @@
+#gx_div	tag	description
+prok:CFB group bacteria	test	
+unkn:unknown	test	
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test.json	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,31 @@
+\{
+  "data_tables": \{
+    "ncbi_fcs_gx_databases_ext": \{
+      "add": \[
+        \{
+          "description": "Test Database",
+          "local_manifest": ".+",
+          "phone_home": "0",
+          "phone_home_label": "",
+          "source_manifest": "https://ncbi-fcs-gx.s3.amazonaws.com/gxdb/test-only/test-only.manifest",
+          "use_source_manifest": "1",
+          "value": "test"
+        \}
+      \]
+    \},
+    "ncbi_fcs_gx_divisions": \{
+      "add": \[
+        \{
+          "description": "Bacteria - CFB group bacteria",
+          "tag": "test",
+          "value": "prok:CFB group bacteria"
+        \},
+        \{
+          "description": "Unknown / Unclassified",
+          "tag": "test",
+          "value": "unkn:unknown"
+        \}
+      \]
+    \}
+  \}
+\}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_fcs_gx_databases_ext.loc.sample	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,19 @@
+# When phone_home is set to "1", the NCBI FCS GX tool will send analytics
+# to NCBI about the run.  The following information is sent:
+#
+#  1. version of the gx executable
+#  2. build date of the GX database
+#  3. the platform the software is running on
+#  4. the version of the Python interpreter
+#  5. the size of physical memory in GiB
+#  6. the duration of the run
+#  7. the run’s exit status (0 for success, otherwise 1)
+#  8. phone_home_label
+#
+# The phone_home_label is an arbitrary string sent to NCBI to identify
+# data. For instance, all NCBI FCS GX runs on usegalaxy.org use the
+# phone_home_label "usegalaxy.org"
+#
+#tag	description	source_manifest	use_source_manifest	phone_home	phone_home_label	local_manifest
+#latest	Full GX Database	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/latest/all.manifest	0	1	usegalaxy.org	/big/data/dir/ncbi_fcs_gx_databases_ext/latest/all.manifest
+#test	Test GX Database	https://ftp.ncbi.nlm.nih.gov/genomes/TOOLS/FCS/database/test-only/test-only.manifest	0	1	usegalaxy.org	/big/data/dir/ncbi_fcs_gx_databases_ext/test/test-only.manifest
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/ncbi_fcs_gx_divisions.tsv.sample	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,64 @@
+#gx_div	tag	description
+#anml:amphibians	latest	Animals (Metazoa) - amphibians
+#anml:basal metazoans	latest	Animals (Metazoa) - basal metazoans
+#anml:birds	latest	Animals (Metazoa) - birds
+#anml:brachiopods	latest	Animals (Metazoa) - brachiopods
+#anml:crustaceans	latest	Animals (Metazoa) - crustaceans
+#anml:echinoderms	latest	Animals (Metazoa) - echinoderms
+#anml:fishes	latest	Animals (Metazoa) - fishes
+#anml:insects	latest	Animals (Metazoa) - insects
+#anml:mammals	latest	Animals (Metazoa) - mammals
+#anml:marsupials	latest	Animals (Metazoa) - marsupials
+#anml:molluscs	latest	Animals (Metazoa) - molluscs
+#anml:nematodes	latest	Animals (Metazoa) - nematodes
+#anml:primates	latest	Animals (Metazoa) - primates
+#anml:reptiles	latest	Animals (Metazoa) - reptiles
+#anml:rodents	latest	Animals (Metazoa) - rodents
+#anml:rotifers	latest	Animals (Metazoa) - rotifers
+#anml:tardigrades	latest	Animals (Metazoa) - tardigrades
+#anml:worms	latest	Animals (Metazoa) - worms
+#arch:archaea	latest	Archaea - archaea
+#prok:CFB group bacteria	latest	Bacteria - CFB group bacteria
+#prok:GNS bacteria	latest	Bacteria - GNS bacteria
+#prok:a-proteobacteria	latest	Bacteria - a-proteobacteria
+#prok:actinobacteria	latest	Bacteria - actinobacteria
+#prok:aquificales	latest	Bacteria - aquificales
+#prok:b-proteobacteria	latest	Bacteria - b-proteobacteria
+#prok:bacteria	latest	Bacteria - bacteria
+#prok:chlamydias	latest	Bacteria - chlamydias
+#prok:cyanobacteria	latest	Bacteria - cyanobacteria
+#prok:d-proteobacteria	latest	Bacteria - d-proteobacteria
+#prok:firmicutes	latest	Bacteria - firmicutes
+#prok:fusobacteria	latest	Bacteria - fusobacteria
+#prok:g-proteobacteria	latest	Bacteria - g-proteobacteria
+#prok:green sulfur bacteria	latest	Bacteria - green sulfur bacteria
+#prok:high GC Gram+	latest	Bacteria - high GC Gram+
+#prok:mycoplasmas	latest	Bacteria - mycoplasmas
+#prok:planctomycetes	latest	Bacteria - planctomycetes
+#prok:proteobacteria	latest	Bacteria - proteobacteria
+#prok:spirochetes	latest	Bacteria - spirochetes
+#prok:thermotogales	latest	Bacteria - thermotogales
+#prok:verrucomicrobia	latest	Bacteria - verrucomicrobia
+#fung:ascomycetes	latest	Fungi - ascomycetes
+#fung:basidiomycetes	latest	Fungi - basidiomycetes
+#fung:budding yeasts	latest	Fungi - budding yeasts
+#fung:chytrids	latest	Fungi - chytrids
+#fung:fungi	latest	Fungi - fungi
+#fung:microsporidians	latest	Fungi - microsporidians
+#plnt:green algae	latest	Plants (Viridiplantae) - green algae
+#plnt:mosses	latest	Plants (Viridiplantae) - mosses
+#plnt:plants	latest	Plants (Viridiplantae) - plants
+#prst:algae	latest	Protists (other Eukaryota) - algae
+#prst:alveolates	latest	Protists (other Eukaryota) - alveolates
+#prst:cellular slime molds	latest	Protists (other Eukaryota) - cellular slime molds
+#prst:cercozoans	latest	Protists (other Eukaryota) - cercozoans
+#prst:choanoflagellates	latest	Protists (other Eukaryota) - choanoflagellates
+#prst:euglenoids	latest	Protists (other Eukaryota) - euglenoids
+#prst:monads	latest	Protists (other Eukaryota) - monads
+#prst:protists	latest	Protists (other Eukaryota) - protists
+#prst:slime nets	latest	Protists (other Eukaryota) - slime nets
+#synt:synthetic	latest	Synthetic - synthetic
+#unkn:unknown	latest	Unknown / Unclassified
+#virs:eukaryotic viruses	latest	Virus - eukaryotic viruses
+#virs:prokaryotic viruses	latest	Virus - prokaryotic viruses
+#virs:viruses	latest	Virus - viruses
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Locations of NCBI FCS GX databases -->
+    <table name="ncbi_fcs_gx_databases_ext" comment_char="#">
+        <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns>
+        <file path="tool-data/ncbi_fcs_gx_databases_ext.loc" />
+    </table>
+    <!-- NCBI FCS GX divisions -->
+    <table name="ncbi_fcs_gx_divisions" comment_char="#">
+        <columns>value, tag, name</columns>
+        <file path="tool-data/ncbi_fcs_gx_divisions.tsv" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.test	Wed Oct 09 08:52:45 2024 +0000
@@ -0,0 +1,12 @@
+<tables>
+    <!-- Locations of NCBI FCS GX databases -->
+    <table name="ncbi_fcs_gx_databases_ext" comment_char="#">
+        <columns>value, name, source_manifest, use_source_manifest, phone_home, phone_home_label, local_manifest</columns>
+        <file path="${__HERE__}/test-data/ncbi_fcs_gx_databases_ext.loc" />
+    </table>
+    <!-- NCBI FCS GX divisions -->
+    <table name="ncbi_fcs_gx_divisions" comment_char="#">
+        <columns>value, tag, name</columns>
+        <file path="${__HERE__}/test-data/ncbi_fcs_gx_divisions.tsv" />
+    </table>
+</tables>