Mercurial > repos > iuc > gtdbtk_classify_wf

<tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>by placement in GTDB reference tree</description>
    <!-- Shared definitions. Presumably macros.xml supplies the "requirements" and
         "citations" macros expanded in this file, as well as the @TOOL_VERSION@,
         @VERSION_SUFFIX@ and @PROFILE@ tokens used on the tool element (TODO confirm). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- Cross-reference to the bio.tools registry entry for this software. -->
    <xrefs>
        <xref type="bio.tools">GTDB-Tk</xref>
    </xrefs>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
## Stage every input as a symlink under input_dir, run the GTDB-Tk classify
## workflow on that directory and, if requested, concatenate the run logs
## into the process_log output.
#import re

mkdir input_dir &&
mkdir output_dir &&
#for $i in $input:
    ## gtdbtk uses the file extension to determine the input format.
    #set ext = "." + $i.ext
    ## Any character that is not a word character, '-' or '.' becomes '_' so the
    ## link name is safe to use on the command line.
    #set input_identifier = re.sub('[^\w\-.]', '_', str($i.element_identifier)) + $ext
    ln -s '${i}' input_dir/'${input_identifier}' &&
#end for
## Quoted so that a database path containing spaces stays a single shell word.
export GTDBTK_DATA_PATH='$gtdbtk_db.fields.path' &&
gtdbtk classify_wf
--genome_dir input_dir
## ext keeps the value assigned for the last input of the loop above.
## NOTE(review): inputs with mixed datatypes (fasta and fasta.gz) would not all
## carry this extension; confirm that callers only submit one datatype per run.
--extension '$ext'
--out_dir output_dir
--cpus "\${GALAXY_SLOTS:-4}"
--min_perc_aa $advanced.min_perc_aa
$advanced.force
--min_af $advanced.min_af
## full_tree is declared inside the "advanced" section, so it has to be
## addressed through that section like the other advanced options.
$advanced.full_tree
## Required unless mash_db is available:
--skip_ani_screen

#if str($advanced.output_process_log) == 'yes':
    && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log'
#end if
    ]]></command>
    <inputs>
        <!-- One or more genome assemblies; each dataset is linked into the GTDB-Tk genome directory by the command. -->
        <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/>
        <!-- Only data table entries whose "version" column is exactly 220 or 226 are offered (see the regexp filter). -->
        <param name="gtdbtk_db" type="select" label="GTDB-Tk database" help="This version of GTDB-Tk requires GTDB version 220 or 226. Please contact your service administrator if neither version is available to select.">
            <options from_data_table="gtdbtk_database_versioned">
                <filter type="regexp" column="version" value="^22(0|6)$"/>
                <validator type="no_options" message="No locally cached GTDB-Tk database is available"/>
            </options>
        </param>
        <!-- Everything in this section is read in the command template as $advanced.<name>. -->
        <section name="advanced" title="Advanced options">
            <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/>
            <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/>
            <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/>
            <param argument="--full_tree" type="boolean" truevalue="--full_tree" falsevalue="" checked="false" label="Use the full tree" help="Use the unsplit bacterial tree for the classify step; this is the original GTDB-Tk approach (version &lt; 2) and requires more than 320 GB of RAM to load the reference tree (default: False)"/>
            <!-- Not a GTDB-Tk flag: controls whether the concatenated log files are exposed as the process_log output. -->
            <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/>
        </section>
    </inputs>
    <outputs>
        <!-- Only created when the advanced "Output process log file?" option is enabled. -->
        <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)">
            <filter>advanced['output_process_log']</filter>
        </data>
        <!-- Each collection below is filled from one sub-directory of output_dir;
             the part of the file name captured as "designation" becomes the element identifier. -->
        <collection name="output_align" type="list" format="fasta.gz" label="${tool.name} on ${on_string} (align)">
            <!-- Both dots are escaped so that only a literal ".fasta.gz" suffix matches. -->
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta\.gz" ext="fasta.gz" directory="output_dir/align"/>
        </collection>
        <!-- The name "output_identfy" (sic) is kept unchanged: renaming it would alter the tool's output interface. -->
        <collection name="output_identfy" type="list" format="tsv" label="${tool.name} on ${on_string} (identify)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="output_dir/identify"/>
        </collection>
        <collection name="output_classify" type="list" format="newick" label="${tool.name} on ${on_string} (classify)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tree" ext="newick" directory="output_dir/classify"/>
        </collection>
        <!-- Summary tables are collected from the top level of output_dir. -->
        <collection name="output_summary" type="list" format="tsv" label="${tool.name} on ${on_string} (summary)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="output_dir"/>
        </collection>
    </outputs>
    <tests>
        <!-- The test below works with the mock DB from https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/auxillary_files/gtdbtk_package/mockup_db/ but is kept commented out because even that mock DB is still a bit large -->
        <!-- <test expect_num_outputs="4">
            <param name="input" location="https://data.ace.uq.edu.au/public/gtdb/data/releases/release226/226.0/auxillary_files/gtdbtk_package/mockup_db/test_genome_G021783475.fna.gz" ftype="fasta.gz"/>
            <param name="gtdbtk_db" value="gtdbtk226"/>
            <section name="advanced">
                <param name="full_tree" value="true"/>
            </section>
            <output_collection name="output_summary" type="list" count="1">
                <element name="gtdbtk.bac120.summary" ftype="tsv">
                    <assert_contents>
                        <has_text text="user_genome"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_identfy" type="list" count="4">
                <element name="gtdbtk.ar53.markers_summary" ftype="tsv">
                    <assert_contents>
                        <has_text text="number_unique_genes"/>
                    </assert_contents>
                </element>
                <element name="gtdbtk.bac120.markers_summary" ftype="tsv">
                    <assert_contents>
                        <has_text text="test_genome_G021783475"/>
                    </assert_contents>
                </element>
                <element name="gtdbtk.failed_genomes" ftype="tsv">
                    <assert_contents>
                        <has_size value="0"/>
                    </assert_contents>
                </element>
                <element name="gtdbtk.translation_table_summary" ftype="tsv">
                    <assert_contents>
                        <has_text text="test_genome_G021783475"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_classify" type="list" count="1">
                <element name="gtdbtk.bac120.classify" ftype="newick">
                    <assert_contents>
                        <has_text text="GB_GCA_"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="output_align" type="list" count="2">
                <element name="gtdbtk.bac120.msa" ftype="fasta.gz" decompress="true">
                    <assert_contents>
                        <has_text text="GB_GCA_"/>
                    </assert_contents>
                </element>
                <element name="gtdbtk.bac120.user_msa" ftype="fasta.gz" decompress="true">
                    <assert_contents>
                        <has_text text="test_genome_G021783475"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test> -->

        <!-- GTDB-Tk databases are far too large to test currently, so the only
             active test just asserts that the job fails with exit code 1.
             NOTE(review): the gtdbtk_db select only offers data table entries whose
             version is 220 or 226; confirm which version the test data table
             assigns to "gtdbtk214" and that this test still fails for the intended reason. -->
        <test expect_failure="true">
            <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/>
            <param name="gtdbtk_db" value="gtdbtk214"/>
            <assert_stderr>
                <has_text text="Fatal error: Exit code 1"/>
            </assert_stderr>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes
based on the Genome Taxonomy Database (GTDB). It is designed to work with recent advances that allow hundreds or
thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also
be applied to isolate and single-cell genomes.

This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by
maximum-likelihood (ML) placement.  The classification workflow consists of three steps: identify, align, and
classify.

The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial
and 53 archaeal marker genes used for phylogenetic inference.  Multiple sequence alignments (MSA) are obtained by
aligning marker genes to their respective HMM model.

The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000
amino acids.

Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk
reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary
divergence, and/or average nucleotide identity (ANI) to reference genomes.

Results can be impacted by a lack of marker genes or contamination.
    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Wed, 10 Dec 2025 18:58:28 +0000
parents	202432c562b0
children	3a16872c1088