view claraguess.xml @ 0:52d4151e00d8 draft default tip

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
author ecology
date Wed, 28 May 2025 10:12:06 +0000
parents
children
line wrap: on
line source

<tool id="ClaraGuess" name="Clara Estimate and Clustering" version="0.1.2" profile="23.2">
    <description>Environmental clustering using CLARA and BRT predictions</description>

    <!-- Packages made available to the Rscript call in the command section.
         Every version is pinned exactly so that the resolved environment is reproducible. -->
    <requirements>
        <requirement type="package" version="4.3.3">r-base</requirement>
        <requirement type="package" version="2.1.8.1">r-cluster</requirement>
        <requirement type="package" version="1.1.4">r-dplyr</requirement>
        <requirement type="package" version="2.0.0">r-tidyverse</requirement>
    </requirements>

    <!-- Runs claraguess.R from the tool directory with eleven positional arguments, in this order:
           1  enviro               environmental data file
           2  preds                BRT prediction files (multiple="true" parameter; presumably expanded to a
                                   comma-separated list of paths, to be confirmed against claraguess.R)
           3  taxas                list of taxa
           4  type                 "fixed" or "auto"
           5  k                    value of k
           6  metric               dissimilarity metric
           7  samples              number of samples for CLARA
           8  data_cluster         output dataset path
           9  silhouette_plot      output dataset path
           10 sih_scores           output dataset path when type is "auto", otherwise the literal string NA
                                   (that output is filtered out unless type is "auto")
           11 clustered_taxas_env  output dataset path
         The job is marked as failed on a non-zero exit code (detect_errors="exit_code").
         NOTE(review): the order must stay in sync with the argument parsing in claraguess.R, which is not shown here. -->
    <command detect_errors="exit_code"><![CDATA[
Rscript '$__tool_directory__/claraguess.R'
    '$enviro'
    '$preds'
    '$taxas'
    '$type'
    '$k'
    '$metric'
    '$samples'
    '$data_cluster'
    '$silhouette_plot'
#if str($type) == "auto":
    '$sih_scores'
#else:
    'NA'
#end if
    '$clustered_taxas_env'
    ]]></command>

    <!-- User-facing parameters. Each one is forwarded positionally to claraguess.R by the command section,
         so parameter names must not change without updating the command template. -->
    <inputs>
        <param name="enviro" type="data" format="tabular" label="Environmental data (tabular)"/>
        <param name="preds" type="data" format="tabular" multiple="true" label="BRT prediction files (collection of tabular)"/>
        <param name="taxas" type="data" format="txt" label="List of taxa (from TaxaSeeker)"/>

        <!-- Decides how "k" is interpreted and whether the sih_scores output is produced (see the output filter). -->
        <param name="type" type="select" label="k is ...">
            <option value="fixed">the number of clusters (fixed)</option>
            <option value="auto">the maximum number of clusters (automatic)</option>
        </param>

        <!-- k is always placed on the command line, so it must never be empty: it is a required integer
             with a default value and a lower bound instead of an optional field without a value. -->
        <param name="k" type="integer" value="2" min="1" label="Value of k"
               help="Number of clusters in fixed mode, or maximum number of clusters in automatic mode."/>

        <param name="metric" type="select" label="Dissimilarity metric">
            <option value="manhattan">Manhattan</option>
            <option value="jaccard">Jaccard</option>
            <option value="euclidean" selected="true">Euclidean</option>
        </param>

        <!-- A sample count below 1 is meaningless, so the form rejects it before the job is submitted. -->
        <param name="samples" type="integer" value="1000" min="1" label="Number of samples for CLARA"/>
    </inputs>

    <!-- Datasets collected from the job working directory once claraguess.R has finished. -->
    <outputs>
        <!-- Cluster assignment table. -->
        <data name="data_cluster"
              format="tabular"
              from_work_dir="data_cluster.tabular"
              label="Cluster assignments (lat, long, cluster)"/>

        <!-- PNG plot declared for both values of "type". -->
        <data name="silhouette_plot"
              format="png"
              from_work_dir="silhouette_plot.png"
              label="Silhouette Index Plot"/>

        <!-- PNG plot kept only when the "type" parameter is "auto". -->
        <data name="sih_scores"
              format="png"
              from_work_dir="sih_scores.png"
              label="Silhouette Plot">
            <filter>type == "auto"</filter>
        </data>

        <!-- Environmental table combined with the clustering result. -->
        <data name="clustered_taxas_env"
              format="tabular"
              from_work_dir="clustered_taxas_env.tabular"
              label="Environment + Clustered Data"/>
    </outputs>

    <!-- Functional tests. Both use the same three input fixtures, k=3, the manhattan metric and 10 samples;
         they differ only in the "type" parameter and therefore in the number of expected outputs. -->
    <tests>
        <!-- Fixed mode: sih_scores is filtered out, so exactly three outputs are expected. -->
        <test expect_num_outputs="3">
            <param name="enviro" value="enviro.tabular"/>
            <param name="preds" value="preds.tabular"/>
            <param name="taxas" value="taxas.tabular"/>
            <param name="type" value="fixed"/>
            <param name="k" value="3"/>
            <param name="metric" value="manhattan"/>
            <param name="samples" value="10"/>

            <!-- The table must have the exact three-column header lat, long, cluster. -->
            <output name="data_cluster">
                <assert_contents>
                    <has_line_matching expression="^lat\tlong\tcluster$"/>
                    <has_n_columns n="3"/>
                </assert_contents>
            </output>

            <!-- PNG output is checked by file size only, within a tolerance of 600 bytes. -->
            <output name="silhouette_plot">
                <assert_contents>
                    <has_size value="8400" delta="600"/>
                </assert_contents>
            </output>

            <!-- The header must start with lat, long, cluster; further columns are allowed. -->
            <output name="clustered_taxas_env">
                <assert_contents>
                    <has_line_matching expression="^lat\tlong\tcluster.*$"/>
                </assert_contents>
            </output>
        </test>

        <!-- Automatic mode: the sih_scores plot is produced as well, so four outputs are expected. -->
        <test expect_num_outputs="4">
            <param name="enviro" value="enviro.tabular"/>
            <param name="preds" value="preds.tabular"/>
            <param name="taxas" value="taxas.tabular"/>
            <param name="type" value="auto"/>
            <param name="k" value="3"/>
            <param name="metric" value="manhattan"/>
            <param name="samples" value="10"/>

            <!-- The table must have the exact three-column header lat, long, cluster. -->
            <output name="data_cluster">
                <assert_contents>
                    <has_line_matching expression="^lat\tlong\tcluster$"/>
                    <has_n_columns n="3"/>
                </assert_contents>
            </output>

            <!-- PNG output is checked by file size only, within a tolerance of 600 bytes. -->
            <output name="silhouette_plot">
                <assert_contents>
                    <has_size value="8400" delta="600"/>
                </assert_contents>
            </output>

            <!-- Only present in automatic mode; also checked by file size only. -->
            <output name="sih_scores">
                <assert_contents>
                    <has_size value="6918" delta="600"/>
                </assert_contents>
            </output>

            <!-- The header must start with lat, long, cluster; further columns are allowed. -->
            <output name="clustered_taxas_env">
                <assert_contents>
                    <has_line_matching expression="^lat\tlong\tcluster.*$"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
==================    
**What it does ?**
==================

This tool applies the CLARA clustering method to identify environmental clusters based on:

- BRT model predictions (a collection of tabular files),
- environmental variables (tabular),
- a list of taxa (tabular, from TaxaSeeker).

The tool enables the determination of the optimal number of clusters for partition-based clustering (if automatic mode is selected), along with generating files used in the subsequent ecoregionalization workflow.

===================         
**How to use it ?**
===================

**Parameters:**

- **k is ...** (clustering type): Choose between a fixed number of clusters ("the number of clusters (fixed)") or an automatic mode using a maximum number ("the maximum number of clusters (automatic)").
- **Value of k**: The number of clusters (used based on the selected mode).
- **Dissimilarity metric**: dissimilarity metric / distance used in clustering (Manhattan, Jaccard, or Euclidean).
- **Number of samples for CLARA**: Number of samples drawn for CLARA clustering.

**Outputs:**

- A tabular file containing cluster assignments for each geographic point (columns: lat, long, cluster).
- A collection of:

  - A silhouette plot (PNG, produced only in automatic mode),
  - A silhouette index plot (PNG),
  - A tabular file with original environmental variables and predicted cluster number.

This tool is useful for ecological modeling and spatial analysis, particularly in marine or terrestrial biogeography contexts.

**Example of the environmental file:**

+------+------+---------+------+--------------+-----+
| long | lat  |  Carbo  | Grav |  Maxbearing  | ... |
+------+------+---------+------+--------------+-----+
|139.22|-65.57|   0.88  |28.59 |     3.67     | ... |
+------+------+---------+------+--------------+-----+
|139.22|-65.57|   0.88  |28.61 |     3.64     | ... |
+------+------+---------+------+--------------+-----+
| ...  | ...  |   ...   | ...  |     ...      | ... |
+------+------+---------+------+--------------+-----+

**Example of the BRT prediction file:**

+-----------+----------+-----------------------+-------------+
|    lat    |   long   |   Prediction.index    |     spe     |
+-----------+----------+-----------------------+-------------+
|  -65.57   |  139.22  |   0.122438487221909   |  Acarnidae  |
+-----------+----------+-----------------------+-------------+
|  -65.57   |  139.32  |   0.119154535627801   |  Acarnidae  |
+-----------+----------+-----------------------+-------------+
|   ...     |   ...    |         ...           |     ...     |
+-----------+----------+-----------------------+-------------+

]]></help>

    <!-- CRAN DOIs for the dplyr, cluster and tidyverse R packages listed in the requirements section. -->
    <citations>
        <citation type="doi">10.32614/CRAN.package.dplyr</citation>
        <citation type="doi">10.32614/CRAN.package.cluster</citation>
        <citation type="doi">10.32614/CRAN.package.tidyverse</citation>
    </citations>
</tool>