Mercurial > repos > recetox > recetox_xmsannotator_advanced

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Aug 12 11:12:59 2021 +0000
@@ -0,0 +1,57 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.9.0</token>
+    <!-- Package dependency of the tool. The version is taken from the TOOL_VERSION token so that
+         the wrapper version and the required package version cannot drift apart. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">r-recetox-xmsannotator</requirement>
+        </requirements>
+    </xml>
+
+    <!-- Dataset inputs: a peak table, a compound database and an adduct database.
+         adduct_table is optional, so templates that use it must guard against it being unset.
+         NOTE(review): csv is accepted by all three params; confirm that every wrapper expanding
+         this macro has a matching loader for csv input. -->
+    <xml name="inputs">
+        <param name="peak_table" type="data" format="csv,h5,parquet">
+            <label>Peak table</label>
+            <help><![CDATA[
+                A peak-intensity table such as the output of apLCMS.
+                The file is required to contain the fields <em>mz</em> and <em>rt</em>.
+            ]]></help>
+        </param>
+        <param name="compound_table" type="data" format="csv,parquet">
+            <label>Compound database</label>
+            <help><![CDATA[
+                Database of compounds according to which the annotation is performed.
+                The database is required to contain the fields <em>compound_id</em>, <em>monoisotopic_mass</em>, and <em>molecular_formula</em>.
+            ]]></help>
+        </param>
+        <param name="adduct_table" type="data" format="csv,parquet" optional="true">
+            <label>Adduct database (optional)</label>
+            <help><![CDATA[
+                Database of adducts which is combined with the compound database to form compound-adduct pairs.
+                The database is required to contain the fields <em>adduct</em>, <em>charge</em>, <em>mass</em>, and <em>n_molecules</em>.
+            ]]></help>
+        </param>
+    </xml>
+
+    <!-- Single output dataset, named annotation_parquet, in parquet format. -->
+    <xml name="outputs">
+        <data format="parquet" name="annotation_parquet"></data>
+    </xml>
+
+    <!-- Mass tolerance parameter. The yield slot lets the expanding tool append further
+         tolerance parameters after it. -->
+    <xml name="tolerance">
+        <param name="mass_tolerance_ppm" type="integer" value="5" min="0"
+               label="Mass tolerance [ppm]"
+               help="Mass tolerance in ppm for database matching."/>
+        <yield/>
+    </xml>
+    <token name="@HELP@">
+        <![CDATA[
+        Annotate the peak intensity table (e.g. from an apLCMS run) with compounds from the compounds database
+        using advanced methods.
+
+        The annotation process generates all possible compound-adduct pairs and matches those pairs to the measured
+        peaks. A compound-adduct pair is considered a match to a certain peak when the difference of their masses is
+        within some tolerance.
+        ]]>
+    </token>
+    <xml name="citations"> <!-- Bare DOI citation entry; this macro does not include an enclosing citations element. -->
+        <citation type="doi">10.1021/acs.analchem.6b01214</citation>
+    </xml>
+</macros>
--- a/recetox_xmsannotator_advanced.xml	Thu Sep 17 08:40:19 2020 +0000
+++ b/recetox_xmsannotator_advanced.xml	Thu Aug 12 11:12:59 2021 +0000
@@ -1,73 +1,141 @@
-<tool id="recetox_xmsannotator_advanced" name="xmsannotator - advanced" version="v2.0">
-    <macros>
-        <import>recetox_xmsannotator_macros.xml</import>
-    </macros>
-
-    <expand macro="requirements"/>
-
-    <command detect_errors="aggressive"><![CDATA[
-        #set expected_adducts = [f'"{$i.adduct}"' for $i in $annotation.expected_adducts]
-        #set expected_adducts = "c(" + ', '.join($annotation.expected_adducts) + ",)"
-
-        Rscript -e 'annotation <- xmsannotator::advanced_annotation(
-                        data = arrow::read_feather("$peaks"),
-                        metabolite = rhdf5::h5read("$metabolites", "metabolites"),
-                        max_mz_diff = as.double($tolerances.max_mz_diff),
-                        max_rt_diff = as.double($tolerances.max_rt_diff),
-                        correlation_method = "$clustering.correlation_method",
-                        correlation_threshold = as.double($clustering.correlation_threshold),
-                        min_cluster_size = as.integer($clustering.min_cluster_size),
-                        deep_split = as.integer($clustering.deep_split),
-                        network_type = "$clustering.network_type",
-                        boost_metabolites = arrow::read_feather("$annotation.boost_metabolites"),
-                        expected_adducts = as.character($expected_adducts),
-                        min_isp = as.integer($annotation.min_isp),
-                        max_isp = as.integer($annotation.max_isp),
-                        strict_boosting = as.logical($annotation.strict_boosting),
-                        redundancy_filtering = as.logical($annotation.redundancy_filtering)
-                   )'
-                -e 'arrow::write_feather(annotation, "$annotation")'
-    ]]></command>
-
-    <inputs>
-        <expand macro="peaks"/>
-        <expand macro="metabolites"/>
-
-        <section name="tolerances" title="Tolerances" expanded="true">
-            <param name="max_mz_diff" type="float" value="10" min="0" label="Mass tolerance [ppm]" help="Mass tolerance in ppm for database matching."/>
-            <param name="max_rt_diff" type="float" value="10" min="0" label="Retention time tolerance [s]" help="Retention time tolerance in seconds for finding peaks derived from the same parent metabolite."/>
-        </section>
-
-        <section name="clustering" title="Clustering">
-            <param name="correlation_method" type="select" display="radio" label="Correlation method">
-                <option value="pearson" selected="true"/>
-                <option value="spearman"/>
-            </param>
-            <param name="correlation_threshold" type="float" value="0.7" label="Correlation threshold" help="Correlation threshold between peaks to qualify as adducts/isotopes of the same metabolite."/>
-            <param name="min_cluster_size" type="integer" value="10" min="1" label="Minimum cluster size" help="The minimum number of nodes to be considered as a cluster."/>
-            <param name="deep_split" type="integer" value="2" min="0" max="4" label="Deep split" help="Deep split provides a rough control over sensitivity to cluster splitting. The higher the value, the more and smaller clusters will be produced (see WGCNA package documentation)."/>
-            <param name="network_type" type="select" display="radio" label="Network type" help="Network type parameter affects how the network's adjacency matrix is created from the correlation matrix (see WGCNA package documentation).">
-                <option value="signed"/>
-                <option value="unsigned" selected="true"/>
-            </param>
-        </section>
-
-        <section name="annotation" title="Annotation" expanded="true">
-            <param name="boost_metabolites" type="data" format="csv" optional="true" label="Validated metabolites score boosting (optional)" help="Table of previously validated metabolites to boost their confidence scores. The 1st column of the table must contain IDs of metabolites. The optional 2nd and 3rd columns may contain mz values and retention times."/>
-            <repeat name="expected_adducts" title="Expected adducts" help="Require the presence of certain adducts for a high confidence match.">
-                <param name="adduct" type="text" value="M+H" label="Adduct"/>
-            </repeat>
-            <param name="min_isp" type="integer" min="0" value="1" label="Minimum number of expected isotopes" help="Minimum number of adducts/isotopes to be present for a match to be considered as a high confidence match."/>
-            <param name="max_isp" type="integer" min="0" value="5" label="Maximum number of expected isotopes" help="Maximum number of adducts/isotopes to be present for a match to be considered as a high confidence match."/>
-            <param name="strict_boosting" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Strict boosting" help="Boost the scores of metabolites that not only belongs to the same pathway but also to the same cluster. Otherwise, do not account for cluster membership."/>
-            <param name="redundancy_filtering" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Redundancy filtering" help="Whether to perform final redundancy filtering or not."/>
-        </section>
-    </inputs>
-
-    <outputs>
-        <expand macro="annotation_output"/>
-    </outputs>
-
-    <help><![CDATA[]]></help>
-    <expand macro="citations"/>
-</tool>
\ No newline at end of file
+<tool id="recetox_xmsannotator_advanced" name="RECETOX xMSannotator advanced" version="@TOOL_VERSION@+galaxy0">
+    <description>annotate peak intensity table including scores and confidence levels</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+
+    <!-- Runs the generated R wrapper script. n_workers is defined on the Rscript command line from
+         GALAXY_SLOTS (falling back to 1) and is read by the wrapper when it calls advanced_annotation. -->
+    <command detect_errors="aggressive"><![CDATA[
+        Rscript -e "n_workers <- \${GALAXY_SLOTS:-1}" -e "source('${wrapper}')"
+    ]]></command>
+
+    <configfiles>
+        <configfile name="wrapper"><![CDATA[
+            library(xmsannotator)
+
+            annotation <- advanced_annotation(
+            ## NOTE(review): the peak_table param also accepts csv, for which no loader branch exists
+            ## here, so a csv input leaves peak_table unset. Confirm whether a csv loader is available.
+            #if $peak_table.is_of_type("h5")
+                peak_table = load_peak_table_hdf("${peak_table}"),
+            #elif $peak_table.is_of_type("parquet")
+                peak_table = load_peak_table_parquet("${peak_table}"),
+            #end if
+            ## adduct_table is an optional input: pass it only when a dataset was selected, exactly
+            ## like the optional expected_adducts and boost_compounds below. Presumably the R
+            ## function then falls back to its own default adduct table (TODO confirm).
+            #if $adduct_table
+                adduct_table = load_adduct_table_parquet("${adduct_table}"),
+            #end if
+                compound_table = load_compound_table_parquet("${compound_table}"),
+            ## The tool parameter is given in ppm; it is scaled by 1e-6 before being passed on.
+                mass_tolerance = 1e-6 * ${mass_tolerance_ppm},
+                time_tolerance = $time_tolerance,
+                correlation_threshold = as.double($clustering.correlation_threshold),
+                min_cluster_size = as.integer($clustering.min_cluster_size),
+                deep_split = as.integer($clustering.deep_split),
+                network_type = "$clustering.network_type",
+            #if $scoring.expected_adducts
+                expected_adducts = load_expected_adducts_csv("${scoring.expected_adducts}"),
+            #end if
+            #if $scoring.boost_compounds
+                boost_compounds = load_boost_compounds_csv("${scoring.boost_compounds}"),
+            #end if
+                redundancy_filtering = $scoring.redundancy_filtering,
+                n_workers = n_workers
+            )
+
+            save_parquet(data = annotation, file = "${annotation_parquet}")
+        ]]></configfile>
+    </configfiles>
+
+    <inputs>
+        <expand macro="inputs"/>
+        <!-- The mass tolerance comes from the macro; the retention time tolerance is appended
+             through the macro's yield slot. -->
+        <expand macro="tolerance">
+            <param name="time_tolerance" type="float" min="0" value="10"
+                   label="Retention time tolerance [s]"
+                   help="Retention time tolerance in seconds for finding peaks derived from the same parent metabolite."/>
+        </expand>
+        <!-- Parameters of the peak correlation clustering step.
+             NOTE(review): correlation_method is not referenced by the wrapper configfile of this
+             tool; confirm whether it should be passed to advanced_annotation. -->
+        <section name="clustering" title="Clustering">
+            <param name="correlation_method" type="select" display="radio" label="Correlation method">
+                <option value="pearson" selected="true"/>
+                <option value="spearman"/>
+            </param>
+            <param name="correlation_threshold" type="float" value="0.7"
+                   label="Correlation threshold"
+                   help="Correlation threshold between peaks to qualify as adducts/isotopes of the same metabolite."/>
+            <param name="min_cluster_size" type="integer" min="1" value="10"
+                   label="Minimum cluster size"
+                   help="The minimum number of nodes to be considered as a cluster."/>
+            <param name="deep_split" type="integer" min="0" max="4" value="2" label="Deep split">
+                <help>
+                    Deep split provides a rough control over sensitivity to cluster splitting. The higher the value,
+                    the more and smaller clusters will be produced (see WGCNA package documentation).
+                </help>
+            </param>
+            <param name="network_type" type="select" display="radio" label="Network type">
+                <help>
+                    Network type parameter affects how the network's adjacency matrix is created from the correlation
+                    matrix (see WGCNA package documentation).
+                </help>
+                <option value="signed"/>
+                <option value="unsigned" selected="true"/>
+            </param>
+        </section>
+        <!-- Parameters of the scoring step. expected_adducts and boost_compounds are optional datasets.
+             NOTE(review): strict_boosting, min_isp and max_isp are not referenced by the wrapper
+             configfile of this tool; confirm whether they should be passed to advanced_annotation. -->
+        <section name="scoring" title="Scoring" expanded="true">
+            <param name="strict_boosting" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE">
+                <label>Strict boosting</label>
+                <help>
+                    Boost the scores of metabolites that belong not only to the same pathway but also to the same
+                    cluster. Otherwise, do not account for cluster membership.
+                </help>
+            </param>
+            <param name="expected_adducts" type="data" format="csv" optional="true">
+                <label>Expected adducts (optional)</label>
+                <help>
+                    Require the presence of certain adducts for a high confidence match. By default, at least the
+                    presence of an M+H adduct is required for a high confidence match.
+                </help>
+            </param>
+            <param name="boost_compounds" type="data" format="csv" optional="true">
+                <label>Validated compounds score boosting (optional)</label>
+                <help>
+                    Table of previously validated compounds to boost their scores and confidence levels.
+                    The 1st column of the table must contain IDs of compounds.
+                    The optional 2nd and 3rd columns may contain mz values and retention times.
+                </help>
+            </param>
+            <param name="min_isp" type="integer" min="0" value="1">
+                <label>Minimum number of expected isotopes</label>
+                <help>
+                    Minimum number of adducts/isotopes to be present for a match to be considered as a high confidence match.
+                </help>
+            </param>
+            <param name="max_isp" type="integer" min="0" value="5">
+                <label>Maximum number of expected isotopes</label>
+                <help>
+                    Maximum number of adducts/isotopes to be present for a match to be considered as a high confidence match.
+                </help>
+            </param>
+            <param name="redundancy_filtering" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE">
+                <label>Redundancy filtering</label>
+                <help>Whether to filter out low-scored multiple matches or not.</help>
+            </param>
+        </section>
+    </inputs>
+
+    <outputs> <!-- Output datasets are defined by the "outputs" macro from the imported macros.xml. -->
+        <expand macro="outputs"/>
+    </outputs>
+
+    <help>
+        <![CDATA[
+        @HELP@
+        Then, a score and a confidence level are assigned to each match based on peak correlation
+        clustering, metabolite pathway associations, adduct expectations, and isotope conformations.
+        ]]>
+    </help>
+
+    <citations> <!-- The enclosing element is written here; the "citations" macro is expanded inside it. -->
+        <expand macro="citations"/>
+    </citations>
+</tool>
--- a/recetox_xmsannotator_macros.xml	Thu Sep 17 08:40:19 2020 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-<macros>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package">recetox_datatypes</requirement>
-            <container type="docker">registry.gitlab.ics.muni.cz:443/recetox/mass-spectrometry/xmsannotator:v2.0</container>
-        </requirements>
-    </xml>
-
-    <xml name="peaks">
-        <param name="peaks" type="data" format="peak_table.feather" label="Peak intensity table"/>
-    </xml>
-
-    <xml name="metabolites">
-        <!-- <param name="metabolites" type="select" label="Metabolite database">
-            <options from_data_table="recetox_metabolite_databases">
-                <column name="name" index="2"/>
-                <column name="value" index="3"/>
-            </options>
-        </param> -->
-        <param name="metabolites" type="data" format="metabolites.h5" label="Metabolite database"/>
-    </xml>
-
-    <xml name="mz_tolerance">
-        <param name="mz_tolerance" type="float" value="10" min="0" label="Mass tolerance [ppm]" help="Mass tolerance in ppm for database matching."/>
-    </xml>
-
-    <xml name="annotation_output">
-        <data format="annotated_peak_list.feather" name="annotation"/>
-    </xml>
-
-    <xml name="citations">
-        <citations>
-            <citation type="doi">10.1021/acs.analchem.6b01214</citation>
-        </citations>
-    </xml>
-</macros>
\ No newline at end of file
--- a/tool-data/recetox_metabolite_databases.loc.sample	Thu Sep 17 08:40:19 2020 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,5 +0,0 @@
-# This file has the format (white space characters are
-# TAB characters):
-#
-# <value>	<name>	<date>	<path>
-#
\ No newline at end of file
--- a/tool_data_table_conf.xml.sample	Thu Sep 17 08:40:19 2020 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<tables>
-    <table name="recetox_metabolite_databases" comment_char="#">
-        <columns>value, date, name, path</columns>
-        <file path="tool-data/recetox_metabolite_databases.loc" />
-    </table>
-</tables>
\ No newline at end of file