Mercurial > repos > recetox > recetox_xmsannotator_advanced

--- a/macros.xml	Fri Jan 28 16:27:30 2022 +0000
+++ b/macros.xml	Mon Jun 26 13:55:44 2023 +0000
@@ -1,74 +1,144 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.9.0</token>
-    <xml name="requirements">
-        <requirements>
-            <requirement type="package" version="0.9.0">r-recetox-xmsannotator</requirement>
-        </requirements>
-    </xml>
-    <xml name="creator">
-        <creator>
-            <person
-                givenName="Jiří"
-                familyName="Novotný"
-                url="https://github.com/xtracko"
-                identifier="0000-0001-5449-3523" />
-            <person
-                givenName="Martin"
-                familyName="Čech"
-                url="https://github.com/martenson"
-                identifier="0000-0002-9318-1781" />
-            <organization
-                url="https://www.recetox.muni.cz/"
-                email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
-                name="RECETOX MUNI" />
-        </creator>
-    </xml>
-    <xml name="inputs">
-        <param name="peak_table" type="data" format="csv,h5,parquet">
-            <label>Peak table</label>
-            <help><![CDATA[
-                A peak-intensity table such as outputted from apLCMS.
-                The file is required to contain the fields <em>mz</em> and <em>rt</em>.
-            ]]></help>
-        </param>
-        <param name="compound_table" type="data" format="csv,parquet">
-            <label>Compound database</label>
-            <help><![CDATA[
-                Database of compounds according to which the annotation is performed.
-                The database is required to contain the fields <em>compound_id</em>, <em>monoisotopic_mass</em>, and <em>molecular_formula</em>.
-            ]]></help>
-        </param>
-        <param name="adduct_table" type="data" format="csv,parquet" optional="true">
-            <label>Adduct database (optional)</label>
-            <help><![CDATA[
-                Database of adduct which is combined with the database of compound to form a molecule-adduct pairs.
-                The database is required to contain <em>adduct</em>, <em>charge</em>, <em>mass</em>, and <em>n_molecules</em>.
-            ]]></help>
-        </param>
-    </xml>
+<token name="@TOOL_VERSION@">0.10.0</token>
+
+<xml name="requirements">
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">r-recetox-xmsannotator</requirement>
+    </requirements>
+</xml>
+
+<xml name="creator">
+    <creator>
+        <person
+            givenName="Jiří"
+            familyName="Novotný"
+            url="https://github.com/xtracko"
+            identifier="0000-0001-5449-3523" />
+        <person
+            givenName="Martin"
+            familyName="Čech"
+            url="https://github.com/martenson"
+            identifier="0000-0002-9318-1781" />
+        <person
+            givenName="Matej"
+            familyName="Troják"
+            url="https://github.com/xtrojak"
+            identifier="0000-0003-0841-2707" />
+        <organization
+            url="https://www.recetox.muni.cz/"
+            email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"
+            name="RECETOX MUNI" />
+    </creator>
+</xml>
+
+<xml name="inputs">
+    <param name="metadata_table" type="data" format="parquet,csv">
+        <label>Metadata table</label>
+        <help><![CDATA[
+            Peak metadata table*.
+        ]]></help>
+    </param>
+    <param name="intensity_table" type="data" format="parquet,csv">
+        <label>Intensity table</label>
+        <help><![CDATA[
+            Table with intensities** for features (rows) across samples (columns).
+        ]]></help>
+    </param>
+    <param name="compound_table" type="data" format="parquet,csv">
+        <label>Compound database</label>
+        <help><![CDATA[
+            Database of compounds according to which the annotation is performed.
+            The database is required to contain the fields <em>compound_id</em>, <em>monoisotopic_mass</em>, and <em>molecular_formula</em>.
+        ]]></help>
+    </param>
+    <param name="adduct_table" type="data" format="parquet,csv" optional="true">
+        <label>Adduct database</label>
+        <help><![CDATA[
+            Database of adducts which is combined with the compound database to form molecule-adduct pairs.
+            The database is required to contain the fields <em>adduct</em>, <em>charge</em>, <em>mass</em>, and <em>n_molecules</em>.
+        ]]></help>
+    </param>
+    <param name="adduct_weights" type="data" format="parquet,csv" optional="true">
+        <label>Adduct weights</label>
+        <help>
+            A weight-by-adduct table.
+        </help>
+    </param>
+</xml>

-    <xml name="outputs">
-        <data name="annotation_parquet" format="parquet"/>
-    </xml>
+<xml name="outputs">
+    <data name="output_file" format="parquet">
+        <change_format>
+            <when input="metadata_table.ext" value="csv" format="csv" />
+        </change_format>
+    </data>
+</xml>
+
+<xml name="tolerance">
+    <param name="mass_tolerance_ppm" type="integer" min="0" value="5">
+        <label>Mass tolerance [ppm]</label>
+        <help>Mass tolerance in ppm for database matching.</help>
+    </param>
+    <yield/>
+</xml>
+
+<token name="@HELP@">
+Description
+===========
+
+Annotate the peak intensity table (e.g. from an apLCMS run) with compounds from the compounds database
+using advanced methods.
+
+The annotation process generates all possible compound-adduct pairs and matches those pairs to the measured
+peaks. A compound-adduct pair is declared a match to a certain peak when the difference of their masses is
+within some tolerance.
+
+Then, a score and a confidence level are assigned to each match based on peak correlation
+clustering, metabolite pathway associations, adduct expectations, and isotope conformations.
+
+Input tables description
+------------------------
+
+(*) Metadata table
+~~~~~~~~~~~~~~~~~~

-    <xml name="tolerance">
-        <param name="mass_tolerance_ppm" type="integer" min="0" value="5">
-            <label>Mass tolerance [ppm]</label>
-            <help>Mass tolerance in ppm for database matching.</help>
-        </param>
-        <yield/>
-    </xml>
-    <token name="@HELP@">
-        <![CDATA[
-        Annotate the peak intensity table (e.g. from an apLCMS run) with compounds from the compounds database
-        using advanced methods.
+The output from the recetox-aplcms tool.
+The `npeaks` column denotes the number of peaks which have been grouped into this feature.
+The columns with the sample names indicate whether this feature is present in the sample.
+Only the id, mz, and rt columns are required to be present.
+
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  id   | mz           |  mzmin       |  mzmax        |  rt            |  rtmin        |  rtmax        |   npeaks  |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |
++=======+==============+==============+===============+================+===============+===============+===========+========================+========================+========================+
+|  1    | 70.03707021  |  70.037066   |  70.0370750   |  294.1038014   |  294.0634942  |  294.149985   |   3       |  1                     |  1                     |  1                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  2    | 70.06505677  |  70.065045   |  70.0650676   |  141.9560055   |  140.5762528  |  143.335758   |   2       |  1                     |  0                     |  1                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  57   | 78.04643252  |  78.046429   |  78.0464325   |  294.0063397   |  293.9406777  |  294.072001   |   2       |  1                     |  1                     |  0                     |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+|  ...  | ...          |   ...        |  ...          |  ...           |  ...          |  ...          |   ...     |  ...                   |  ...                   |  ...                   |
++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+
+
+(**) Intensity table
+~~~~~~~~~~~~~~~~~~~~

-        The annotation process generates all possible compound-adduct pairs and matches those pairs to the measured
-        peaks. A compound-adduct pair is pronounced as a match to a certain peak when the difference of their masses are
-        withing some tolerance.
-        ]]>
-    </token>
-    <xml name="citations">
-        <citation type="doi">10.1021/acs.analchem.6b01214</citation>
-    </xml>
+The output from the recetox-aplcms tool.
+This table contains the peak area for aligned features in all samples.
+
++-------+------------------------+------------------------+------------------------+
+|  id   |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |
++=======+========================+========================+========================+
+|  1    |  13187487.20482895     |  7957395.699119729     |  11700594.397257797    |
++-------+------------------------+------------------------+------------------------+
+|  2    |  2075168.6398983458    |  0                     |  2574362.159289044     |
++-------+------------------------+------------------------+------------------------+
+|  57   |  2934524.4406785755    |  1333044.5065971944    |  0                     |
++-------+------------------------+------------------------+------------------------+
+|  ...  |  ...                   |  ...                   |  ...                   |
++-------+------------------------+------------------------+------------------------+
+</token>
+
+<xml name="citations">
+    <citation type="doi">10.1021/acs.analchem.6b01214</citation>
+</xml>
 </macros>
--- a/recetox_xmsannotator_advanced.xml	Fri Jan 28 16:27:30 2022 +0000
+++ b/recetox_xmsannotator_advanced.xml	Mon Jun 26 13:55:44 2023 +0000
@@ -1,43 +1,49 @@
-<tool id="recetox_xmsannotator_advanced" name="RECETOX xMSannotator advanced" version="@TOOL_VERSION@+galaxy0">
+<tool id="recetox_xmsannotator_advanced" name="recetox-xMSannotator" version="@TOOL_VERSION@+galaxy0">
+
     <description>annotate peak intensity table including scores and confidence levels</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="creator"/>
+    <xrefs>
+        <xref type="bio.tools">recetox-xmsannotator</xref>
+    </xrefs>
     <expand macro="requirements" />
     <command detect_errors="aggressive"><![CDATA[
-        Rscript -e "n_workers <- \${GALAXY_SLOTS:-1}" -e "source('${wrapper}')"
+        Rscript -e 'source("${__tool_directory__}/utils.R")' -e "n_workers <- \${GALAXY_SLOTS:-1}" -e "source('${wrapper}')"
     ]]></command>

     <configfiles>
         <configfile name="wrapper"><![CDATA[
-            library(xmsannotator)
+            metadata_table <- load_table("$metadata_table", "$metadata_table.ext")
+            intensity_table <- load_table("$intensity_table", "$intensity_table.ext")
+            peak_table <- create_peak_table(metadata_table, intensity_table)
+
+            filter_by <- create_filter_by_adducts("$filter_by")

             annotation <- advanced_annotation(
-            #if $peak_table.is_of_type("h5")
-                peak_table = load_peak_table_hdf("${peak_table}"),
-            #elif $peak_table.is_of_type("parquet")
-                peak_table = load_peak_table_parquet("${peak_table}"),
-            #end if
-                adduct_table = load_adduct_table_parquet("${adduct_table}"),
-                compound_table = load_compound_table_parquet("${compound_table}"),
+                peak_table = peak_table,
+                adduct_table = load_table("$adduct_table", "$adduct_table.ext"),
+                adduct_weights = load_table("$adduct_weights", "$adduct_weights.ext"),
+                compound_table = load_table("$compound_table", "$compound_table.ext"),
                 mass_tolerance = 1e-6 * ${mass_tolerance_ppm},
                 time_tolerance = $time_tolerance,
                 correlation_threshold = as.double($clustering.correlation_threshold),
                 min_cluster_size = as.integer($clustering.min_cluster_size),
                 deep_split = as.integer($clustering.deep_split),
                 network_type = "$clustering.network_type",
-            #if $scoring.expected_adducts
-                expected_adducts = load_expected_adducts_csv("${scoring.expected_adducts}"),
-            #end if
-            #if $scoring.boost_compounds
-                boost_compounds = load_boost_compounds_csv("${scoring.boost_compounds}"),
-            #end if
                 redundancy_filtering = $scoring.redundancy_filtering,
-                n_workers = n_workers
+                n_workers = n_workers,
+                intensity_deviation_tolerance = as.double($intensity_deviation_tolerance),
+                mass_defect_tolerance = as.double($mass_defect_tolerance),
+                mass_defect_precision = as.double($mass_defect_precision),
+                peak_rt_width = as.integer($peak_rt_width),
+                maximum_isotopes = as.integer($maximum_isotopes),
+                min_ions_per_chemical = as.integer($min_ions_per_chemical),
+                filter_by = filter_by
             )

-            save_parquet(data = annotation, file = "${annotation_parquet}")
+            save_table(annotation, "$output_file", "$output_file.ext")
         ]]></configfile>
     </configfiles>

@@ -47,15 +53,11 @@
             <param name="time_tolerance" type="float" value="10" min="0">
                 <label>Retention time tolerance [s]</label>
                 <help>
-                    Retention time tolerance in seconds for finding peaks derived from the same parent metabolite.
+                    Retention time tolerance in seconds for finding peaks derived from the same parent compound.
                 </help>
             </param>
         </expand>
         <section name="clustering" title="Clustering">
-            <param name="correlation_method" type="select" display="radio" label="Correlation method">
-                <option value="pearson" selected="true">pearson</option>
-                <option value="spearman">spearman</option>
-            </param>
             <param name="correlation_threshold" type="float" value="0.7">
                 <label>Correlation threshold</label>
                 <help>Correlation threshold between peaks to qualify as adducts/isotopes of the same metabolite.</help>
@@ -77,8 +79,8 @@
                     Network type parameter affects how the network's adjacency matrix is created from the correlation
                     matrix (see WGCNA package documentation).
                 </help>
-                <option value="signed">signed</option>
-                <option value="unsigned" selected="true">unsigned</option>
+                <option value="signed">Signed</option>
+                <option value="unsigned" selected="true">Unsigned</option>
             </param>
         </section>
         <section name="scoring" title="Scoring" expanded="true">
@@ -89,21 +91,6 @@
                     cluster. Otherwise, do not account for cluster membership.
                 </help>
             </param>
-            <param name="expected_adducts" type="data" format="csv" optional="true">
-                <label>Expected adducts</label>
-                <help>
-                    Require the presence of certain adducts for a high confidence match. By default, at least the
-                    presence of an M+H adduct is required for a high confidence match.
-                </help>
-            </param>
-            <param name="boost_compounds" type="data" format="csv" optional="true">
-                <label>Validated compounds score boosting (optional)</label>
-                <help>
-                    Table of previously validated compounds to boost their scores and confidence levels.
-                    The 1st column of the table must contain IDs of compounds.
-                    The optional 2nd and 3rd columns may contain mz values and retention times.
-                </help>
-            </param>
             <param name="min_isp" type="integer" min="0" value="1">
                 <label>Minimum number of expected isotopes</label>
                 <help>
@@ -121,17 +108,58 @@
                 <help>Whether to filter out low-scored multiple matcher or not.</help>
             </param>
         </section>
+        <param name="intensity_deviation_tolerance" type="float" value="0.1">
+            <label>Tolerance of intensity deviation</label>
+            <help>A numeric threshold by which an intensity ratio of two isotopic peaks may differ from their actual abundance ratio.</help>
+        </param>
+        <param name="mass_defect_tolerance" type="float" value="0.1">
+            <label>Tolerance of mass defect</label>
+            <help>Maximum difference in mass defect between two peaks of the same compound.</help>
+        </param>
+        <param name="mass_defect_precision" type="float" value="0.01">
+            <label>Precision for computing mass defect</label>
+        </param>
+        <param name="peak_rt_width" type="integer" value="1">
+            <label>Estimated chromatographic peak width</label>
+        </param>
+        <param name="maximum_isotopes" type="integer" value="10">
+            <label>Maximum isotopes</label>
+        </param>
+        <param name="min_ions_per_chemical" type="integer" value="2">
+            <label>Minimum ions per chemical</label>
+        </param>
+        <param name="filter_by" type="select" label="Adducts to filter by" multiple="true" optional="true">
+            <option value="M-H" selected="true">M-H</option>
+            <option value="M+H" selected="true">M+H</option>
+            <option value="2M-H">2M-H</option>
+            <option value="M-2H">M-2H</option>
+        </param>
     </inputs>

     <outputs>
         <expand macro="outputs"/>
     </outputs>

+    <tests>
+        <test>
+            <param name="metadata_table" value="metadata_table.parquet" ftype="parquet" />
+            <param name="intensity_table" value="intensity_table.parquet" ftype="parquet" />
+            <param name="compound_table" value="database.parquet" ftype="parquet" />
+            <param name="adduct_table" value="adduct_table.parquet" ftype="parquet" />
+            <output name="output_file" file="expected_output.parquet" ftype="parquet"/>
+        </test>
+        <test>
+            <param name="metadata_table" value="metadata_table.csv" ftype="csv" />
+            <param name="intensity_table" value="intensity_table.csv" ftype="csv" />
+            <param name="compound_table" value="database.csv" ftype="csv" />
+            <param name="adduct_table" value="adduct_table.csv" ftype="csv" />
+            <output name="output_file" file="expected_output.csv" ftype="csv"/>
+        </test>
+    </tests>
+
     <help>
         <![CDATA[
-        @HELP@
-        Then, a score and a confidence level is assigned to each match based on peak correlation
-        clustering, metabolite pathway associations, adducts expectations, and isotope conformations.
+            @HELP@
         ]]>
     </help>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.R	Mon Jun 26 13:55:44 2023 +0000
@@ -0,0 +1,37 @@
+library(recetox.xmsannotator)
+library(dplyr)
+
+load_table <- function(filename, filetype) {
+    if (filename == "None") {
+        return(NULL)
+    }
+    if (filetype == "csv") {
+        return(as.data.frame(read.csv(filename)))
+    } else {
+        return(as.data.frame(arrow::read_parquet(filename)))
+    }
+}
+
+save_table <- function(table, filename, filetype) {
+    if (filetype == "csv") {
+        write.csv(table, filename, row.names = FALSE)
+    } else {
+        arrow::write_parquet(table, filename)
+    }
+}
+
+create_filter_by_adducts <- function(comma_separated_values) {
+    if (comma_separated_values == "None") {
+        return(NA)
+    }
+    filter_by <- strsplit(trimws(comma_separated_values), ",")[[1]]
+    return(filter_by)
+}
+
+create_peak_table <- function(metadata_table, intensity_table) {
+    metadata_table <- select(metadata_table, id, mz, rt)
+    peak_table <- inner_join(metadata_table, intensity_table, by = "id")
+    peak_table <- rename(peak_table, peak = id)
+    peak_table$peak <- as.integer(peak_table$peak)
+    return(peak_table)
+}