Mercurial > repos > recetox > ramclustr
changeset 3:211cd88b5148 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit 9bc547872c98a9c13c561d15e8990fe82bdc0e72"
author | recetox |
---|---|
date | Fri, 28 Jan 2022 16:25:33 +0000 |
parents | eac0e6feb850 |
children | 69e0da4703b5 |
files | macros.xml ramclustr.xml ramclustr_macros.xml |
diffstat | 3 files changed, 313 insertions(+), 184 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Fri Jan 28 16:25:33 2022 +0000 @@ -0,0 +1,305 @@ +<macros> + <token name="@TOOL_VERSION@">1.2.2</token> + + <xml name="creator"> + <creator> + <person + givenName="Helge" + familyName="Hecht" + url="https://github.com/hechth" + identifier="0000-0001-6744-996X" /> + <person + givenName="Maksym" + familyName="Skoryk" + url="https://github.com/maximskorik" + identifier="0000-0003-2056-8018" /> + <person + givenName="Matej" + familyName="Troják" + url="https://github.com/xtrojak" + identifier="0000-0003-0841-2707" /> + <person + givenName="Martin" + familyName="Čech" + url="https://github.com/martenson" + identifier="0000-0002-9318-1781" /> + <organization + url="https://www.recetox.muni.cz/" + email="GalaxyToolsDevelopmentandDeployment@space.muni.cz" + name="RECETOX MUNI"/> + </creator> + </xml> + + <xml name="parameters_csv"> + <section name="ms_csv" title="Input MS Data as CSV" expanded="true"> + <param label="ms" name="ms" type="data" format="csv" + help="Features as columns, rows as samples. Column header mz_rt"/> + <param label="idmsms" name="idmsms" type="data" format="csv" optional="true" + help="Optional idMSMS / MSe csv data. same dim and names as ms required"/> + <param label="sample_name_column" name="sample_name_column" type="integer" value="1" + help="Which column from the csv file contains sample names?"/> + <param label="feature_delimiter" name="feature_delimiter" type="text" value="_" + help="Only required if ms input is set! How feature mz and rt are delimited in csv import column header e.g. ='-'"/> + <param label="retention_time_column" name="retention_time_column" type="integer" value="2" + help="Which position in delimited column header represents the retention time (csv only)"/> + <param label="st" name="st" type="float" value="1" help="Sigma t - time similarity decay value. + A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> + </section> + </xml> + + <xml name="parameters_excluded"> + <param label="MStag" name="MStag" type="text" optional="true" + help="Character string in 'taglocation' to designat MS / MSe files e.g. '01.cdf'"/> + <param label="idMSMStag" name="idMSMStag" type="text" optional="true" + help="Character string in 'taglocation' to designat idMSMS / MSe files e.g. '02.cdf'"/> + <param label="taglocation" name="taglocation" type="text" value="filepaths" + help="'filepaths' by default, 'phenoData[,1]' is another option. refers to xcms slot"/> + </xml> + + <xml name="parameters_required"> + <section name="required" title="Required Parameters" expanded="true"> + <param label="sr" name="sr" type="float" value="0.5" help="Sigma r - correlational similarity decay value"/> + <param label="deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" + help="Controls how agressively the HCA tree is cut - see ?cutreeDynamicTree"/> + <param label="blocksize" name="blocksize" type="integer" value="2000" + help="Number of features (scans?) processed in one block =1000,"/> + <param label="mult" name="mult" type="integer" value="5" + help="Internal value, can be used to influence processing speed/ram usage"/> + <param label="hmax" name="hmax" type="float" value="0.3" + help="Precut the tree at this height, default 0.3 - see ?cutreeDynamicTree"/> + <param label="collapse" name="collapse" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" + help="Reduce feature intensities to spectrum intensities?"/> + <param label="usePheno" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" + help="Transfer phenotype data from XCMS object to SpecAbund dataset?"/> + <!-- + Currently not forwarded because the MSP is exported always manually afterwards + <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" /> + --> + <param label="normalize" name="normalize" type="select" display="radio" + help="Either 'none', 'TIC', 'quantile', or 'batch.qc' normalization of feature intensities. see batch.qc overview in details. "> + <option value="none" selected="true">none</option> + <option value="TIC">TIC</option> + <option value="quantile">quantile</option> + <option value="batch.qc">batch.qc</option> + </param> + <param label="qc_inj_range" name="qc_inj_range" type="integer" value="20" + help="How many injections around each injection are to be scanned for presence of QC samples when + using batch.qc normalization? A good rule of thumb is between 1 and 3 times the typical + injection span between QC injections. i.e. if you inject QC ever 7 samples, set this to + between 7 and 21. Smaller values provide more local precision but make normalization sensitive + to individual poor outliers (though these are first removed using the boxplot function outlier + detection), while wider values provide less local precision in normalization but better + stability to individual peak areas."/> + + <param label="minModuleSize" name="minModuleSize" type="integer" value="2" + help="How many features must be part of a cluster to be returned? default = 2"/> + <param label="linkage" name="linkage" type="select" display="radio" value="average" + help="Hierarchical clustering linkage method - see ?hclust"> + <option value="average" selected="true">average</option> + <option value="ward.D">ward.D</option> + <option value="ward.D2">ward.D2</option> + <option value="single">single</option> + <option value="complete">complete</option> + <option value="mcquitty">mcquitty</option> + <option value="median">median</option> + <option value="centroid">centroid</option> + </param> + + <param label="mzdec" name="mzdec" type="integer" value="3" + help="Number of decimal places used in printing m/z values"/> + <param label="cor_method" name="cor_method" type="select" display="radio" value="pearson" + help="Which correlational method used to calculate 'r' - see ?cor"> + <option value="pearson" selected="true">pearson</option> + <option value="everything">everything</option> + <option value="spearman">spearman</option> + <option value="kendall">kendall</option> + </param> + + <param label="rt_only_low_n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" + checked="true" + help="At low injection numbers, correlational relationships of peak intensities may be unreliable. + By defualt ramclustR will simply ignore the correlational r value and cluster on retention time alone. + If you wish to use correlation with at n less than 5, set this value to FALSE."/> + <param label="replace_zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" + checked="true" + help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from + peak peaking. When TRUE, zero values will be replaced with a small amount of noise, with noise level + set based on the detected signal intensities for that feature. "/> + <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" + checked="true" help="Whether to merge all msp in one file or export one msp per spectra"/> + </section> + </xml> + + <xml name="parameters_optional_xcms"> + <section name="optional" title="Optional Parameters" expanded="false"> + <param label="st" name="st" type="float" optional="true" help="Sigma t - time similarity decay value. + A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> + <param label="fftempdir" name="fftempdir" type="text" optional="true" + help="Valid path: if there are file size limitations on the default ff pacakge temp directory - + getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/> + <param label="maxt" name="maxt" type="integer" optional="true" + help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/> + </section> + </xml> + + <xml name="parameters_optional_csv"> + <section name="optional" title="Optional Parameters" expanded="false"> + <param label="fftempdir" name="fftempdir" type="text" optional="true" + help="Valid path: if there are file size limitations on the default ff pacakge temp directory - + getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/> + <param label="maxt" name="maxt" type="integer" optional="true" + help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/> + </section> + </xml> + + <xml name="parameters_optional_metadata"> + <section name="metadata" title="Optional Metadata" expanded="false"> + <param label="metadata" name="batch_order_qc" type="data" format="csv" optional="true" + help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch + number ('batch'), position in sequence ('order') and whether it is a qc sample or not + ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/> + <!-- <param label="ExpDes" name="ExpDes" type="data" format="RData" optional="true" help=" an R object created by R ExpDes object: data used for record keeping and labelling msp spectral output" /> --> + </section> + </xml> + + <xml name="parameters_define_experiment"> + <section name="define_experiment" title="Define Experiment" expanded="false"> + <param label="Experiment" name="experiment" type="text" help="Experiment name, no spaces"/> + <param label="Species" name="species" type="text" help="Species name"/> + <param label="Sample" name="sample" type="text" help="Sample type"/> + <param label="Contributor" name="contributor" type="text" + help="Individual and/or organizational affiliation"/> + <param label="Platform" name="platform" type="select" display="radio" help="Platform"> + <option value="GC-MS" selected="true">GC-MS</option> + <option value="LC-MS">LC-MS</option> + </param> + </section> + </xml> + + <xml name="output_msp"> + <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra" type="list"> + <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/> + </collection> + </xml> + + <xml name="citations"> + <citations> + <!-- Example of annotating a citation using a BibTex entry. --> + <citation type="bibtex"> + @article{Broeckling2014e, + abstract = {Metabolomic data are frequently acquired using chromatographically coupled mass spectrometry + (MS) platforms. For such datasets, the first step in data analysis relies on feature detection, where a + feature is defined by a mass and retention time. While a feature typically is derived from a single + compound, a spectrum of mass signals is more a more-accurate representation of the mass spectrometric + signal for a given metabolite. Here, we report a novel feature grouping method that operates in an + unsupervised manner to group signals from MS data into spectra without relying on predictability of the + in-source phenomenon. We additionally address a fundamental bottleneck in metabolomics, annotation of MS + level signals, by incorporating indiscriminant MS/MS (idMS/MS) data implicitly: feature detection is + performed on both MS and idMS/MS data, and feature-feature relationships are determined simultaneously + from the MS and idMS/MS data. This approach facilitates identification of metabolites using in-source MS + and/or idMS/MS spectra from a single experiment, reduces quantitative analytical variation compared to + single-feature measures, and decreases false positive annotations of unpredictable phenomenon as novel + compounds. This tool is released as a freely available R package, called RAMClustR, and is sufficiently + versatile to group features from any chromatographic-spectrometric platform or feature-finding software. + {\textcopyright} 2014 American Chemical Society.}, + author = {Broeckling, C. D. and Afsar, F. A. and Neumann, S. and Ben-Hur, A. and Prenni, J. E.}, + doi = {10.1021/ac501530d}, + issn = {15206882}, + journal = {Analytical Chemistry}, + number = {14}, + pages = {6812--6817}, + pmid = {24927477}, + title = {{RAMClust: A novel feature clustering method enables spectral-matching-based annotation for + metabolomics data}}, + volume = {86}, + year = {2014} + } + </citation> + </citations> + </xml> + + <token name="@HELP@"> + <![CDATA[ + Documentation + For documentation on the tool see https://github.com/cbroeckl/RAMClustR/blob/master/vignettes/RAMClustR.Rmd + + Upstream Tools + +-------+----------------------+----------------------+------------+ + | Name | Output File | Format | Parameter | + +=======+======================+======================+============+ + | xcms | xset.fillPeaks.RData | rdata.xcms.fillpeaks | xcmsObj | + +-------+----------------------+----------------------+------------+ + + The tool takes an **xcmsSet** object as input and extracts all relevant information. + + +-------+------------------------+--------+------------+ + | Name | Output File | Format | Parameter | + +=======+========================+========+============+ + | ??? | Feature Table with MS1 | csv | ms | + +-------+------------------------+--------+------------+ + | ??? | Feature Table with MS2 | csv | idmsms | + +-------+------------------------+--------+------------+ + + Alternatively, the tool takes a **csv** table as input which has to fulfill the following requirements + + (1) no more than one sample (or file) name column and one feature name row; + (2) feature names that contain the mass and retention times, separated by a constant delimiter; and + (3) features in columns and samples in rows. + + Downstream Tools + +---------+--------------+----------------------+ + | Name | Output File | Format | + +=========+==============+======================+ + | matchMS | Mass Spectra | collection (tgz/msp) | + +---------+--------------+----------------------+ + + @GENERAL_HELP@ + ]]> + </token> + + <token name="@GENERAL_HELP@"> + Background + Metabolomics + Metabolomics is frequently performed using chromatographically coupled mass spectrometry, with gas + chromatography, liquid chromatography, and capillary electrophoresis being the most frequently utilized + methods of separation. The coupling of chromatography to mass spectrometry is enabled with an + appropriate ionization source - electron impact (EI) for gas phase separations and electrospray + ionization (ESI) for liquid phase separations. XCMS is a commonly used tool to detect all the signals + from a metabolomics dataset, generating aligned features, where a feature is represented by a mass and + retention time. Each feature is presumed to derive from a single compound. However, each compound is + represented by several features. With any ionization method, isotopic peaks will be observed reflective + of the elemental composition of the analyte. In EI, fragmentation is a byproduct of ionization, and has + driven the generation of large mass spectral libraries. In ESI, in-source fragmentation frequently + occurs, the magnitude of which is compound dependent, with more labile compounds being more prone to + in-source fragmentation. ESI can also product multiple adduct forms (protonated, potassiated, sodiated, + ammoniated...), and can produce multimers (i.e. [2M+H]+, [3M+K]+, etc) and multiple charged species + ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For + example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer. + + RAMClustR approach + RAMClustR was designed to group features designed from the same compound using an approach which is + __1.__ unsupervised, __2.__ platform agnosic, and __3.__ devoid of curated rules, as the depth of + understanding of these processes is insufficent to enable accurate curation/prediction of all phenomenon + that may occur. We acheive this by making two assumptions. The first is that two features derived + from the same compound with have (approximately) the same retention time. The second is that two + features derived from the same compound will have (approximately) the same quantitative trend across + all samples in the xcms sample set. From these assumptions, we can calculate a retention time + similarity score and a correlational similarity score for each feature pair. A high similarity score + for both retention time and correlation indicates a strong probability that two features derive from + the same compound. Since both conditions must be met, the product of the two similarity scores provides + the best approximatio of the total similarity score - i.e. a feature pair with retention time similarity + of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final + similarity score is zero, indicating the two features represent two different compounds. Similarly, a + feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive + from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and + correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1. + + The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting + this score matrix for heirarchical clustering, and then cutting the resulting dendrogram into neat + chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of + features likely to be derived from a single compound. Importantly, this is acheived without looking for + specific phenomenon (i.e. sodiation), meaning that grouping can be performed on any dataset, whether it + is poisitive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex + adduction event, and predictable or unpredictable signals. + </token> +</macros>
--- a/ramclustr.xml Mon Jan 17 16:28:54 2022 +0000 +++ b/ramclustr.xml Fri Jan 28 16:25:33 2022 +0000 @@ -1,16 +1,14 @@ <tool id="ramclustr" name="RAMClustR" version="@TOOL_VERSION@+galaxy0"> <macros> - <import>ramclustr_macros.xml</import> + <import>macros.xml</import> </macros> - <creator> - <organization - url="https://www.recetox.muni.cz/" - name="RECETOX MUNI" /> - </creator> + <expand macro="creator"/> + <requirements> <requirement type="package" version="@TOOL_VERSION@">r-ramclustr</requirement> <requirement type="package" version="3.14.0">bioconductor-xcms</requirement> </requirements> + <command detect_errors="aggressive"><![CDATA[ Rscript -e 'source("${__tool_directory__}/ramclustr_wrapper.R")' @@ -73,7 +71,7 @@ <option value="csv">CSV</option> </param> <when value="xcms"> - <param name="input_xcms" label="input_xcms" type="data" format="rdata.xcms.fillpeaks" help=": containing grouped feature data for clustering by ramclustR" /> + <param name="input_xcms" label="input_xcms" type="data" format="rdata.xcms.fillpeaks" help="Grouped feature data for clustering by ramclustR" /> <expand macro="parameters_required" /> <expand macro="parameters_optional_xcms" /> <expand macro="parameters_optional_metadata" /> @@ -144,40 +142,9 @@ </tests> <help> - Documentation - For documentation on the tool see https://github.com/cbroeckl/RAMClustR/blob/master/vignettes/RAMClustR.Rmd - - Upstream Tools - +-------+----------------------+----------------------+------------+ - | Name | Output File | Format | Parameter | - +=======+======================+======================+============+ - | xcms | xset.fillPeaks.RData | rdata.xcms.fillpeaks | xcmsObj | - +-------+----------------------+----------------------+------------+ - - The tool takes an **xcmsSet** object as input and extracts all relevant information. - - +-------+------------------------+--------+------------+ - | Name | Output File | Format | Parameter | - +=======+========================+========+============+ - | ??? | Feature Table with MS1 | csv | ms | - +-------+------------------------+--------+------------+ - | ??? | Feature Table with MS2 | csv | idmsms | - +-------+------------------------+--------+------------+ - - Alternatively, the tool takes a **csv** table as input which has to fulfill the following requirements - - (1) no more than one sample (or file) name column and one feature name row; - (2) feature names that contain the mass and retention times, separated by a constant delimiter; and - (3) features in columns and samples in rows. - - Downstream Tools - +---------+--------------+----------------------+ - | Name | Output File | Format | - +=========+==============+======================+ - | matchMS | Mass Spectra | collection (tgz/msp) | - +---------+--------------+----------------------+ - - @GENERAL_HELP@ + <![CDATA[ + @HELP@ + ]]> </help> <expand macro="citations" />
--- a/ramclustr_macros.xml Mon Jan 17 16:28:54 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,143 +0,0 @@ -<macros> - <token name="@TOOL_VERSION@">1.2.2</token> - - <xml name="parameters_csv"> - <section name="ms_csv" title="Input MS Data as CSV" expanded="true"> - <param label="ms" name="ms" type="data" format="csv" help="Features as columns, rows as samples. Column header mz_rt" /> - <param label="idmsms" name="idmsms" type="data" format="csv" optional="true" help="Optional idMSMS / MSe csv data. same dim and names as ms required" /> - <param label="sample_name_column" name="sample_name_column" type="integer" value="1" help="which column from the csv file contains sample names?" /> - <param label="feature_delimiter" name="feature_delimiter" type="text" value="_" help="Only required if ms input is set! How feature mz and rt are delimited in csv import column header e.g. ='-'" /> - <param label="retention_time_column" name="retention_time_column" type="integer" value="2" help="which position in delimited column header represents the retention time (csv only)" /> - <param label="st" name="st" type="float" value="1" help="sigma t - time similarity decay value. A recommended starting point is half the value of - your average chromatographic peak width at half max (seconds))." /> - </section> - </xml> - - <xml name="parameters_excluded"> - <param label="MStag" name="MStag" type="text" optional="true" help="character string in 'taglocation' to designat MS / MSe files e.g. '01.cdf'" /> - <param label="idMSMStag" name="idMSMStag" type="text" optional="true" help="character string in 'taglocation' to designat idMSMS / MSe files e.g. '02.cdf'" /> - <param label="taglocation" name="taglocation" type="text" value="filepaths" help="'filepaths' by default, 'phenoData[,1]' is another option. refers to xcms slot" /> - </xml> - - <xml name="parameters_required"> - <section name="required" title="Required Parameters" expanded="true"> - <param label="sr" name="sr" type="float" value="0.5" help="sigma r - correlational similarity decay value" /> - <param label="deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" help="controls how agressively the HCA tree is cut - see ?cutreeDynamicTree" /> - <param label="blocksize" name="blocksize" type="integer" value="2000" help="number of features (scans?) processed in one block =1000," /> - <param label="mult" name="mult" type="integer" value="5" help="internal value, can be used to influence processing speed/ram usage" /> - <param label="hmax" name="hmax" type="float" value="0.3" help="precut the tree at this height, default 0.3 - see ?cutreeDynamicTree" /> - <param label="collapse" name="collapse" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="reduce feature intensities to spectrum intensities?" /> - <param label="usePheno" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="transfer phenotype data from XCMS object to SpecAbund dataset?" /> - <!-- - Currently not forwarded because the MSP is exported always manually afterwards - <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" /> - --> - <param label="normalize" name="normalize" type="select" display="radio" help="either 'none', 'TIC', 'quantile', or 'batch.qc' normalization of feature intensities. see batch.qc overview in details. "> - <option value="none" selected="true">none</option> - <option value="TIC">TIC</option> - <option value="quantile">quantile</option> - <option value="batch.qc">batch.qc</option> - </param> - <param label="qc_inj_range" name="qc_inj_range" type="integer" value="20" help="how many injections around each injection are to be scanned for presence of QC samples when using batch.qc normalization? A good rule of thumb is between 1 and 3 times the typical injection span between QC injections. i.e. if you inject QC ever 7 samples, set this to between 7 and 21. smaller values provide more local precision but make normalization sensitive to individual poor outliers (though these are first removed using the boxplot function outlier detection), while wider values provide less local precision in normalization but better stability to individual peak areas." /> - - <param label="minModuleSize" name="minModuleSize" type="integer" value="2" help="how many features must be part of a cluster to be returned? default = 2" /> - <param label="linkage" name="linkage" type="select" display="radio" value="average" help="hierarchical clustering linkage method - see ?hclust"> - <option value="average" selected="true">average</option> - <option value="ward.D">ward.D</option> - <option value="ward.D2">ward.D2</option> - <option value="single">single</option> - <option value="complete">complete</option> - <option value="mcquitty">mcquitty</option> - <option value="median">median</option> - <option value="centroid">centroid</option> - </param> - - <param label="mzdec" name="mzdec" type="integer" value="3" help="number of decimal places used in printing m/z values" /> - <param label="cor_method" name="cor_method" type="select" display="radio" value="pearson" help="which correlational method used to calculate 'r' - see ?cor"> - <option value="pearson" selected="true">pearson</option> - <option value="everything">everything</option> - <option value="spearman">spearman</option> - <option value="kendall">kendall</option> - </param> - - <param label="rt_only_low_n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="At low injection numbers, correlational relationships of peak intensities may be unreliable. by defualt ramclustR will simply ignore the correlational r value and cluster on retention time alone. if you wish to use correlation with at n less than 5, set this value to FALSE." /> - <param label="replace_zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from peak peaking. When TRUE, zero values will be replaced with a small amount of noise, with noise level set based on the detected signal intensities for that feature. " /> - <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="Whether to merge all msp in one file or export one msp per spectra"/> - </section> - </xml> - - <xml name="parameters_optional_xcms"> - <section name="optional" title="Optional Parameters" expanded="false"> - <param label="st" name="st" type="float" optional="true" help="sigma t - time similarity decay value. A recommended starting point is half the value of - your average chromatographic peak width at half max (seconds))." /> - <param label="fftempdir" name="fftempdir" type="text" optional="true" help="valid path: if there are file size limitations on the default ff pacakge temp directory - getOptions('fftempdir') - you can change the directory used as the fftempdir with this option." /> - <param label="maxt" name="maxt" type="integer" optional="true" help="maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero" /> - </section> - </xml> - - <xml name="parameters_optional_csv"> - <section name="optional" title="Optional Parameters" expanded="false"> - <param label="fftempdir" name="fftempdir" type="text" optional="true" help="valid path: if there are file size limitations on the default ff pacakge temp directory - getOptions('fftempdir') - you can change the directory used as the fftempdir with this option." /> - <param label="maxt" name="maxt" type="integer" optional="true" help="maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero" /> - </section> - </xml> - - <xml name="parameters_optional_metadata"> - <section name="metadata" title="Optional Metadata" expanded="false"> - <param label="metadata" name="batch_order_qc" type="data" format="csv" optional="true" help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch number ('batch'), position in sequence ('order') and whether it is a qc sample or not ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')." /> - <!-- <param label="ExpDes" name="ExpDes" type="data" format="RData" optional="true" help=" an R object created by R ExpDes object: data used for record keeping and labelling msp spectral output" /> --> - </section> - </xml> - - <xml name="parameters_define_experiment"> - <section name="define_experiment" title="Define Experiment" expanded="false"> - <param label="Experiment" name="experiment" type="text" help="experiment name, no spaces" /> - <param label="Species" name="species" type="text" help="species name" /> - <param label="Sample" name="sample" type="text" help="sample type" /> - <param label="Contributor" name="contributor" type="text" help="individual and/or organizational affiliation" /> - <param label="Platform" name="platform" type="select" display="radio" help="platform"> - <option value="GC-MS" selected="true">GC-MS</option> - <option value="LC-MS">LC-MS</option> - </param> - </section> - </xml> - - <xml name="output_msp"> - <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra" type="list"> - <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp" /> - </collection> - </xml> - - <token name="@GENERAL_HELP@"> - Background - Metabolomics - Metabolomics is frequently performed using chromatographically coupled mass spectrometry, with gas chromatography, liquid chromatography, and capillary electrophoresis being the most frequently utilized methods of separation. The coupling of chromatography to mass spectrometry is enabled with an appropriate ionization source - electron impact (EI) for gas phase separations and electrospray ionization (ESI) for liquid phase separations. XCMS is a commonly used tool to detect all the signals from a metabolomics dataset, generating aligned features, where a feature is represented by a mass and retention time. Each feature is presumed to derive from a single compound. However, each compound is represented by several features. With any ionization method, isotopic peaks will be observed reflective of the elemental composition of the analyte. In EI, fragmentation is a byproduct of ionization, and has driven the generation of large mass spectral libraries. In ESI, in-source fragmentation frequently occurs, the magnitude of which is compound dependent, with more labile compounds being more prone to in-source fragmentation. ESI can also product multiple adduct forms (protonated, potassiated, sodiated, ammoniated...), and can produce multimers (i.e. [2M+H]+, [3M+K]+, etc) and multiple charged species ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer. - - RAMClustR approach - RAMClustR was designed to group features designed from the same compound using an approach which is __1.__ unsupervised, __2.__ platform agnosic, and __3.__ devoid of curated rules, as the depth of understanding of these processes is insufficent to enable accurate curation/prediction of all phenomenon that may occur. We acheive this by making two assumptions. The first is that two features derived from the same compound with have (approximately) the same retention time. The second is that two features derived from the same compound will have (approximately) the same quantitative trend across all samples in the xcms sample set. From these assumptions, we can calculate a retention time similarity score and a correlational similarity score for each feature pair. A high similarity score for both retention time and correlation indicates a strong probability that two features derive from the same compound. Since both conditions must be met, the product of the two similarity scores provides the best approximatio of the total similarity score - i.e. a feature pair with retention time similarity of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final similarity score is zero, indicating the two features represent two different compounds. Similarly, a feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1. - - - The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting this score matrix for heirarchical clustering, and then cutting the resulting dendrogram into neat chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of features likely to be derived from a single compound. Importantly, this is acheived without looking for specific phenomenon (i.e. sodiation), meaning that grouping can be performed on any dataset, whether it is poisitive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex adduction event, and predictable or unpredictable signals. - </token> - - <xml name="citations"> - <citations> - <!-- Example of annotating a citation using a BibTex entry. --> - <citation type="bibtex"> - @article{Broeckling2014e, - abstract = {Metabolomic data are frequently acquired using chromatographically coupled mass spectrometry (MS) platforms. For such datasets, the first step in data analysis relies on feature detection, where a feature is defined by a mass and retention time. While a feature typically is derived from a single compound, a spectrum of mass signals is more a more-accurate representation of the mass spectrometric signal for a given metabolite. Here, we report a novel feature grouping method that operates in an unsupervised manner to group signals from MS data into spectra without relying on predictability of the in-source phenomenon. We additionally address a fundamental bottleneck in metabolomics, annotation of MS level signals, by incorporating indiscriminant MS/MS (idMS/MS) data implicitly: feature detection is performed on both MS and idMS/MS data, and feature-feature relationships are determined simultaneously from the MS and idMS/MS data. This approach facilitates identification of metabolites using in-source MS and/or idMS/MS spectra from a single experiment, reduces quantitative analytical variation compared to single-feature measures, and decreases false positive annotations of unpredictable phenomenon as novel compounds. This tool is released as a freely available R package, called RAMClustR, and is sufficiently versatile to group features from any chromatographic-spectrometric platform or feature-finding software. {\textcopyright} 2014 American Chemical Society.}, - author = {Broeckling, C. D. and Afsar, F. A. and Neumann, S. and Ben-Hur, A. and Prenni, J. E.}, - doi = {10.1021/ac501530d}, - issn = {15206882}, - journal = {Analytical Chemistry}, - number = {14}, - pages = {6812--6817}, - pmid = {24927477}, - title = {{RAMClust: A novel feature clustering method enables spectral-matching-based annotation for metabolomics data}}, - volume = {86}, - year = {2014} - } - </citation> - </citations> - </xml> -</macros>