comparison macros.xml @ 5:10ded21d47c0 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit 3d2821ffc97cc4f9287ee83bbddb306a8034daa0"
author recetox
date Fri, 11 Feb 2022 14:14:38 +0000
parents 69e0da4703b5
children f01ab6fe8857
comparison
equal deleted inserted replaced
4:69e0da4703b5 5:10ded21d47c0
30 </creator> 30 </creator>
31 </xml> 31 </xml>
32 32
33 <xml name="parameters_csv"> 33 <xml name="parameters_csv">
34 <section name="ms_csv" title="Input MS Data as CSV" expanded="true"> 34 <section name="ms_csv" title="Input MS Data as CSV" expanded="true">
35 <param label="ms" name="ms" type="data" format="csv" 35 <param label="Input CSV" name="ms" type="data" format="csv"
36 help="Features as columns, rows as samples. Column header mz_rt"/> 36 help="Features as columns, rows as samples. Column header in format mz_rt."/>
37 <param label="idmsms" name="idmsms" type="data" format="csv" optional="true" 37 <param label="idMSMS" name="idmsms" type="data" format="csv" optional="true"
38 help="Optional idMSMS / MSe csv data. same dim and names as ms required"/> 38 help="Optional idMSMS / MSe csv data. Same dimension and names as in input CSV are required."/>
39 <param label="sample_name_column" name="sample_name_column" type="integer" value="1" 39 </section>
40 help="Which column from the csv file contains sample names?"/> 40 </xml>
41 <param label="feature_delimiter" name="feature_delimiter" type="text" value="_" 41
42 help="Only required if ms input is set! How feature mz and rt are delimited in csv import column header e.g. ='-'"/> 42 <xml name="parameters_xcms">
43 <param label="retention_time_column" name="retention_time_column" type="integer" value="2" 43 <section name="xcms" title="Input MS Data as XCMS" expanded="true">
44 help="Which position in delimited column header represents the retention time (csv only)"/> 44 <param name="input_xcms" label="Input XCMS" type="data" format="rdata.xcms.fillpeaks"
45 <param label="st" name="st" type="float" value="1" help="Sigma t - time similarity decay value. 45 help="Grouped feature data for clustering." />
46 A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> 46 <param label="Preserve phenotype" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true"
47 </section> 47 help="Transfer phenotype data from XCMS object to Spec abundance file."/>
48 </xml> 48 </section>
49
50 <xml name="parameters_excluded">
51 <param label="MStag" name="MStag" type="text" optional="true"
52 help="Character string in 'taglocation' to designat MS / MSe files e.g. '01.cdf'"/>
53 <param label="idMSMStag" name="idMSMStag" type="text" optional="true"
54 help="Character string in 'taglocation' to designat idMSMS / MSe files e.g. '02.cdf'"/>
55 <param label="taglocation" name="taglocation" type="text" value="filepaths"
56 help="'filepaths' by default, 'phenoData[,1]' is another option. refers to xcms slot"/>
57 </xml> 49 </xml>
58 50
59 <xml name="parameters_required"> 51 <xml name="parameters_required">
60 <section name="required" title="Required Parameters" expanded="true"> 52 <param label="Sigma r" name="sr" type="float" value="0.5" help="Correlational similarity between features."/>
61 <param label="sr" name="sr" type="float" value="0.5" help="Sigma r - correlational similarity decay value"/> 53 <param label="Correlation method" name="cor_method" type="select" display="radio"
62 <param label="deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" 54 help="Choose correlational method to be used - see [1] for details.">
63 help="Controls how agressively the HCA tree is cut - see ?cutreeDynamicTree"/> 55 <option value="pearson" selected="true">pearson</option>
64 <param label="blocksize" name="blocksize" type="integer" value="2000" 56 <option value="everything">everything</option>
65 help="Number of features (scans?) processed in one block =1000,"/> 57 <option value="spearman">spearman</option>
66 <param label="mult" name="mult" type="integer" value="5" 58 <option value="kendall">kendall</option>
67 help="Internal value, can be used to influence processing speed/ram usage"/> 59 </param>
68 <param label="hmax" name="hmax" type="float" value="0.3" 60 <param label="Maximum RT difference" name="maxt" value="60" type="float"
69 help="Precut the tree at this height, default 0.3 - see ?cutreeDynamicTree"/> 61 help="Maximum difference to calculate RT similarity - values beyond this are assigned zero similarity."/>
70 <param label="collapse" name="collapse" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" 62 </xml>
71 help="Reduce feature intensities to spectrum intensities?"/> 63
72 <param label="usePheno" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" 64 <xml name="main_parameters">
73 help="Transfer phenotype data from XCMS object to SpecAbund dataset?"/> 65 <section name="clustering" title="Clustering" expanded="true">
74 <!-- 66 <param label="Clustering linkage method" name="linkage" type="select" display="radio"
75 Currently not forwarded because the MSP is exported always manually afterwards 67 help="Choose hierarchical clustering linkage method - see [2] for details.">
76 <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" />
77 -->
78 <param label="normalize" name="normalize" type="select" display="radio"
79 help="Either 'none', 'TIC', 'quantile', or 'batch.qc' normalization of feature intensities. see batch.qc overview in details. ">
80 <option value="none" selected="true">none</option>
81 <option value="TIC">TIC</option>
82 <option value="quantile">quantile</option>
83 <option value="batch.qc">batch.qc</option>
84 </param>
85 <param label="qc_inj_range" name="qc_inj_range" type="integer" value="20"
86 help="How many injections around each injection are to be scanned for presence of QC samples when
87 using batch.qc normalization? A good rule of thumb is between 1 and 3 times the typical
88 injection span between QC injections. i.e. if you inject QC every 7 samples, set this to
89 between 7 and 21. Smaller values provide more local precision but make normalization sensitive
90 to individual poor outliers (though these are first removed using the boxplot function outlier
91 detection), while wider values provide less local precision in normalization but better
92 stability to individual peak areas."/>
93
94 <param label="minModuleSize" name="minModuleSize" type="integer" value="2"
95 help="How many features must be part of a cluster to be returned? default = 2"/>
96 <param label="linkage" name="linkage" type="select" display="radio" value="average"
97 help="Hierarchical clustering linkage method - see ?hclust">
98 <option value="average" selected="true">average</option> 68 <option value="average" selected="true">average</option>
99 <option value="ward.D">ward.D</option> 69 <option value="ward.D">ward.D</option>
100 <option value="ward.D2">ward.D2</option> 70 <option value="ward.D2">ward.D2</option>
101 <option value="single">single</option> 71 <option value="single">single</option>
102 <option value="complete">complete</option> 72 <option value="complete">complete</option>
103 <option value="mcquitty">mcquitty</option> 73 <option value="mcquitty">mcquitty</option>
104 <option value="median">median</option> 74 <option value="median">median</option>
105 <option value="centroid">centroid</option> 75 <option value="centroid">centroid</option>
106 </param> 76 </param>
107 77 <param label="Minimal cluster size" name="minModuleSize" type="integer" value="2"
108 <param label="mzdec" name="mzdec" type="integer" value="3" 78 help="Minimal size (number of features) of a cluster."/>
109 help="Number of decimal places used in printing m/z values"/> 79 <param label="Maximal tree height" name="hmax" type="float" value="0.3"
110 <param label="cor_method" name="cor_method" type="select" display="radio" value="pearson" 80 help="Cut the Hierarchical Cluster Analysis tree at this height, see [3] for details."/>
111 help="Which correlational method used to calculate 'r' - see ?cor"> 81 <param label="Use deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false"
112 <option value="pearson" selected="true">pearson</option> 82 help="Check to produce more smaller clusters, uncheck for fewer bigger clusters, see [3] for details."/>
113 <option value="everything">everything</option> 83 </section>
114 <option value="spearman">spearman</option> 84
115 <option value="kendall">kendall</option> 85 <section name="normalisation" title="Normalisation" expanded="true">
116 </param> 86 <conditional name="normalisation_method">
117 87 <param label="Normalisation method" name="normalize" type="select" display="radio"
118 <param label="rt_only_low_n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" 88 help="Choose method for normalization of feature intensities.">
89 <option value="none" selected="true">none</option>
90 <option value="TIC">TIC</option>
91 <option value="quantile">quantile</option>
92 <option value="batch.qc">batch.qc</option>
93 </param>
94 <when value="batch.qc">
95 <param label="Metadata details" name="batch_order_qc" type="data" format="csv" optional="true"
96 help="CSV with sample names (or indices, currently not handled) on rows and columns with:
97 batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not
98 ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/>
99 <param label="QC injection range" name="qc_inj_range" type="integer" value="20"
100 help="How many injections around each injection are to be scanned for presence of QC samples?
101 A good rule of thumb is between 1 and 3 times the typical
102 injection span between QC injections. i.e. if you inject QC every 7 samples, set this to
103 between 7 and 21. Smaller values provide more local precision but make normalization sensitive
104 to individual poor outliers (though these are first removed using the boxplot function outlier
105 detection), while wider values provide less local precision in normalization but better
106 stability to individual peak areas."/>
107 </when>
108 </conditional>
109 </section>
110
111 <section name="performance" title="Performance">
112 <param label="Blocksize" name="blocksize" type="integer" value="2000"
113 help="Number of features processed in one block."/>
114 <param label="Blocksize factor" name="mult" type="integer" value="5"
115 help="Factor to scale blocksize to influence processing speed."/>
116 </section>
117
118 <section name="msp_output_details" title="MSP output">
119 <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE"
120 checked="true" help="Merge all MSP in one file or export one MSP per spectra."/>
121 <param label="m/z decimal places" name="mzdec" type="integer" value="6"
122 help="Number of decimal places used in printing m/z values."/>
123 <!--
124 Currently not forwarded because the MSP is exported always manually afterwards
125 <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" />
126 -->
127 </section>
128
129 <section name="extras" title="Extras">
130 <param label="RT only low n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE"
119 checked="true" 131 checked="true"
120 help="At low injection numbers, correlational relationships of peak intensities may be unreliable. 132 help="At low injection numbers, correlational relationships of peak intensities may be unreliable.
121 By defualt ramclustR will simply ignore the correlational r value and cluster on retention time alone. 133 By default, RAMClustR will simply ignore the correlational Sigma r value and cluster on retention time alone.
122 If you wish to use correlation with n less than 5, set this value to FALSE."/> 134 If you wish to use correlation with n less than 5, set this value to FALSE."/>
123 <param label="replace_zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" 135 <param label="Replace zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE"
124 checked="true" 136 checked="true"
125 help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from 137 help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from
126 peak picking. When TRUE, zero values will be replaced with a small amount of noise, with noise level 138 peak picking. When TRUE, zero values will be replaced with a small amount of noise, with noise level
127 set based on the detected signal intensities for that feature. "/> 139 set based on the detected signal intensities for that feature."/>
128 <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" 140 <param label="Experimental design metadata" name="ExpDes" type="data" format="csv" optional="true"
129 checked="true" help="Whether to merge all msp in one file or export one msp per spectra"/> 141 help="Definition of experimental design in CSV format." />
130 </section>
131 </xml>
132
133 <xml name="parameters_optional_xcms">
134 <section name="optional" title="Optional Parameters" expanded="false">
135 <param label="st" name="st" type="float" optional="true" help="Sigma t - time similarity decay value.
136 A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/>
137 <param label="fftempdir" name="fftempdir" type="text" optional="true"
138 help="Valid path: if there are file size limitations on the default ff pacakge temp directory -
139 getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/>
140 <param label="maxt" name="maxt" type="integer" optional="true"
141 help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/>
142 </section>
143 </xml>
144
145 <xml name="parameters_optional_csv">
146 <section name="optional" title="Optional Parameters" expanded="false">
147 <param label="fftempdir" name="fftempdir" type="text" optional="true"
148 help="Valid path: if there are file size limitations on the default ff pacakge temp directory -
149 getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/>
150 <param label="maxt" name="maxt" type="integer" optional="true"
151 help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/>
152 </section>
153 </xml>
154
155 <xml name="parameters_optional_metadata">
156 <section name="metadata" title="Optional Metadata" expanded="false">
157 <param label="metadata" name="batch_order_qc" type="data" format="csv" optional="true"
158 help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch
159 number ('batch'), position in sequence ('order') and whether it is a qc sample or not
160 ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/>
161 <param label="Experimental design" name="ExpDes" type="data" format="csv" optional="true" help="Definition of experimental design in CSV format." />
162 </section> 142 </section>
163 </xml> 143 </xml>
164 144
165 <xml name="output_msp"> 145 <xml name="output_msp">
166 <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra" type="list"> 146 <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_collection" type="list">
167 <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/> 147 <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/>
168 </collection> 148 <filter>not msp_output_details['merge_msp']</filter>
149 </collection>
150 <data label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_merged" format="msp">
151 <filter>msp_output_details['merge_msp']</filter>
152 </data>
169 </xml> 153 </xml>
170 154
171 <xml name="citations"> 155 <xml name="citations">
172 <citations> 156 <citations>
173 <!-- Example of annotating a citation using a BibTex entry. --> 157 <!-- Example of annotating a citation using a BibTex entry. -->
232 216
233 (1) no more than one sample (or file) name column and one feature name row; 217 (1) no more than one sample (or file) name column and one feature name row;
234 (2) feature names that contain the mass and retention times, separated by a constant delimiter; and 218 (2) feature names that contain the mass and retention times, separated by a constant delimiter; and
235 (3) features in columns and samples in rows. 219 (3) features in columns and samples in rows.
236 220
221 +----------------------+-------------------+-------------------+--------------------+--------------------+
222 | sample | 100.88_262.464 | 100.01_423.699 | 100.003_128.313 | 100.0057_154.686 |
223 +======================+===================+===================+====================+====================+
224 | 10_qc_16x_dil_milliq | 0 | 195953.6376 | 0 | 0 |
225 +----------------------+-------------------+-------------------+--------------------+--------------------+
226 | 11_qc_8x_dil_milliq | 0 | 117742.1828 | 4247300.664 | 0 |
227 +----------------------+-------------------+-------------------+--------------------+--------------------+
228 | 12_qc_32x_dil_milliq | 4470859.38 | 0 | 2206092.112 | 0 |
229 +----------------------+-------------------+-------------------+--------------------+--------------------+
230 | 15_qc_16x_dil_milliq | 0 | 0 | 2767477.481 | 0 |
231 +----------------------+-------------------+-------------------+--------------------+--------------------+
232
233
237 Downstream Tools 234 Downstream Tools
235 The output is an MSP file or a collection of MSP files, with an additional Spec Abundance file.
236
238 +---------+--------------+----------------------+ 237 +---------+--------------+----------------------+
239 | Name | Output File | Format | 238 | Name | Output File | Format |
240 +=========+==============+======================+ 239 +=========+==============+======================+
241 | matchMS | Mass Spectra | collection (tgz/msp) | 240 | matchMS | Mass Spectra | collection (tgz/msp) |
242 +---------+--------------+----------------------+ 241 +---------+--------------+----------------------+
264 ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For 263 ([M+2H]++). This can become further complicated by considering combinations of these phenomena. For
265 example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer. 264 example [2M+3H]+++ (triply charged dimer) or an in-source fragment of a dimer.
266 265
267 RAMClustR approach 266 RAMClustR approach
268 RAMClustR was designed to group features derived from the same compound using an approach which is 267 RAMClustR was designed to group features derived from the same compound using an approach which is
269 __1.__ unsupervised, __2.__ platform agnosic, and __3.__ devoid of curated rules, as the depth of 268 **1.** unsupervised, **2.** platform agnostic, and **3.** devoid of curated rules, as the depth of
270 understanding of these processes is insufficent to enable accurate curation/prediction of all phenomena 269 understanding of these processes is insufficient to enable accurate curation/prediction of all phenomena
271 that may occur. We acheive this by making two assumptions. The first is that two features derived 270 that may occur. We achieve this by making two assumptions. The first is that two features derived
272 from the same compound will have (approximately) the same retention time. The second is that two 271 from the same compound will have (approximately) the same retention time. The second is that two
273 features derived from the same compound will have (approximately) the same quantitative trend across 272 features derived from the same compound will have (approximately) the same quantitative trend across
274 all samples in the xcms sample set. From these assumptions, we can calculate a retention time 273 all samples in the xcms sample set. From these assumptions, we can calculate a retention time
275 similarity score and a correlational similarity score for each feature pair. A high similarity score 274 similarity score and a correlational similarity score for each feature pair. A high similarity score
276 for both retention time and correlation indicates a strong probability that two features derive from 275 for both retention time and correlation indicates a strong probability that two features derive from
277 the same compound. Since both conditions must be met, the product of the two similarity scores provides 276 the same compound. Since both conditions must be met, the product of the two similarity scores provides
278 the best approximatio of the total similarity score - i.e. a feature pair with retention time similarity 277 the best approximation of the total similarity score - i.e. a feature pair with retention time similarity
279 of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final 278 of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final
280 similarity score is zero, indicating the two features represent two different compounds. Similarly, a 279 similarity score is zero, indicating the two features represent two different compounds. Similarly, a
281 feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive 280 feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive
282 from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and 281 from one compound - 0 x 1 = 0. Alternatively - a feature pair with retention time similarity of 1 and
283 correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1. 282 correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1.
284 283
285 The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting 284 The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting
286 this score matrix for heirarchical clustering, and then cutting the resulting dendrogram into neat 285 this score matrix for hierarchical clustering, and then cutting the resulting dendrogram into neat
287 chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of 286 chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of
288 features likely to be derived from a single compound. Importantly, this is acheived without looking for 287 features likely to be derived from a single compound. Importantly, this is achieved without looking for
289 specific phenomena (e.g. sodiation), meaning that grouping can be performed on any dataset, whether it 288 specific phenomena (e.g. sodiation), meaning that grouping can be performed on any dataset, whether it
290 is poisitive or negative ionization mode, EI or ESI, LC-MS, GC-MS or CE-MS, in-source fragment or complex 289 is positive or negative ionization mode, EI or ESI, LC-MS, GC-MS or CE-MS, in-source fragment or complex
291 adduction event, and predictable or unpredictable signals. 290 adduction event, and predictable or unpredictable signals.
292 </token> 291 </token>
293 292
294 <token name="@HELP_experiment@"> 293 <token name="@HELP_experiment@">
295 <![CDATA[ 294 <![CDATA[