comparison process_scans.xml @ 0:fde0aac74d2f draft default tip

"planemo upload for repository https://github.com/computational-metabolomics/dimspy-galaxy commit 80069808371b58f45da0c8133c27d67ac1a5b448"
author computational-metabolomics
date Wed, 17 Feb 2021 10:50:27 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:fde0aac74d2f
1 <tool id="dimspy_process_scans" name="Process Scans (and SIM-Stitch)" version="@TOOL_VERSION@+galaxy1">
2 <description> - Read, filter and average MS scans</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <!-- Command template: unzip a zip input (or symlink each dataset under its name), run "dimspy process-scans", then export the HDF5 peaklists to text files in the working directory. Dataset paths are single-quoted throughout. --> <command detect_errors="exit_code">
8 <![CDATA[
9 #if $data.input[0].is_of_type("zip")
10 dimspy unzip
11 --input '$data.input[0]'
12 --output ./data
13 &&
14 dimspy process-scans
15 --input ./data
16 #else
17 #for $fn in $data.input
18 ln -s '$fn' '$fn.name'
19 &&
20 #end for
21 dimspy process-scans
22 --input .
23 #end if
24 --output '$hdf5_file_out'
25 #if $filelist
26 --filelist '$filelist'
27 #end if
28 --function-noise $function_noise
29 --snr-threshold $snr_threshold
30 --ppm $mults.ppm
31 --min_scans $mults.min_scans
32 #if float($mults.min_fraction) > 0.0
33 --min-fraction $mults.min_fraction
34 #else
35 --min-fraction 0.0
36 #end if
37 #if float($mults.rsd_threshold) > 0.0
38 --rsd-threshold $mults.rsd_threshold
39 #end if
40 #if $adv.skip_stitching
41 --skip-stitching
42 #end if
43 #if float($adv.ringing_threshold) > 0.0
44 --ringing-threshold $adv.ringing_threshold
45 #end if
46 #for $mzr in $adv.remove_mz_range
47 --remove-mz-range $mzr.start $mzr.end
48 #end for
49 #if $scan_events.filter == 'true'
50 #for $se in $scan_events.descriptions
51 #if $scan_events.incl_excl == 'include'
52 --include-scan-events $se.start $se.end $se.scan_type
53 #elif $scan_events.incl_excl == 'exclude'
54 --exclude-scan-events $se.start $se.end $se.scan_type
55 #end if
56 #end for
57 #end if
58 --report '$report'
59 --ncpu \${GALAXY_SLOTS:-1}
60 &&
61 dimspy hdf5-pls-to-txt
62 --input '$hdf5_file_out'
63 --output .
64 --delimiter $delimiter
65 ]]>
66 </command>
67 <inputs> <!-- Tool parameters; the names declared here are the ones referenced by the command template (data.input, filelist, function_noise, snr_threshold, scan_events.*, mults.*, adv.*, delimiter). -->
68 <conditional name="data">
69 <param name="type" type="select" label="Select the MS data file type?">
70 <option value="mzml" selected="true">*.mzML files</option>
71 <option value="raw">*.raw files</option>
72 </param>
73 <when value="raw">
74 <param name="license_agreement" type="boolean" label="Do you agree to the RawFileReader license terms?" help="*.raw files are read using the RawFileReader reading tool (Copyright © 2016 by Thermo Fisher Scientific, Inc. All rights reserved). To run this tool and process .raw files you must agree to the RawFileReader license terms. Read it at https://github.com/computational-metabolomics/dimspy-galaxy/blob/master/tools/dimspy/RawFileReaderLicense.md. See generic help section of this tool for more details.">
75 <validator type="expression" message="You must agree to the RawFileReader license terms to run this tool and process *.raw files.">True == value</validator>
76 </param>
77 <param name="input" argument="--input" type="data" format="thermo.raw" multiple="true" label="*.raw files" />
78 </when>
79 <when value="mzml">
80 <param name="input" argument="--input" type="data" format="zip,mzml" multiple="true" label="*.mzML files" />
81 </when>
82 </conditional>
83 <param name="filelist" argument="--filelist" type="data" format="tsv,tabular" optional="true" label="Filelist / Samplelist" />
84 <param name="function_noise" argument="--function-noise" type="select" label="Function to calculate the noise from each scan" help="">
85 <option value="median" selected="true">median intensity</option>
86 <option value="mean">mean intensity</option>
87 <option value="mad">mad (mean absolute deviation) intensity</option>
88 <option value="noise_packets">As shown in Xcalibur Qual Browser (Available for *.RAW files only)</option>
89 </param>
90 <param name="snr_threshold" argument="--snr-threshold" type="float" value="3.0" label="Signal-to-noise ratio threshold" help="" />
91 <conditional name="scan_events">
92 <param name="filter" type="boolean" label="Filter specific windows or scan events?" help="(--include-scan-events / --exclude-scan-events)"/>
93 <when value="true">
94 <param name="incl_excl" type="select" label="Include / Exclude scan event(s)" >
95 <option value="exclude" selected="true">Exclude</option>
96 <option value="include">Include</option>
97 </param>
98 <repeat name="descriptions" title="Description">
99 <param name="start" type="float" value="0" label="Start m/z for scan event"/>
100 <param name="end" type="float" value="0" label="End m/z for scan event">
101 <validator type="expression" message="M/z value must be larger than 0.0">float(value) > 0.0</validator>
102 </param>
103 <param name="scan_type" type="select" label="Scan type">
104 <option value="full" selected="true">Full scan</option>
105 <option value="sim">SIM scan</option>
106 </param>
107 </repeat>
108 </when>
109 <when value="false">
110 </when>
111 </conditional>
112 <section name="mults" title="Show options for multiple scans" expanded="True">
113 <param name="min_scans" argument="--min_scans" type="integer" min="1" value="1" label="Minimum number of scans required for each m/z window or event" help="" />
114 <param name="ppm" argument="--ppm" type="float" value="2.0" label="Ppm error tolerance" help="Maximum tolerated m/z deviation in consecutive scans in parts per million." />
115 <param name="min_fraction" argument="--min-fraction" type="float" min="0.0" max="1.0" value="0.0" label="Minimum fraction (i.e. percentage) of scans a peak has to be present in." help="Select '0' to skip this step." />
116 <param name="rsd_threshold" argument="--rsd-threshold" type="float" min="0.0" value="0.0" label="Relative standard deviation threshold" help="Select '0' to skip this step. Maximum tolerated relative standard deviation (RSD) of the peak intensities across scans." />
117 </section>
118 <section name="adv" title="Show advanced options" expanded="True">
119 <param name="skip_stitching" argument="--skip-stitching" type="boolean" checked="false" label="Skip SIM-Stitching?" help="When set to 'yes' it will skip the processing step where (SIM) windows are 'stitched' or 'joined' together. Set this option to 'yes' if you like to process individual scan/SIM windows (events/ranges) without 'stitching' them."/>
120 <repeat name="remove_mz_range" title="Remove m/z range(s)?">
121 <param name="start" type="float" value="0.0" label="Start m/z of removal range"/>
122 <param name="end" type="float" value="0.0" label="End m/z of removal range">
123 <validator type="expression" message="M/z value must be larger than 0.0">float(value) > 0.0</validator>
124 </param>
125 </repeat>
126 <param name="ringing_threshold" argument="--ringing-threshold" type="float" value="0.0" min="0.0" max="1.0" label="Relative intensity threshold used to remove ringing artifacts" help="Select '0' to skip this filter." />
127 </section>
128 <param name="delimiter" argument="--delimiter" type="hidden" value="tab" />
129 </inputs>
130 <outputs> <!-- Two fixed datasets (HDF5 peaklists and the report) plus a list collection built from the *.txt files found in the job working directory. -->
131 <data format="h5" name="hdf5_file_out" label="${tool.name} on ${on_string}: Peaklists (HDF5 file)" />
132 <data format="txt" name="report" label="${tool.name} on ${on_string}: Report" />
133 <collection type="list" name="peaklists_txt" label="${tool.name} on ${on_string}: Peaklists">
134 <discover_datasets directory="." pattern="(?P&lt;designation&gt;.+)\.txt" format="tsv" visible="false" />
135 </collection>
136 </outputs>
137 <tests> <!-- Functional tests: (1) three separate mzML files with a filelist, (2) a zip archive with a filelist, (3) a zip archive without a filelist and with an RSD threshold. -->
138 <test>
139 <conditional name="data">
140 <param name="type" value="mzml"/>
141 <param name="input" value="batch04_QC17_rep02_263.mzML,batch04_QC17_rep01_262.mzML,batch04_QC17_rep03_264.mzML" ftype="mzml" />
142 </conditional>
143 <param name="filelist" value="filelist_mzml_QC17_triplicate.txt" ftype="tsv" />
144 <param name="function_noise" value="median" />
145 <param name="snr_threshold" value="100.0" />
146 <section name="mults">
147 <param name="ppm" value="2.0" />
148 <param name="min_scans" value="1" />
149 <param name="min_fraction" value="0.5" />
150 <param name="rsd_threshold" value="0" />
151 </section>
152 <param name="delimiter" value="tab" />
153 <output name="hdf5_file_out" value="pls_scan5.h5" ftype="h5" compare="sim_size"/>
154 <output name="report" value="report_pls_scan5.txt" ftype="txt"/>
155 <output_collection name="peaklists_txt" type="list">
156 <element name="batch04_QC17_rep01_262" file="batch04_QC17_rep01_262_scan5.txt" ftype="tsv"/>
157 <element name="batch04_QC17_rep02_263" file="batch04_QC17_rep02_263_scan5.txt" ftype="tsv"/>
158 <element name="batch04_QC17_rep03_264" file="batch04_QC17_rep03_264_scan5.txt" ftype="tsv"/>
159 </output_collection>
160 </test>
161 <test>
162 <conditional name="data">
163 <param name="type" value="mzml"/>
164 <param name="input" value="MTBLS79_mzml_triplicates.zip" ftype="zip"/>
165 </conditional>
166 <param name="filelist" value="filelist_mzml_triplicates.txt" ftype="tsv" />
167 <param name="function_noise" value="median" />
168 <param name="snr_threshold" value="10.0" />
169 <section name="mults">
170 <param name="ppm" value="2.0" />
171 <param name="min_scans" value="1" />
172 <param name="min_fraction" value="0.5" />
173 <param name="rsd_threshold" value="0" />
174 </section>
175 <param name="delimiter" value="tab" />
176 <output name="hdf5_file_out" value="pls.h5" ftype="h5" compare="sim_size"/>
177 <output name="report" value="report_pls_01.xt" ftype="txt"/> <!-- NOTE(review): the ".xt" extension looks like a typo for ".txt"; confirm against the test-data directory before renaming. -->
178 <output_collection name="peaklists_txt" type="list">
179 <element name="batch04_QC17_rep01_262" file="batch04_QC17_rep01_262.txt" ftype="tsv"/>
180 <element name="batch04_QC17_rep02_263" file="batch04_QC17_rep02_263.txt" ftype="tsv"/>
181 <element name="batch04_QC17_rep03_264" file="batch04_QC17_rep03_264.txt" ftype="tsv"/>
182 </output_collection>
183 </test>
184 <test>
185 <conditional name="data">
186 <param name="type" value="mzml"/>
187 <param name="input" value="batch_04_QC18_mzml_triplicate.zip" ftype="zip"/>
188 </conditional>
189 <param name="function_noise" value="median" />
190 <param name="snr_threshold" value="10.0" />
191 <section name="mults">
192 <param name="ppm" value="2.0" />
193 <param name="min_scans" value="1" />
194 <param name="min_fraction" value="0.8" />
195 <param name="rsd_threshold" value="20.0" />
196 </section>
197 <param name="delimiter" value="tab" />
198 <output name="hdf5_file_out" value="pls_QC18.h5" ftype="h5" compare="sim_size"/>
199 <output name="report" value="report_pls_02.xt" ftype="txt"/> <!-- NOTE(review): the ".xt" extension looks like a typo for ".txt"; confirm against the test-data directory before renaming. -->
200 <output_collection name="peaklists_txt" type="list">
201 <element name="batch04_QC18_rep01_280" file="batch04_QC18_rep01_280.txt" ftype="tsv"/>
202 <element name="batch04_QC18_rep02_281" file="batch04_QC18_rep02_281.txt" ftype="tsv"/>
203 <element name="batch04_QC18_rep03_282" file="batch04_QC18_rep03_282.txt" ftype="tsv"/>
204 </output_collection>
205 </test>
206 </tests>
207 <help>
208
209
210 Process Scans (and SIM stitch)
211 ==============================
212
213 ..
214
215 ----------------
216
217 Description
218 -----------
219
220 Standard DIMS processing workflow: **Process Scans** -> [Replicate Filter] -> Align Samples -> [Missing values sample filter] -> Blank Filter -> Sample Filter -> [Missing values sample filter] -> Pre-processing -> Statistics
221
222 This tool is used to generate a single mass spectral peaklist for each of the data files defined in the ‘Filelist/Samplelist’. The tool extracts mass spectral peaks from a data file (in either .mzML or .RAW format) and then filters these in accordance with user-defined parameter settings. All peaks remaining after filtering are hierarchically clustered in one-dimension, during which pairs of peaks with similar m/z values are grouped together if the difference between their m/z values, when divided by the average of their m/z values and multiplied by 1 x 10\ :sup:`6` \, equates to less-than the user-defined ppm error tolerance.
223
224 **IMPORTANT:** when using .mzML files generated using the Proteowizard tool, SIM-type scans will only be treated as spectra if the ‘simAsSpectra’ filter was set to true during the conversion process, e.g.:
225
226 *msconvert.exe example.raw* **--simAsSpectra** *--64 --zlib --filter "peakPicking true 1-"*
227
228 -----------------
229
230
231 Parameters
232 ----------
233
234 ***.mzml or *.raw files** (REQUIRED) - use one of the following inputs:
235
236 * **Single or multiple .mzML or .raw file**
237
238 * **Data collection** - use this option if .mzml or .raw files are contained within a Galaxy dataset collection. Dataset collections may be generated within the Galaxy environment.
239
240 * **Zip file** from history - use this option if you have uploaded a \*.zip directory containing \*.mzML files (.raw files are not supported).
241
242
243 **Filelist / Samplelist** (HIGHLY RECOMMENDED) - a table containing **filename** and **classLabel** information for each experimental sample. These column headers MUST be included in the first row of the table.
244
245 For a standard DIMS experiment, users are advised to also include the following additional columns in order to ensure their data remains compatible with future versions of the dimspy processing pipeline:
246
247 * **injectionOrder** - integer values ranging from 1 to i, where i is the total number of independent infusions performed as part of a DIMS experiment. e.g. if a study included 20 samples, each of which was injected as four independent replicates, there would be at least 20 * 4 injections, so i = 80 and the range for injection order would be from 1 to 80 in steps of 1.
248
249 * **replicate** - integer value from 1 to r, indicating the order in which technical replicates of each study sample were injected in to the mass spectrometer, e.g. if study samples were analysed in quadruplicate, r = 4 and integer values are accordingly 1, 2, 3, 4.
250
251 * **batch** - integer value from 1 to b, where b corresponds to the total number of batches analysed under defined analysis conditions, for any given experiment. e.g. : if 4 independent plates of polar extracts were analysed in the positive ionisation mode, then valid values for batch are 1, 2, 3 and 4.
252
253 * **NOTE**: for DIMS experiments, “batch” is synonymous with plate, i.e. each independent plate analysed under a given analytical configuration may be considered an individual “batch”.
254
255 This file:
256
257 * must be uploaded to (or be accessible to) the active Galaxy history in order to allow for its selection in the Filelist / Samplelist drop-down menu. The file list / sample list may be created in .txt format, however, when imported in to the active Galaxy history, users must ensure to select ‘.tabular’ format.
258
259 * may include additional columns, e.g. additional metadata relating to study samples. Ensure that columns names do not conflict with existing column names.
260
261 |
262
263 @example_filelist@
264
265 |
266
267 **Function to calculate the noise from each scan** (REQUIRED; default = **median**) - toggle requiring selection of one option from the drop-down menu to indicate the preferred algorithm to apply for spectral noise calculation. The following options are available:
268
269 * **Median** - the median of all peak intensities within a given file is used as the noise value. This simplistic approach to estimating noise may be suitable for spectra with many low abundant features, but it is generally not recommended for use when spectra contain relatively few low-abundant peaks e.g. MS2 spectra.
270
271 * **Mean** - the unweighted mean average of all peak intensities within a given file is used as the noise value. This simplistic approach to estimating noise may be suitable for spectra with many low abundant features, but it is generally not recommended for use when spectra contain relatively few low-abundant peaks e.g. MS2 spectra.
272
273 * **Mean absolute deviation (MAD)** - the noise value is set as the mean of the absolute differences between peak intensities and the mean peak intensity (calculated across all peak intensities within a given file).
274
275 * **Xcalibur** - the noise value is calculated using the proprietary algorithms contained in Thermo Fisher Scientific’s reader library. This option should only be applied when you are processing .RAW files.
276
277 |
278
279 **Signal-to-noise ratio (SNR) threshold** (REQUIRED; default = 3.0) - a numerical value from 0 upwards.
280
281 Peaks with a signal-to-noise ratio (SNR) less-than or equal-to this value will be removed from the output peaklist. In the comprehensive peaklist output (.tsv-formatted), peaks with a SNR below the user-defined threshold will have a ‘0’ in the ‘snr-flag’ column, which indicates that they should be ignored in downstream processing procedures. Peaks with a SNR exceeding the user-defined cutoff will have a ‘1’ in the ‘snr-flag’ column.
282
283 |
284
285 **Filter specific scan windows or scan events?** (OPTIONAL; default = **No**) - a boolean toggle where:
286
287 * **No** - do not perform scan event filtering;
288
289 * **Yes** - filter specific scan events
290
291 * when selected, users must specify whether to 'Exclude' or 'Include' specific scan events. This can be useful if, for example, a user wishes to run the Process Scans tool on only a subset of scan types collected in each file. e.g. some SIM stitch acquisitions may be initiated with an initial 30 second stabilisation period, during which full-scan data are acquired. This full-scan data can be excluded from further consideration by using the ‘exclude’ toggle.
292
293 * Included or excluded scan events must be fully defined by the user, else ALL scan events will be included. To do so:
294 * Click the '+ Description' button and insert the start and stop m/z values for the scan event to be included/excluded.
295 * Select the 'scan type' to be filtered. Options are: 'Full scan' or 'SIM scan'
296 * Click '+ Description' to 'Exclude/Include' an additional scan event.
297
298 |
299
300 **Show options for multiple scans** (OPTIONAL)
301
302 * **Minimum number of scans required for each m/z window or event within a raw/mzML data file** (default = 1) - A positive integer equal-to or greater-than 1 that specifies the number of times a given scan event must occur in a given file in order for this scan event to be included in downstream processing steps and in the output .tsv-formatted peaklist.
303
304 * **ppm error tolerance** (default = 2.0) - A positive numerical value equal-to or greater-than zero. This option impacts the clustering of peaks extracted from an input file. If the difference between the mass-to-charge ratios of two peaks, when divided by the average of their mass-to-charge ratios and then multiplied by 1 × 10\ :sup:`6`, is equal-to or less-than this user-defined value, then these peaks are clustered together as a single peak. Clustering is applied across all replicates of a given scan event type i.e. with a given input file, all peaks detected in the three replicates of a 50-400 m/z scan event would undergo assessment for the need for clustering.
305
306 * **Minimum fraction (i.e. percentage; default = 0, i.e. skip) of scans a peak has to be present in** - A numerical value from 0 to 1 that specifies the minimum proportion of scans a given mass spectral peak must be detected in, in order for it to be kept in the output peaklist. Here, scans refers to replicates of the same scan event type, i.e. if set to 0.33, then a peak would need to be detected in at least 1 of the 3 replicates of a given scan event type. The ppm error specified by the user will significantly impact which peaks fulfil this criteria.
307
308 * **Relative standard deviation threshold** (default = 0, i.e. skip) - A numerical value equal-to or greater-than 0. If greater than 0, then peaks whose intensity values have a percent relative standard deviation (otherwise termed the percent coefficient of variation) greater-than this value are excluded from the output peaklist.
309
310 |
311
312 **Show advanced options** (OPTIONAL)
313
314 * **Skip SIM-stitching** (REQUIRED; default = **No**) - a boolean toggle where:
315
316 * **No** - perform SIM stitching
317
318 * **Yes** - skip the processing step where (SIM) windows are 'stitched' or 'joined' together. Use this option if you would like to process individual scan/SIM windows (events/ranges) without 'stitching' them.
319
320 * **Remove m/z range(s)** (OPTIONAL) - this option allows for specific regions of the output peak matrices to be deleted by the user - this option may be useful for removing sections of a spectrum known to correspond to system noise peaks.
321
322 * **Start m/z of removal range** - a positive numerical value corresponding to the lowest m/z value in the spectral region to be removed.
323
324 * **End m/z of removal range** - a positive numerical value corresponding to the highest m/z value in the spectral region to be removed (must be greater than the ‘start m/z of removal range’).
325
326 * **Relative intensity threshold used to remove ringing artefacts** (OPTIONAL) - Fourier transform-based mass spectra often contain peaks (ringing artefacts) around spectral features arising from detection of charged, gas-phase bio-molecules.
327
328 * A positive numerical value indicating the required relative intensity a peak must exceed (with reference to the largest peak in a cluster of peaks) in order to be retained.
329
330 ----------------------------------
331
332
333 Output file(s)
334 --------------
335
336 |
337
338 The Process scans (and SIM stitch) tool will output three file types:
339
340 1) **A HDF5 file** containing the processed peaklists
341
342 2) **A processed peaklist**, presented in tabular format, for each study sample specified in the filelist/samplelist. Each row corresponds to a single peak. Where multiple peaks were grouped together during the hierarchical clustering process, each peaklist metric constitutes an average of the groups’ values. Metrics included in the peaklist are:
343
344 @help_columns_peaklist@
345
346 @example_peaklist@
347
348 |
349
350 3) **A tabular “report” file** that details, for each scan event processed in each file:
351
352 * Scan range of scan event
353
354 * Scan number of scan event
355
356 * Number of peaks detected in scan event
357
358 * Median RSD of peaks detected in each scan event type (only applied if number of scans for a given scan event is <![CDATA[ > ]]> 1)
359
360 -----------------------------------
361
362 @github_developers_contributors@
363 @license@
364
365 RawFileReader reading tool. Copyright © 2016 by Thermo Fisher Scientific, Inc. All rights reserved. **Using this galaxy tool implies the acceptance of the RawFileReader** `license terms`_.
366
367 .. _`license terms`: https://github.com/computational-metabolomics/dimspy-galaxy/blob/master/tools/dimspy/RawFileReaderLicense.md
368
369 |
370 </help>
371
372 <expand macro="citations" />
373
374 </tool>
375