Mercurial > repos > recetox > ramclustr
changeset 5:10ded21d47c0 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/ramclustr commit 3d2821ffc97cc4f9287ee83bbddb306a8034daa0"
author | recetox |
---|---|
date | Fri, 11 Feb 2022 14:14:38 +0000 |
parents | 69e0da4703b5 |
children | 8d6f7543a56a |
files | macros.xml ramclustr.xml ramclustr_wrapper.R test-data/test1_metadata_xcms_1.txt test-data/test1_ramclustObj_xcms_1.rdata test-data/test2_metadata_xcms_2.txt test-data/test2_ramclustObj_xcms_2.rdata test-data/test3_metadata_csv_1.txt test-data/test3_ramclustObj_csv_1.rdata test-data/test4_metadata_csv_2.txt test-data/test4_ramclustObj_csv_2.rdata |
diffstat | 11 files changed, 243 insertions(+), 265 deletions(-) [+] |
line wrap: on
line diff
--- a/macros.xml Fri Feb 04 08:31:26 2022 +0000 +++ b/macros.xml Fri Feb 11 14:14:38 2022 +0000 @@ -32,69 +32,39 @@ <xml name="parameters_csv"> <section name="ms_csv" title="Input MS Data as CSV" expanded="true"> - <param label="ms" name="ms" type="data" format="csv" - help="Features as columns, rows as samples. Column header mz_rt"/> - <param label="idmsms" name="idmsms" type="data" format="csv" optional="true" - help="Optional idMSMS / MSe csv data. same dim and names as ms required"/> - <param label="sample_name_column" name="sample_name_column" type="integer" value="1" - help="Which column from the csv file contains sample names?"/> - <param label="feature_delimiter" name="feature_delimiter" type="text" value="_" - help="Only required if ms input is set! How feature mz and rt are delimited in csv import column header e.g. ='-'"/> - <param label="retention_time_column" name="retention_time_column" type="integer" value="2" - help="Which position in delimited column header represents the retention time (csv only)"/> - <param label="st" name="st" type="float" value="1" help="Sigma t - time similarity decay value. - A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> + <param label="Input CSV" name="ms" type="data" format="csv" + help="Features as columns, rows as samples. Column header in format mz_rt."/> + <param label="idMSMS" name="idmsms" type="data" format="csv" optional="true" + help="Optional idMSMS / MSe csv data. Same dimension and names as in input CSV are required."/> </section> </xml> - <xml name="parameters_excluded"> - <param label="MStag" name="MStag" type="text" optional="true" - help="Character string in 'taglocation' to designat MS / MSe files e.g. '01.cdf'"/> - <param label="idMSMStag" name="idMSMStag" type="text" optional="true" - help="Character string in 'taglocation' to designat idMSMS / MSe files e.g. '02.cdf'"/> - <param label="taglocation" name="taglocation" type="text" value="filepaths" - help="'filepaths' by default, 'phenoData[,1]' is another option. refers to xcms slot"/> + <xml name="parameters_xcms"> + <section name="xcms" title="Input MS Data as XCMS" expanded="true"> + <param name="input_xcms" label="Input XCMS" type="data" format="rdata.xcms.fillpeaks" + help="Grouped feature data for clustering." /> + <param label="Preserve phenotype" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" + help="Transfer phenotype data from XCMS object to Spec abundance file."/> + </section> </xml> <xml name="parameters_required"> - <section name="required" title="Required Parameters" expanded="true"> - <param label="sr" name="sr" type="float" value="0.5" help="Sigma r - correlational similarity decay value"/> - <param label="deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" - help="Controls how agressively the HCA tree is cut - see ?cutreeDynamicTree"/> - <param label="blocksize" name="blocksize" type="integer" value="2000" - help="Number of features (scans?) processed in one block =1000,"/> - <param label="mult" name="mult" type="integer" value="5" - help="Internal value, can be used to influence processing speed/ram usage"/> - <param label="hmax" name="hmax" type="float" value="0.3" - help="Precut the tree at this height, default 0.3 - see ?cutreeDynamicTree"/> - <param label="collapse" name="collapse" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" - help="Reduce feature intensities to spectrum intensities?"/> - <param label="usePheno" name="usePheno" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" - help="Transfer phenotype data from XCMS object to SpecAbund dataset?"/> - <!-- - Currently not forwarded because the MSP is exported always manually afterwards - <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" /> - --> - <param label="normalize" name="normalize" type="select" display="radio" - help="Either 'none', 'TIC', 'quantile', or 'batch.qc' normalization of feature intensities. see batch.qc overview in details. "> - <option value="none" selected="true">none</option> - <option value="TIC">TIC</option> - <option value="quantile">quantile</option> - <option value="batch.qc">batch.qc</option> - </param> - <param label="qc_inj_range" name="qc_inj_range" type="integer" value="20" - help="How many injections around each injection are to be scanned for presence of QC samples when - using batch.qc normalization? A good rule of thumb is between 1 and 3 times the typical - injection span between QC injections. i.e. if you inject QC ever 7 samples, set this to - between 7 and 21. Smaller values provide more local precision but make normalization sensitive - to individual poor outliers (though these are first removed using the boxplot function outlier - detection), while wider values provide less local precision in normalization but better - stability to individual peak areas."/> + <param label="Sigma r" name="sr" type="float" value="0.5" help="Correlational similarity between features."/> + <param label="Correlation method" name="cor_method" type="select" display="radio" + help="Choose correlational method to be used - see [1] for details."> + <option value="pearson" selected="true">pearson</option> + <option value="everything">everything</option> + <option value="spearman">spearman</option> + <option value="kendall">kendall</option> + </param> + <param label="Maximum RT difference" name="maxt" value="60" type="float" + help="Maximum difference to calculate RT similarity - values beyond this are assigned zero similarity."/> + </xml> - <param label="minModuleSize" name="minModuleSize" type="integer" value="2" - help="How many features must be part of a cluster to be returned? default = 2"/> - <param label="linkage" name="linkage" type="select" display="radio" value="average" - help="Hierarchical clustering linkage method - see ?hclust"> + <xml name="main_parameters"> + <section name="clustering" title="Clustering" expanded="true"> + <param label="Clustering linkage method" name="linkage" type="select" display="radio" + help="Choose hierarchical clustering linkage method - see [2] for details."> <option value="average" selected="true">average</option> <option value="ward.D">ward.D</option> <option value="ward.D2">ward.D2</option> @@ -104,68 +74,82 @@ <option value="median">median</option> <option value="centroid">centroid</option> </param> + <param label="Minimal cluster size" name="minModuleSize" type="integer" value="2" + help="Minimal size (number of features) of a cluster."/> + <param label="Maximal tree height" name="hmax" type="float" value="0.3" + help="Cut the Hierarchical Cluster Analysis tree at this height, see [3] for details."/> + <param label="Use deepSplit" name="deepSplit" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" + help="Check to produce more smaller clusters, uncheck for fewer bigger clusters, see [3] for details."/> + </section> - <param label="mzdec" name="mzdec" type="integer" value="3" - help="Number of decimal places used in printing m/z values"/> - <param label="cor_method" name="cor_method" type="select" display="radio" value="pearson" - help="Which correlational method used to calculate 'r' - see ?cor"> - <option value="pearson" selected="true">pearson</option> - <option value="everything">everything</option> - <option value="spearman">spearman</option> - <option value="kendall">kendall</option> - </param> + <section name="normalisation" title="Normalisation" expanded="true"> + <conditional name="normalisation_method"> + <param label="Normalisation method" name="normalize" type="select" display="radio" + help="Choose method for normalization of feature intensities."> + <option value="none" selected="true">none</option> + <option value="TIC">TIC</option> + <option value="quantile">quantile</option> + <option value="batch.qc">batch.qc</option> + </param> + <when value="batch.qc"> + <param label="Metadata details" name="batch_order_qc" type="data" format="csv" optional="true" + help="CSV with sample names (or indices, currently not handled) on rows and columns with: + batch number ('batch'), position in sequence ('order'), and whether it is a QC sample or not + ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/> + <param label="QC injection range" name="qc_inj_range" type="integer" value="20" + help="How many injections around each injection are to be scanned for presence of QC samples? + A good rule of thumb is between 1 and 3 times the typical + injection span between QC injections. i.e. if you inject QC ever 7 samples, set this to + between 7 and 21. Smaller values provide more local precision but make normalization sensitive + to individual poor outliers (though these are first removed using the boxplot function outlier + detection), while wider values provide less local precision in normalization but better + stability to individual peak areas."/> + </when> + </conditional> + </section> - <param label="rt_only_low_n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" + <section name="performance" title="Performance"> + <param label="Blocksize" name="blocksize" type="integer" value="2000" + help="Number of features processed in one block."/> + <param label="Blocksize factor" name="mult" type="integer" value="5" + help="Factor to scale blocksize to influence processing speed."/> + </section> + + <section name="msp_output_details" title="MSP output"> + <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" + checked="true" help="Merge all MSP in one file or export one MSP per spectra."/> + <param label="m/z decimal places" name="mzdec" type="integer" value="6" + help="Number of decimal places used in printing m/z values."/> + <!-- + Currently not forwarded because the MSP is exported always manually afterwards + <param label="mspout" name="mspout" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="write msp formatted spectra to file?" /> + --> + </section> + + <section name="extras" title="Extras"> + <param label="RT only low n" name="rt_only_low_n" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="At low injection numbers, correlational relationships of peak intensities may be unreliable. - By defualt ramclustR will simply ignore the correlational r value and cluster on retention time alone. + By default, RAMClustR will simply ignore the correlational Sigma r value and cluster on retention time alone. If you wish to use correlation with at n less than 5, set this value to FALSE."/> - <param label="replace_zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" + <param label="Replace zeros" name="replace_zeros" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" help="NA, NaN, and Inf values are replaced with zero, and zero values are sometimes returned from peak peaking. When TRUE, zero values will be replaced with a small amount of noise, with noise level - set based on the detected signal intensities for that feature. "/> - <param label="Merge MSP Files" name="merge_msp" type="boolean" truevalue="TRUE" falsevalue="FALSE" - checked="true" help="Whether to merge all msp in one file or export one msp per spectra"/> - </section> - </xml> - - <xml name="parameters_optional_xcms"> - <section name="optional" title="Optional Parameters" expanded="false"> - <param label="st" name="st" type="float" optional="true" help="Sigma t - time similarity decay value. - A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> - <param label="fftempdir" name="fftempdir" type="text" optional="true" - help="Valid path: if there are file size limitations on the default ff pacakge temp directory - - getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/> - <param label="maxt" name="maxt" type="integer" optional="true" - help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/> - </section> - </xml> - - <xml name="parameters_optional_csv"> - <section name="optional" title="Optional Parameters" expanded="false"> - <param label="fftempdir" name="fftempdir" type="text" optional="true" - help="Valid path: if there are file size limitations on the default ff pacakge temp directory - - getOptions('fftempdir') - you can change the directory used as the fftempdir with this option."/> - <param label="maxt" name="maxt" type="integer" optional="true" - help="Maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero"/> - </section> - </xml> - - <xml name="parameters_optional_metadata"> - <section name="metadata" title="Optional Metadata" expanded="false"> - <param label="metadata" name="batch_order_qc" type="data" format="csv" optional="true" - help="CSV with sample names (or indices, currently not handled) on rows and columns with: batch - number ('batch'), position in sequence ('order') and whether it is a qc sample or not - ('qc' with true/false OR 'sampleType' with 'sample/qc/blank')."/> - <param label="Experimental design" name="ExpDes" type="data" format="csv" optional="true" help="Definition of experimental design in CSV format." /> + set based on the detected signal intensities for that feature."/> + <param label="Experimental design metadata" name="ExpDes" type="data" format="csv" optional="true" + help="Definition of experimental design in CSV format." /> </section> </xml> <xml name="output_msp"> - <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra" type="list"> - <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/> - </collection> + <collection label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_collection" type="list"> + <discover_datasets pattern="__name_and_ext__" directory="spectra" recurse="true" ext="msp"/> + <filter>not msp_output_details['merge_msp']</filter> + </collection> + <data label="Mass spectra from ${tool.name} on ${on_string}" name="mass_spectra_merged" format="msp"> + <filter>msp_output_details['merge_msp']</filter> + </data> </xml> <xml name="citations"> @@ -234,7 +218,22 @@ (2) feature names that contain the mass and retention times, separated by a constant delimiter; and (3) features in columns and samples in rows. + +----------------------+-------------------+-------------------+--------------------+--------------------+ + | sample | 100.88_262.464 | 100.01_423.699 | 100.003_128.313 | 100.0057_154.686 | + +======================+===================+===================+====================+====================+ + | 10_qc_16x_dil_milliq | 0 | 195953.6376 | 0 | 0 | + +----------------------+-------------------+-------------------+--------------------+--------------------+ + | 11_qc_8x_dil_milliq | 0 | 117742.1828 | 4247300.664 | 0 | + +----------------------+-------------------+-------------------+--------------------+--------------------+ + | 12_qc_32x_dil_milliq | 4470859.38 | 0 | 2206092.112 | 0 | + +----------------------+-------------------+-------------------+--------------------+--------------------+ + | 15_qc_16x_dil_milliq | 0 | 0 | 2767477.481 | 0 | + +----------------------+-------------------+-------------------+--------------------+--------------------+ + + Downstream Tools + The output is a msp file or a collection of msp files, with additional Spec Abundance file. + +---------+--------------+----------------------+ | Name | Output File | Format | +=========+==============+======================+ @@ -266,16 +265,16 @@ RAMClustR approach RAMClustR was designed to group features designed from the same compound using an approach which is - __1.__ unsupervised, __2.__ platform agnosic, and __3.__ devoid of curated rules, as the depth of - understanding of these processes is insufficent to enable accurate curation/prediction of all phenomenon - that may occur. We acheive this by making two assumptions. The first is that two features derived + **1.** unsupervised, **2.** platform agnostic, and **3.** devoid of curated rules, as the depth of + understanding of these processes is insufficient to enable accurate curation/prediction of all phenomenon + that may occur. We achieve this by making two assumptions. The first is that two features derived from the same compound with have (approximately) the same retention time. The second is that two features derived from the same compound will have (approximately) the same quantitative trend across all samples in the xcms sample set. From these assumptions, we can calculate a retention time similarity score and a correlational similarity score for each feature pair. A high similarity score for both retention time and correlation indicates a strong probability that two features derive from the same compound. Since both conditions must be met, the product of the two similarity scores provides - the best approximatio of the total similarity score - i.e. a feature pair with retention time similarity + the best approximation of the total similarity score - i.e. a feature pair with retention time similarity of 1 and correlational similarity of 0 is unlikely to derive from one compound - 1 x 0 = 0, the final similarity score is zero, indicating the two features represent two different compounds. Similarly, a feature pair with retention time similarity of 0 and correlational similarity of 1 is unlikely to derive @@ -283,11 +282,11 @@ correlational similarity of 1 is likely to derive from one compound - 1 x 1 = 1. The RAMClustR algorithm is built on creating similarity scores for all pairs of features, submitting - this score matrix for heirarchical clustering, and then cutting the resulting dendrogram into neat + this score matrix for hierarchical clustering, and then cutting the resulting dendrogram into neat chunks using the dynamicTreeCut package - where each 'chunk' of the dendrogram results in a group of - features likely to be derived from a single compound. Importantly, this is acheived without looking for + features likely to be derived from a single compound. Importantly, this is achieved without looking for specific phenomenon (i.e. sodiation), meaning that grouping can be performed on any dataset, whether it - is poisitive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex + is positive or negative ionization mode, EI or ESI, LC-MS GC-MS or CE-MS, in-source fragment or complex adduction event, and predictable or unpredictable signals. </token>
--- a/ramclustr.xml Fri Feb 04 08:31:26 2022 +0000 +++ b/ramclustr.xml Fri Feb 11 14:14:38 2022 +0000 @@ -20,51 +20,49 @@ store_output( #if $filetype.type_choice == "xcms": ramclustr_xcms( - input_xcms = "$filetype.input_xcms", + input_xcms = "$filetype.xcms.input_xcms", + use_pheno = $filetype.xcms.usePheno, #else: ramclustr_csv( - ms="$filetype.ms_csv.ms", - idmsms="$filetype.ms_csv.idmsms", - feature_delimiter="$filetype.ms_csv.feature_delimiter", - sample_name_column = $filetype.ms_csv.sample_name_column, - retention_time_column= $filetype.ms_csv.retention_time_column, + ms = "$filetype.ms_csv.ms", + idmsms = "$filetype.ms_csv.idmsms", #end if sr = $filetype.required.sr, - deep_split = $filetype.required.deepSplit, - block_size = $filetype.required.blocksize, - mult = $filetype.required.mult, - hmax = $filetype.required.hmax, - collapse = $filetype.required.collapse, - use_pheno = $filetype.required.usePheno, - qc_inj_range = $filetype.required.qc_inj_range, - normalize = "$filetype.required.normalize", - min_module_size = $filetype.required.minModuleSize, - linkage = "$filetype.required.linkage", - mzdec = $filetype.required.mzdec, + #if $filetype.type_choice == "xcms": + #if $filetype.required.st + st = $filetype.required.st, + #end if + #else: + st = $filetype.required.st, + #end if cor_method = "$filetype.required.cor_method", - rt_only_low_n = $filetype.required.rt_only_low_n, - replace_zeros = $filetype.required.replace_zeros, - #if $filetype.type_choice == "xcms": - #if $filetype.optional.st - st = $filetype.optional.st, + maxt = $filetype.required.maxt, + linkage = "$clustering.linkage", + min_module_size = $clustering.minModuleSize, + hmax = $clustering.hmax, + deep_split = "$clustering.deepSplit", + normalize = "$normalisation.normalisation_method.normalize", + #if "$normalisation.normalisation_method.normalize" == "batch.qc": + metadata_file = "$normalisation.normalisation_method.batch_order_qc", + qc_inj_range = $normalisation.normalisation_method.qc_inj_range, #end if - #else: - st = $filetype.ms_csv.st, - #end if - #if $filetype.optional.maxt - maxt = $filetype.optional.maxt, - #end if - #if $filetype.optional.fftempdir - fftempdir = $filetype.optional.fftempdir, - #end if - #if $filetype.metadata.batch_order_qc - metadata_file = "${filetype.metadata.batch_order_qc}", - #end if - #if $filetype.metadata.ExpDes - exp_design = "${filetype.metadata.ExpDes}" + block_size = $performance.blocksize, + mult = $performance.mult, + mzdec = $msp_output_details.mzdec, + rt_only_low_n = $extras.rt_only_low_n, + replace_zeros = $extras.replace_zeros, + #if $extras.ExpDes: + exp_design = "${$extras.ExpDes}" #end if ), - "$result", "$method_metadata", $filetype.required.merge_msp, "$spec_abundance") + $msp_output_details.merge_msp, + "$spec_abundance", + #if $msp_output_details.merge_msp: + "$mass_spectra_merged" + #else: + NULL + #end if + ) </configfile> </configfiles> <inputs> @@ -74,80 +72,102 @@ <option value="csv">CSV</option> </param> <when value="xcms"> - <param name="input_xcms" label="input_xcms" type="data" format="rdata.xcms.fillpeaks" help="Grouped feature data for clustering by ramclustR" /> - <expand macro="parameters_required" /> - <expand macro="parameters_optional_xcms" /> - <expand macro="parameters_optional_metadata" /> + <expand macro="parameters_xcms" /> + <section name="required" title="General parameters" expanded="true"> + <param label="Sigma t" name="st" type="float" optional="true" help="Retention time similarity (optional). + A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> + <expand macro="parameters_required" /> + </section> </when> <when value="csv"> <expand macro="parameters_csv" /> - <expand macro="parameters_required" /> - <expand macro="parameters_optional_csv" /> - <expand macro="parameters_optional_metadata" /> + <section name="required" title="General parameters" expanded="true"> + <param label="Sigma t" name="st" type="float" value="1" help="Retention time similarity. + A recommended starting point is half the value of your average chromatographic peak width at half max (seconds))."/> + <expand macro="parameters_required" /> + </section> </when> </conditional> + <expand macro="main_parameters" /> </inputs> <outputs> - <data label="${tool.name} on ${on_string}" name="result" format="RData" /> <data label="Spec Abundance of ${on_string}" name="spec_abundance" format="csv" /> - <data label="Metadata for ${tool.name} on ${on_string}" name="method_metadata" format="txt"/> <expand macro="output_msp"/> </outputs> <tests> <test><!-- TEST 1 --> - <param name="type_choice" value="xcms"/> - <param name="input_xcms" value="test1_xcmsObj_1.rdata.xcms.fillpeaks" ftype="rdata.xcms.fillpeaks"/> - <param name="ExpDes" value="lc-ramclustr-define-experiment.csv" ftype="csv"/> - <output name="result" file="test1_ramclustObj_xcms_1.rdata" ftype="RData" compare="sim_size" delta="200"/> - <output_collection name="mass_spectra" type="list"> - <element name="experiment_lc" file="test1_fill_xcms_1.msp" ftype="msp"/> - </output_collection> - <output name="method_metadata" file="test1_metadata_xcms_1.txt" ftype="txt"/> + <section name="filetype"> + <param name="type_choice" value="xcms"/> + <section name="xcms"> + <param name="input_xcms" value="test1_xcmsObj_1.rdata.xcms.fillpeaks" ftype="rdata.xcms.fillpeaks"/> + </section> + <section name="required"> + <param name="maxt" value="259.8"/> + </section> + </section> + <section name="extras"> + <param name="ExpDes" value="lc-ramclustr-define-experiment.csv" ftype="csv"/> + </section> + <output name="mass_spectra_merged" file="test1_fill_xcms_1.msp" ftype="msp"/> <output name="spec_abundance" file="test1_spec_abundance_xcms_1.csv" ftype="csv" compare="sim_size" delta="100"/> </test> <test><!-- TEST 2 --> - <param name="type_choice" value="xcms"/> - <param name="input_xcms" value="test2_xcmsObj_2.rdata.xcms.fillpeaks" ftype="rdata.xcms.fillpeaks"/> - <param name="batch_order_qc" value="test2_sample_metadata_xcms_2.csv" ftype="csv" /> - <output name="result" file="test2_ramclustObj_xcms_2.rdata" ftype="RData" compare="sim_size" delta="200"/> - <output_collection name="mass_spectra" type="list"> - <element name="fill" file="test2_fill_xcms_2.msp" ftype="msp" compare="diff" lines_diff="10"/> - </output_collection> - <output name="method_metadata" file="test2_metadata_xcms_2.txt" ftype="txt"/> + <section name="filetype"> + <param name="type_choice" value="xcms"/> + <section name="xcms"> + <param name="input_xcms" value="test2_xcmsObj_2.rdata.xcms.fillpeaks" ftype="rdata.xcms.fillpeaks"/> + </section> + <section name="required"> + <param name="maxt" value="78.4"/> + </section> + </section> + <section name="normalisation"> + <section name="normalisation_method"> + <param name="batch_order_qc" value="test2_sample_metadata_xcms_2.csv" ftype="csv" /> + </section> + </section> + <output name="mass_spectra_merged" file="test2_fill_xcms_2.msp" ftype="msp" compare="diff" lines_diff="10"/> <output name="spec_abundance" file="test2_spec_abundance_xcms_2.csv" ftype="csv" compare="sim_size" delta="100"/> </test> <test><!-- TEST 3 --> - <param name="type_choice" value="csv"/> - <param name="ms" value="test3_csv_test-input_1_2.csv" ftype="csv"/> - <param name="st" value="5.0"/> - <param name="blocksize" value="1000"/> - <param name="mult" value="1"/> - <param name="maxt" value="1"/> - <output name="result" file="test3_ramclustObj_csv_1.rdata" ftype="RData" compare="sim_size" delta="200"/> + <section name="filetype"> + <param name="type_choice" value="csv"/> + <section name="ms_csv"> + <param name="ms" value="test3_csv_test-input_1_2.csv" ftype="csv"/> + </section> + <section name="required"> + <param name="st" value="5.0"/> + <param name="maxt" value="1"/> + </section> + </section> + <section name="performance"> + <param name="blocksize" value="1000"/> + <param name="mult" value="1"/> + </section> + <output name="mass_spectra_merged" file="test3_spectra_csv_1.msp" ftype="msp"/> <output name="spec_abundance" file="test3_spec_abundance_csv_1.csv" ftype="csv"/> - <output name="method_metadata" file="test3_metadata_csv_1.txt" ftype="txt"/> - <output_collection name="mass_spectra" type="list"> - <element name="fill" file="test3_spectra_csv_1.msp" ftype="msp"/> - </output_collection> </test> <test><!-- TEST 4 --> - <param name="type_choice" value="csv"/> - <param name="ms" value="test3_csv_test-input_1_2.csv" ftype="csv"/> - <param name="batch_order_qc" value="test4_sample_metadata_csv_2.csv" ftype="csv" /> - <output name="result" file="test4_ramclustObj_csv_2.rdata" ftype="RData" compare="sim_size" delta="200"/> - <output name="spec_abundance" file="test4_spec_abundance_csv_2.csv" ftype="csv"/> - <output name="method_metadata" file="test4_metadata_csv_2.txt" ftype="txt"/> - <output_collection name="mass_spectra" type="list"> - <element name="fill" file="test4_spectra_csv_2.msp" ftype="msp" lines_diff="10"/> - </output_collection> + <section name="filetype"> + <param name="type_choice" value="csv"/> + <section name="ms_csv"> + <param name="ms" value="test3_csv_test-input_1_2.csv" ftype="csv"/> + </section> + </section> + <output name="mass_spectra_merged" file="test4_spectra_csv_2.msp" ftype="msp" lines_diff="10"/> </test> </tests> <help> <![CDATA[ @HELP@ + + .. rubric:: **Footnotes** + .. [1] Correlation, Variance and Covariance - `stats::cor <https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/cor>`_ + .. [2] Hierarchical Clustering - `stats::hclust <https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/hclust>`_ + .. [3] Dynamic Dendrogram Pruning Based on Dendrogram Only - `dynamicTreeCut::cutreeDynamicTree <https://www.rdocumentation.org/packages/dynamicTreeCut/versions/1.63-1/topics/cutreeDynamicTree>`_ ]]> </help>
--- a/ramclustr_wrapper.R Fri Feb 04 08:31:26 2022 +0000 +++ b/ramclustr_wrapper.R Fri Feb 11 14:14:38 2022 +0000 @@ -1,13 +1,16 @@ store_output <- function( ramclustr_obj, - output_filename, - output_method_metadata, output_merge_msp, - output_spec_abundance) { - save(ramclustr_obj, file = output_filename) - RAMClustR::write.methods(ramclustr_obj, output_method_metadata) + output_spec_abundance, + msp_file) { RAMClustR::write.msp(ramclustr_obj, one.file = output_merge_msp) write.csv(ramclustr_obj$SpecAbund, file = output_spec_abundance, row.names = TRUE) + + if (!is.null(msp_file)) { + exp.name <- ramclustr_obj$ExpDes[[1]][which(row.names(ramclustr_obj$ExpDes[[1]]) == "Experiment"), 1] + filename <- paste("spectra/", exp.name, ".msp", sep = "") + file.copy(from = filename, to = msp_file, overwrite = TRUE) + } } load_experiment_definition <- function(filename) { @@ -35,25 +38,23 @@ ramclustr_xcms <- function( input_xcms, + use_pheno, sr, + st = NULL, + cor_method, + maxt, + linkage, + min_module_size, + hmax, deep_split, + normalize, + metadata_file = NULL, + qc_inj_range, block_size, mult, - hmax, - collapse, - use_pheno, - qc_inj_range, - normalize, - min_module_size, - linkage, mzdec, - cor_method, rt_only_low_n, replace_zeros, - st = NULL, - maxt = NULL, - fftempdir = NULL, - metadata_file = NULL, exp_design = NULL ) { obj <- load(input_xcms) @@ -84,7 +85,6 @@ blocksize = block_size, mult = mult, hmax = hmax, - collapse = collapse, usePheno = use_pheno, mspout = FALSE, qc.inj.range = qc_inj_range, @@ -94,7 +94,7 @@ mzdec = mzdec, cor.method = cor_method, rt.only.low.n = rt_only_low_n, - fftempdir = fftempdir, + fftempdir = NULL, replace.zeros = replace_zeros, batch = batch, order = order, @@ -107,28 +107,22 @@ ramclustr_csv <- function( ms, idmsms, - sample_name_column, - feature_delimiter, - retention_time_column, sr, + st, + cor_method, + maxt, + linkage, + min_module_size, + hmax, deep_split, + normalize, + metadata_file = NULL, + qc_inj_range, block_size, mult, - hmax, - collapse, - use_pheno, - qc_inj_range, - normalize, - min_module_size, - linkage, mzdec, - cor_method, rt_only_low_n, replace_zeros, - st = NULL, - maxt = NULL, - fftempdir = NULL, - metadata_file = NULL, exp_design = NULL ) { if (!file.exists(idmsms)) @@ -154,9 +148,6 @@ x <- RAMClustR::ramclustR( ms = ms, idmsms = idmsms, - featdelim = feature_delimiter, - timepos = retention_time_column, - sampNameCol = sample_name_column, st = st, maxt = maxt, sr = sr, @@ -164,8 +155,6 @@ blocksize = block_size, mult = mult, hmax = hmax, - collapse = collapse, - usePheno = use_pheno, mspout = FALSE, qc.inj.range = qc_inj_range, normalize = normalize, @@ -174,7 +163,7 @@ mzdec = mzdec, cor.method = cor_method, rt.only.low.n = rt_only_low_n, - fftempdir = fftempdir, + fftempdir = NULL, replace.zeros = replace_zeros, batch = batch, order = order,
--- a/test-data/test1_metadata_xcms_1.txt Fri Feb 04 08:31:26 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -Raw mass spectrometry data were processed using an R based workflow for feature detection, retention time alignment, feature grouping, peak filling, feature clustering. XCMS(v.3.14.0)was used for feature detection and retention time alighment. Processing was performed using R(v.R Core Team 2021). Feature data was input as an xcms object with ramclustR parameter settings of st = 12.99 sr = 0.5 and maxt = 259.8.RAMClustR (version 1.2.2) was utilized to cluster features into spectra (Broeckling 2014). The feature similarity matrix was clustered using fastcluster package heirarchical clustering method using the average method. The dendrogram was cut using the cutreeDynamicTree function from the dynamicTreeCut package. Cutting parameters were set to minModuleSize = 2, hmax = 0.3, and deepSplit = FALSE. - - 1041 features were collapsed into 174 spectra. - -(Broeckling 2014): Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014. 86(14):6812-7. - -R Core Team: R Core Team (2021). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria, https://www.R-project.org/. - -R Core Team (2021). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria, https://www.R-project.org/. \ No newline at end of file
--- a/test-data/test2_metadata_xcms_2.txt Fri Feb 04 08:31:26 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -Raw mass spectrometry data were processed using an R based workflow for feature detection, retention time alignment, feature grouping, peak filling, feature clustering. XCMS(v.3.14.0)was used for feature detection and retention time alighment. Processing was performed using R(v.R Core Team 2021). Feature data was input as an xcms object with ramclustR parameter settings of st = 3.92 sr = 0.5 and maxt = 78.4.RAMClustR (version 1.2.2) was utilized to cluster features into spectra (Broeckling 2014). The feature similarity matrix was clustered using fastcluster package heirarchical clustering method using the average method. The dendrogram was cut using the cutreeDynamicTree function from the dynamicTreeCut package. Cutting parameters were set to minModuleSize = 2, hmax = 0.3, and deepSplit = FALSE. - - 5881 features were collapsed into 949 spectra. - -(Broeckling 2014): Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014. 86(14):6812-7. - -R Core Team: R Core Team (2021). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria, https://www.R-project.org/. - -R Core Team (2021). R: A Language and Environment for Statistical Computing. R Foundation for Statistical Computing, Vienna, Austria, https://www.R-project.org/. \ No newline at end of file
--- a/test-data/test3_metadata_csv_1.txt Fri Feb 04 08:31:26 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -Raw mass spectrometry data were processed using an R based workflow for feature detection, retention time alignment, feature grouping, peak filling, feature clustering. Feature data was input as .csv files with ramclustR parameter settings of st = 5 sr = 0.5 and maxt = 1.RAMClustR (version 1.2.2) was utilized to cluster features into spectra (Broeckling 2014). The feature similarity matrix was clustered using fastcluster package heirarchical clustering method using the average method. The dendrogram was cut using the cutreeDynamicTree function from the dynamicTreeCut package. Cutting parameters were set to minModuleSize = 2, hmax = 0.3, and deepSplit = FALSE. - - 203 features were collapsed into 22 spectra. Since there were fewer than five injections, clustering was performed only using retention time simiilarity. - -(Broeckling 2014): Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014. 86(14):6812-7. -
--- a/test-data/test4_metadata_csv_2.txt Fri Feb 04 08:31:26 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -Raw mass spectrometry data were processed using an R based workflow for feature detection, retention time alignment, feature grouping, peak filling, feature clustering. Feature data was input as .csv files with ramclustR parameter settings of st = 1 sr = 0.5 and maxt = 60.RAMClustR (version 1.2.2) was utilized to cluster features into spectra (Broeckling 2014). The feature similarity matrix was clustered using fastcluster package heirarchical clustering method using the average method. The dendrogram was cut using the cutreeDynamicTree function from the dynamicTreeCut package. Cutting parameters were set to minModuleSize = 2, hmax = 0.3, and deepSplit = FALSE. - - 203 features were collapsed into 38 spectra. Since there were fewer than five injections, clustering was performed only using retention time simiilarity. - -(Broeckling 2014): Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014. 86(14):6812-7. -