comparison mqppep_anova.xml @ 26:5b8e15b2a67c draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit e0b80550743f634282b4b4348b75e6f172dc1488
author eschen42
date Wed, 26 Oct 2022 23:48:51 +0000
parents 3911581e639a
children 8ff2c287ff1c
comparison
equal deleted inserted replaced
25:f9cd87ac8006 26:5b8e15b2a67c
5 profile="21.05" 5 profile="21.05"
6 > 6 >
7 <description>Runs ANOVA and KSEA for phosphopeptides.</description> 7 <description>Runs ANOVA and KSEA for phosphopeptides.</description>
8 <macros> 8 <macros>
9 <import>macros.xml</import> 9 <import>macros.xml</import>
10 <xml name="group_matching_parm">
11 <param name="group_filter_mode" type="select"
12 help="Regular expression matching mode 'fixed', 'perl', or 'grep' with option for case insensitivity. See https://rdrr.io/r/base/grep.html"
13 label="Sample-group matching mode"
14 >
15 <option value="r" selected="true">ERE ("extended regular expressions")</option>
16 <option value="ri"> - ERE, case insensitive</option>
17 <option value="p">PCRE ("PERL-compatible regular expressions")</option>
18 <option value="pi"> - PCRE, case insensitive</option>
19 <option value="f">fixed strings ("no regular expressions")</option>
20 <option value="fi"> - fixed strings, case insensitive</option>
21 </param>
22 <param name="group_filter_patterns" type="text" value="\.+"
23 help="Comma-separated list of regular expressions matching group-names"
24 label="Sample-group matching pattern">
25 <sanitizer>
26 <valid initial="string.printable">
27 <remove value="&apos;"/>
28 </valid>
29 </sanitizer>
30 </param>
31 </xml>
10 </macros> 32 </macros>
11 <edam_topics> 33 <edam_topics>
12 <edam_topic>topic_0121</edam_topic><!-- proteomics --> 34 <edam_topic>topic_0121</edam_topic><!-- proteomics -->
13 <edam_topic>topic_3520</edam_topic><!-- proteomics experiment--> 35 <edam_topic>topic_3520</edam_topic><!-- proteomics experiment-->
14 </edam_topics> 36 </edam_topics>
26 The weird invocation used here is because knitr and install_tinytex 48 The weird invocation used here is because knitr and install_tinytex
27 both need access to a writeable directory, but most directories in a 49 both need access to a writeable directory, but most directories in a
28 biocontainer are read-only, so this builds a pseudo-home under /tmp 50 biocontainer are read-only, so this builds a pseudo-home under /tmp
29 --> 51 -->
30 <command detect_errors="exit_code"><![CDATA[ 52 <command detect_errors="exit_code"><![CDATA[
53 (printenv | sort) &&
31 cp '$__tool_directory__/mqppep_anova_script.Rmd' . && 54 cp '$__tool_directory__/mqppep_anova_script.Rmd' . &&
32 cp '$__tool_directory__/mqppep_anova.R' . && 55 cp '$__tool_directory__/mqppep_anova.R' . &&
56 cp '$__tool_directory__/kinase_name_uniprot_lut.tabular.bz2' . &&
57 cp '$__tool_directory__/kinase_uniprot_description_lut.tabular.bz2' . &&
58 cp '$__tool_directory__/mqppep_anova_preamble.tex' . &&
59 cp '$__tool_directory__/perpage.tex' . &&
60 cp '$__tool_directory__/KSEA_impl_flowchart.pdf' . &&
33 Rscript mqppep_anova.R 61 Rscript mqppep_anova.R
34 --inputFile '$input_file' 62 --inputFile '$input_file'
35 --alphaFile '$alpha_file' 63 --alphaFile '$alpha_file'
36 --preproc_sqlite '$preproc_sqlite' 64 --preproc_sqlite '$preproc_sqlite'
37 --firstDataColumn $intensity_column_regex_f 65 --firstDataColumn '$intensity_column_regex_f'
38 --imputationMethod $imputation.imputation_method 66 --imputationMethod $imputation.imputation_method
39 #if $imputation.imputation_method == "random" 67 #if $imputation.imputation_method == "random"
40 --meanPercentile '$imputation.meanPercentile' 68 --meanPercentile '$imputation.meanPercentile'
41 --sdPercentile '$imputation.sdPercentile' 69 --sdPercentile '$imputation.sdPercentile'
42 #end if 70 #end if
43 --regexSampleNames $sample_names_regex_f 71 --regexSampleNames '$sample_names_regex_f'
44 --regexSampleGrouping $sample_grouping_regex_f 72 --regexSampleGrouping '$sample_grouping_regex_f'
45 --imputedDataFile $imputed_data_file 73 #if $group_filter.group_filter_method == "none"
74 --sampleGroupFilter 'none'
75 #else
76 --sampleGroupFilter '$group_filter.group_filter_method'
77 --sampleGroupFilterPatterns '$group_filter_patterns_f'
78 --sampleGroupFilterMode '$group_filter.group_filter_mode'
79 #end if
80 --intensityMinValuesPerClass '$intnsty_min_vals_per_smpl_grp'
81 --imputedDataFile '$imputed_data_file'
46 --imputedQNLTDataFile '$imp_qn_lt_file' 82 --imputedQNLTDataFile '$imp_qn_lt_file'
47 --ksea_sqlite '$ksea_sqlite' 83 --ksea_sqlite '$ksea_sqlite'
84 --kseaMinSubstrateCount '$ksea_min_substrate_count'
48 --ksea_cutoff_threshold '$ksea_cutoff_threshold' 85 --ksea_cutoff_threshold '$ksea_cutoff_threshold'
49 --ksea_cutoff_statistic 'FDR' 86 --ksea_cutoff_statistic 'FDR'
87 --kseaUseAbsoluteLog2FC '$ksea_use_absolute_log2_fc'
88 --minQuality '$ksea_min_quality'
89 --anova_ksea_metadata '$anova_ksea_metadata'
50 --reportFile '$report_file' 90 --reportFile '$report_file'
51 --anova_ksea_metadata '$anova_ksea_metadata'
52 ]]></command> 91 ]]></command>
92 <!--
93 -->
53 <configfiles> 94 <configfiles>
54 <configfile name="sample_names_regex_f"> 95 <configfile name="sample_names_regex_f">
55 $sample_names_regex 96 $sample_names_regex
56 </configfile> 97 </configfile>
57 <configfile name="sample_grouping_regex_f"> 98 <configfile name="sample_grouping_regex_f">
58 $sample_grouping_regex 99 $sample_grouping_regex
59 </configfile> 100 </configfile>
101 <configfile name="group_filter_patterns_f">
102 #if $group_filter.group_filter_method != "none"
103 $group_filter.group_filter_patterns
104 #end if
105 </configfile>
60 <configfile name="intensity_column_regex_f"> 106 <configfile name="intensity_column_regex_f">
61 $intensity_column_regex 107 $intensity_column_regex
62 </configfile> 108 </configfile>
63 </configfiles> 109 </configfiles>
64 <inputs> 110 <inputs>
65 <param name="input_file" type="data" format="tabular" label="Filtered Phosphopeptide Intensities" 111 <!--
66 help="Phosphopeptide intensities filtered for minimal quality. First column label 'Phosphopeptide'; sample-intensities must begin in column 10 and must have column labels to match argument [sample_names_regex]" 112 needed inputs:
67 /> 113 - # should filters be used to identify sample-groups to be included or excluded
68 <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level" 114 sampleGroupFilter: !r c("none", "exclude", "include")[3]
115 - # what patterns should be used to match sample-groups
116 # (extracted by regexSampleGrouping) when determining sample-groups
117 # that should be included or excluded
118 sampleGroupFilterPatterns: ".*CR,N.*"
119 - # minimum number of observed values per class
120 intensityMinPerClass: 0
121 - # what should be the primary criterion to eliminate excessive heatmap rows
122 intensityHeatmapCriteria: !r c("quality", "na_count", "p_value")[1]
123 suggested or advanced inputs:
124 - kinaseNameUprtLutBz2: "./kinase_name_uniprot_lut.tabular.bz2"
125 - kinaseUprtDescLutBz2: "./kinase_uniprot_description_lut.tabular.bz2"
126 -->
127 <param name="input_file" type="data" format="tabular" label="Filtered phosphopeptide intensities (tabular)"
128 help="'preproc_tab' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool"
129 />
130 <param name="alpha_file" type="data" format="tabular" label="ANOVA alpha cutoff level (tabular)"
69 help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header" 131 help="ANOVA alpha cutoff values for significance testing: tabular data having one column and no header"
70 /> 132 />
71 <param name="preproc_sqlite" type="data" format="sqlite" label="preproc_sqlite dataset from mqppep_preproc" 133 <param name="preproc_sqlite" type="data" format="sqlite" label="Database from mqppep_preproc (sqlite)"
72 help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool" 134 help="'preproc_sqlite' dataset produced by 'MaxQuant Phosphopeptide Preprocessing' tool"
73 /> 135 />
74 <param name="intensity_column_regex" type="text" value="^Intensity[^_]" 136 <param name="intensity_column_regex" type="text" value="^Intensity[^_]"
75 label="Intensity-column pattern" 137 label="Intensity-column pattern"
76 help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)" 138 help="Pattern matching columns that have peptide intensity data (PERL-compatible regular expression matching column label)"
77 /> 139 />
78 <!-- imputation_method <- c("group-median","median","mean","random")[1] --> 140 <!-- imputation_method <- c("group-median","median","mean","random")[1] -->
79 <conditional name="imputation"> 141 <conditional name="imputation">
80 <param name="imputation_method" type="select" label="Imputation method" 142 <param name="imputation_method" type="select" label="Imputation method"
81 help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same std. dev. as across all samples (with mean specified by [meanPercentile])" 143 help="Impute missing values by (1) using median for each sample-group; (2) using median across all samples; (3) using mean across all samples; or (4) using randomly generated values having same SD as across all samples (with mean specified by 'Mean percentile for random values')"
82 > 144 >
83 <option value="random" selected="true">random</option> 145 <option value="random" selected="true">random</option>
84 <option value="group-median">group-median</option> 146 <option value="group-median">group-median</option>
85 <option value="median">median</option> 147 <option value="median">median</option>
86 <option value="mean">mean</option> 148 <option value="mean">mean</option>
91 <when value="random"> 153 <when value="random">
92 <param name="meanPercentile" type="integer" value="1" min="1" max="99" 154 <param name="meanPercentile" type="integer" value="1" min="1" max="99"
93 label="Mean percentile for random values" 155 label="Mean percentile for random values"
94 help="Percentile center of random values; range [1,99]" 156 help="Percentile center of random values; range [1,99]"
95 /> 157 />
96 <param name="sdPercentile" type="float" value="1.0" 158 <param name="sdPercentile" type="float" value="1"
97 label="Percentile std. dev. for random values" 159 label="Percentile SD for random values"
98 help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD equal to the SD for the entire data set.)" 160 help="Standard deviation adjustment-factor for random values; real number. (1.0 means SD of random values equal to the SD for the entire data set.)"
99 /> 161 />
100 </when> 162 </when>
101 </conditional> 163 </conditional>
102 <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$" 164 <param name="sample_names_regex" type="text" value="\.\d+[A-Z]$"
103 help="Pattern extracting sample-names from names of columns that have peptide intensity data (PERL-compatible regular expression)" 165 help="Pattern extracting sample-names from names of columns of 'Filtered phosphopeptide intensities' that have peptide intensity data (PERL-compatible regular expression)"
104 label="Sample-extraction pattern"> 166 label="Sample-name extraction pattern">
105 <sanitizer> 167 <sanitizer>
106 <valid initial="string.printable"> 168 <valid initial="string.printable">
107 <remove value="&apos;"/> 169 <remove value="&apos;"/>
108 </valid> 170 </valid>
109 </sanitizer> 171 </sanitizer>
110 </param> 172 </param>
111 <param name="sample_grouping_regex" type="text" value="\d+" 173 <param name="sample_grouping_regex" type="text" value="\d+"
112 help="Pattern extracting sample-group from the sample-names that are extracted by 'Sample-extraction pattern' (PERL-compatible regular expression)" 174 help="Pattern extracting sample-group from the extracted sample-names (PERL-compatible regular expression)"
113 label="Group-extraction pattern"> 175 label="Sample-group extraction pattern">
114 <sanitizer> 176 <sanitizer>
115 <valid initial="string.printable"> 177 <valid initial="string.printable">
116 <remove value="&apos;"/> 178 <remove value="&apos;"/>
117 </valid> 179 </valid>
118 </sanitizer> 180 </sanitizer>
119 </param> 181 </param>
182 <param name="intnsty_min_vals_per_smpl_grp" type="integer" value="1" min="0"
183 label="Minimum number of values per sample-group"
184 help="Only consider as comparable those intensities having at least this number of values in each sample-group (range [0,&#8734;])"
185 />
186 <conditional name="group_filter">
187 <param name="group_filter_method" type="select" label="Filter sample-groups"
188 help="What filter should be applied to sample-group names? (1) 'none', no filter; (2) 'include', match is required; (3) 'exclude', match is forbidden."
189 >
190 <option value="none" selected="true">none</option>
191 <option value="include">include</option>
192 <option value="exclude">exclude</option>
193 </param>
194 <when value="none" />
195 <when value="include">
196 <expand macro="group_matching_parm"/>
197 </when>
198 <when value="exclude">
199 <expand macro="group_matching_parm"/>
200 </when>
201 </conditional>
202 <param name="ksea_min_substrate_count" type="integer" value="1" min="1"
203 label="Minimum number of kinase-substrates for KSEA"
204 help="Minimum number of substrates to consider any kinase for KSEA (range [1,&#8734;])"
205 />
120 <param name="ksea_cutoff_threshold" type="float" value="0.05" 206 <param name="ksea_cutoff_threshold" type="float" value="0.05"
121 label="KSEA threshold level" 207 label="KSEA threshold level"
122 help="Maximum FDR to be used to score a kinase enrichment as significant" 208 help="Maximum FDR to be used to score a kinase enrichment as significant; see warning against setting this too low in help text below."
209 />
210 <param name="ksea_use_absolute_log2_fc"
211 type="boolean"
212 label="Use abs(log2(fold-change)) for KSEA"
213 help="Should log2(fold-change) be used for KSEA? (Checking this may alter (possibly reduce) the number of hits.)"
214 checked="false"
215 truevalue="TRUE"
216 falsevalue="FALSE"
217 />
218 <param name="ksea_min_quality" type="integer" value="0" min="0"
219 label="Minimum quality of substrates for KSEA"
220 help="Minimum 'quality' of substrates to be considered for KSEA (range [0,&#8734;]); higher numbers reduce the number of substrates considered - see help text below."
123 /> 221 />
124 </inputs> 222 </inputs>
125 <outputs> 223 <outputs>
126 <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" ></data> 224 <!-- earlier outputs will appear lower in the history list; therefore, put report at the top -->
127 <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" ></data> 225 <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite" />
128 <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" ></data> 226 <data name="anova_ksea_metadata" format="tabular" label="${input_file.name}.${imputation.imputation_method}-anova_ksea_metadata" />
129 <!-- 227 <data name="imputed_data_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_intensities" />
130 <data name="report_file" format="html" label="${input_file.name}.${imputation.imputation_method}-imputed_report (download/unzip to view)" ></data> 228 <data name="imp_qn_lt_file" format="tabular" label="${input_file.name}.${imputation.imputation_method}-imputed_QN_LT_intensities" />
131 --> 229 <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" />
132 <data name="report_file" format="pdf" label="${input_file.name}.${imputation.imputation_method}-imputed_report" ></data>
133 <data name="ksea_sqlite" format="sqlite" label="${input_file.name}..${imputation.imputation_method}-imputed_ksea_sqlite">
134 </data>
135 </outputs> 230 </outputs>
136 <tests> 231 <tests>
137 <test> 232 <test><!-- test #1 -->
138 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 233 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
139 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 234 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
140 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 235 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
141 <param name="intensity_column_regex" value="^Intensity[^_]"/> 236 <param name="intensity_column_regex" value="^Intensity[^_]"/>
142 <param name="imputation_method" value="median"/> 237 <param name="imputation_method" value="median"/>
154 <output name="imp_qn_lt_file"> 249 <output name="imp_qn_lt_file">
155 <assert_contents> 250 <assert_contents>
156 <has_text text="Phosphopeptide" /> 251 <has_text text="Phosphopeptide" />
157 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 252 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
158 <!-- missing missing observed missing observed observed --> 253 <!-- missing missing observed missing observed observed -->
159 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.088909" /> 254 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.962256.*6.908828.*6.814580.*6.865411.*6.908828.*7.093748" />
160 255
161 <has_text text="pSQKQEEENPAEETGEEK" /> 256 <has_text text="pSQKQEEENPAEETGEEK" />
162 </assert_contents> 257 </assert_contents>
163 </output> 258 </output>
164 </test> 259 </test>
165 <test> 260 <test><!-- test #2 -->
166 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 261 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
167 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 262 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
168 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 263 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
169 <param name="intensity_column_regex" value="^Intensity[^_]"/> 264 <param name="intensity_column_regex" value="^Intensity[^_]"/>
170 <param name="imputation_method" value="mean"/> 265 <param name="imputation_method" value="mean"/>
266 <!--
267 <param name="meanPercentile" value="1"/>
268 <param name="sdPercentile" value="1"/>
269 -->
171 <param name="sample_names_regex" value="\.\d+[A-Z]$"/> 270 <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
172 <param name="sample_grouping_regex" value="\d+"/> 271 <param name="sample_grouping_regex" value="\d+"/>
272 <param name="intnsty_min_vals_per_smpl_grp" value="1"/>
273 <param name="group_filter_method" value="none"/>
274 <!--
275 <param name="group_filter_mode" value="r"/>
276 <param name="group_filter_patterns" value="\.+"/>
277 -->
278 <param name="ksea_min_substrate_count" value="1"/>
279 <param name="ksea_cutoff_threshold" value="0.5"/>
173 <output name="imputed_data_file"> 280 <output name="imputed_data_file">
174 <assert_contents> 281 <assert_contents>
175 <has_text text="Phosphopeptide" /> 282 <has_text text="Phosphopeptide" />
176 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 283 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
177 <!-- missing missing observd missing observd observd --> 284 <!-- missing missing observd missing observd observd -->
182 <output name="imp_qn_lt_file"> 289 <output name="imp_qn_lt_file">
183 <assert_contents> 290 <assert_contents>
184 <has_text text="Phosphopeptide" /> 291 <has_text text="Phosphopeptide" />
185 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 292 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
186 <!-- missing missing observed missing observed observed --> 293 <!-- missing missing observed missing observed observed -->
187 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.092451" /> 294 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.839850.*6.797424.*6.797424.*6.797424.*6.896609.*7.097251" />
188 </assert_contents> 295 </assert_contents>
189 </output> 296 </output>
190 </test> 297 </test>
191 <test> 298 <test><!-- test #3 -->
192 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 299 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
193 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 300 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
194 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 301 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
195 <param name="intensity_column_regex" value="^Intensity[^_]"/> 302 <param name="intensity_column_regex" value="^Intensity[^_]"/>
196 <param name="imputation_method" value="group-median"/> 303 <param name="imputation_method" value="group-median"/>
304 <!--
305 <param name="meanPercentile" value="1"/>
306 <param name="sdPercentile" value="1"/>
307 -->
197 <param name="sample_names_regex" value="\.\d+[A-Z]$"/> 308 <param name="sample_names_regex" value="\.\d+[A-Z]$"/>
198 <param name="sample_grouping_regex" value="\d+"/> 309 <param name="sample_grouping_regex" value="\d+"/>
310 <param name="intnsty_min_vals_per_smpl_grp" value="1"/>
311 <param name="group_filter_method" value="none"/>
312 <!--
313 <param name="group_filter_mode" value="r"/>
314 <param name="group_filter_patterns" value="\.+"/>
315 -->
316 <param name="ksea_min_substrate_count" value="1"/>
317 <param name="ksea_cutoff_threshold" value="0.5"/>
199 <output name="imputed_data_file"> 318 <output name="imputed_data_file">
200 <assert_contents> 319 <assert_contents>
201 <has_text text="Phosphopeptide" /> 320 <has_text text="Phosphopeptide" />
202 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 321 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
203 <!-- missing missing observd missing observd observd --> 322 <!-- missing missing observd missing observd observd -->
212 <!-- missing missing observed missing observed observed --> 331 <!-- missing missing observed missing observed observed -->
213 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" /> 332 <has_text_matching expression="pSQKQEEENPAEETGEEK.*6.946112.*6.888985.*6.792137.*6.792137.*6.888985.*7.089555" />
214 </assert_contents> 333 </assert_contents>
215 </output> 334 </output>
216 </test> 335 </test>
217 <test> 336 <test><!-- test #4 -->
218 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/> 337 <param name="input_file" ftype="tabular" value="test_input_for_anova.tabular"/>
219 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/> 338 <param name="preproc_sqlite" ftype="sqlite" value="test_input_for_anova.sqlite"/>
220 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/> 339 <param name="alpha_file" ftype="tabular" value="alpha_levels.tabular"/>
221 <param name="intensity_column_regex" value="^Intensity[^_]"/> 340 <param name="intensity_column_regex" value="^Intensity[^_]"/>
222 <param name="imputation_method" value="random"/> 341 <param name="imputation_method" value="random"/>
235 </output> 354 </output>
236 <output name="imp_qn_lt_file"> 355 <output name="imp_qn_lt_file">
237 <assert_contents> 356 <assert_contents>
238 <has_text text="Phosphopeptide" /> 357 <has_text text="Phosphopeptide" />
239 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" /> 358 <has_text text="AAAITDMADLEELSRLpSPLPPGpSPGSAAR" />
240 <has_text text="5.409549" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B --> 359 <has_text text="5.522821" /> <!-- log-transformed value for pTYVDPFTpYEDPNQAVR .1B -->
241 <has_text text="6.464714" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A --> 360 <has_text text="6.638251" /> <!-- log-transformed value for pSQKQEEENPAEETGEEK .2A -->
242 </assert_contents> 361 </assert_contents>
243 </output> 362 </output>
244 </test> 363 </test>
245 </tests> 364 </tests>
246 <help><![CDATA[ 365 <help><![CDATA[
247 ==================================================== 366 ====================================================
248 Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA 367 Phopsphoproteomic Enrichment Pipeline ANOVA and KSEA
249 ==================================================== 368 ====================================================
250 369
251 **Input files** 370 **Overview**
252 371 ============
253 ``Filtered Phosphopeptide Intensities`` 372
373 Perform statistical analysis of preprocessed MaxQuant output data collected as described in `[Cheng, 2018] <https://doi.org/10.3791/57996>`_.
374
375 - Extracts sample-group IDs from sample names.
376 - Imputes missing values.
377 - Performs ANOVA analysis for each phosphopeptide.
378 - Performs Kinase-Substrate Enrichment Analysis (KSEA) using the method described by `Casado et al. (2013) <doi:10.1126/scisignal.2003573>`_; see *"Algorithms"* section below.
379
380 **Workflow position**
381 =====================
382
383 Upstream tool
384 The "MaxQuant Phosphopeptide Preprocessing" tool (``mqppep_preproc``) that transforms MaxQuant output for phosphoproteome-enriched samples into a form suitable for statistical analysis.
385
386 **Input datasets**
387 ==================
388
389 ``Filtered phosphopeptide intensities`` (tabular)
254 Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format). 390 Phosphopeptides annotated with SwissProt and phosphosite metadata (in tabular format).
255 This is the output from the "Phopsphoproteomic Enrichment Pipeline Merge and Filter" 391 This is the output from the "MaxQuant Phosphopeptide Preprocessing"
256 (``mqppep_mrgflt``) tool. 392 (``mqppep_preproc``) tool.
257 393
258 ``ANOVA alpha cutoff level`` 394 - First column label 'Phosphopeptide'.
395 - Sample-intensities must begin in first column matching 'Intensity-column pattern' and must have column labels to match argument 'Sample-name extraction pattern'.
396
397 ``ANOVA alpha cutoff level`` (tabular)
259 List of alpha cutoff values for significance testing; text file having one column and no header. For example: 398 List of alpha cutoff values for significance testing; text file having one column and no header. For example:
260 399
261 :: 400 ::
262 401
263 0.2 402 0.2
264 0.1 403 0.1
265 0.05 404 0.05
266 405
406 ``Database from mqppep_preproc`` (sqlite)
407 SQLite database produced by the "MaxQuant Phosphopeptide Preprocessing"
408 (``mqppep_preproc``) tool.
409
267 **Input parameters** 410 **Input parameters**
411 ====================
268 412
269 ``Intensity-column pattern`` 413 ``Intensity-column pattern``
270 First column of ``input_file`` having intensity values (integer or PERL-compatible regular expression matching column label). Default: **Intensity** 414 First column of ``Filtered phosphopeptide intensities`` having intensity values (integer or PERL-compatible regular expression matching column label). Default::
415
416 ^Intensity[^_]
271 417
272 ``Imputation method`` 418 ``Imputation method``
273 Impute missing values by: 419 Impute missing values by:
274 420
275 1. ``group-median`` - use median for each sample-group; 421 1. ``group-median`` - use median for each sample-group;
276 2. ``mean`` - use mean across all samples; or 422 2. ``mean`` - use mean across all samples; or
277 3. ``median`` - use median across all samples; 423 3. ``median`` - use median across all samples;
278 4. ``random`` - use randomly generated values where: 424 4. ``random`` - use randomly generated values where:
279 425
280 - ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and 426 (i) ``Mean percentile for random values`` specifies the percentile among non-missing values to be used as mean of random values, and
281 - ``Percentile std. dev. for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values. 427 (ii) ``Percentile SD for random values`` specifies the factor to be multiplied by the standard deviation among the non-missing values (across all samples) to determine the standard deviation of random values.
282 428
283 ``Sample-extraction pattern`` 429 ``Sample-name extraction pattern``
284 PERL-compatible regular expression extracting the sample-name from the the name of a column of instensities (from ``input_file``) for one sample. 430 PERL-compatible regular expression extracting the sample-name from the name of a column of intensities (from ``Filtered phosphopeptide intensities``) for one sample.
285 431
286 - For example, ``"\.\d+[A-Z]$"`` applied to ``Intensity.splunge.10A`` would produce ``.10A`` 432 - For example, ``"\.\d+[A-Z]$"`` applied to "``Intensity.splunge.10A``" would produce "``.10A``".
287 - Note that *this is case sensitive* by default. 433 - Note that *this is case sensitive* by default.
288 434
289 ``Group-extraction pattern`` 435 ``Sample-group extraction pattern``
290 PERL-compatible regular expression extracting the sample-grouping from the sample-name that was extracted with ``sample_names_regex`` from a column of intensites (from ``input_file``). 436 PERL-compatible regular expression extracting the sample-grouping from the sample-name (that was in turn extracted with ``Sample-name extraction pattern`` from a column of intensities from ``Filtered phosphopeptide intensities``).
291 437
292 - For example, ``"\d+$"`` applied to ``.10A`` would produce ``10`` 438 - For example, ``"\d+$"`` applied to "``.10A``" would produce "``10``".
293 - Note that *this is case sensitive* by default. 439 - Note that *this is case sensitive* by default.
294 440
441 ``Minimum number of values per sample-group``
442 Sometimes you may wish to filter out the intensities that are poorly represented among some sample groups because they complicate the comparison process. You can use this parameter to specify the minimum number of values in any sample-group (range [0,]]>&#8734;<![CDATA[])
443
444 ``Filter sample-groups``
445 Sometimes you may have spectra that are for treatments that you are not considering for your comparison. You can specify a filter (or not) for sample-group names; if you do, you can specify whether groups that match your criteria should be excluded from the analysis ("forbidden") or included in the analysis ("required").
446
447 ``Sample-group matching mode``
448 The R `base::grep` function that is used here for pattern matching is exhaustively documented at https://rdrr.io/r/base/grep.html. There are two choices you make here. The first is whether to differentiate lowercase and uppercase characters. The second is whether to require exact matches ("fixed" pattern-matching mode) or to use "PERL-compatible regular expressions" ("perl") or "extended regular expressions" ("grep"). See https://rdrr.io/r/base/grep.html for further info.
449
450 ``Sample-group matching pattern``
451 This is a comma-separated list of patterns to match to group-names, according to the ``Sample-group matching mode`` that you have chosen.
452
453 ``Minimum number of kinase-substrates for KSEA``
454 For KSEA, you may decide that you wish to ignore kinases having fewer substrates than some minimum; specify that minimum here (range [1,]]>&#8734;<![CDATA[])
455
295 ``KSEA threshold level`` 456 ``KSEA threshold level``
296 Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of 0.05 is arbitrary. 457 Specifies minimum FDR at which a kinase will be considered to be enriched; the default choice of ``0.05`` is arbitrary and may exclude kinases that are interesting. The KSEA FDR perhaps should not be treated as conservatively as would be appropriate for hypothesis testing. For example, at an FDR of ``0.05``, for every ``20`` kinases that one discards, ``19`` are likely truly enriched.
458
459 ``Use abs(log2(fold-change)) for KSEA``
460 When TRUE, consider only the magnitude of the differences across the contrast for all of the substrates when aggregating them to assess the enrichment of a given kinase's substrates. When FALSE, also consider the direction. Surprisingly, setting this to TRUE may decrease the number of enriched kinases.
461
462 ``Minimum quality of substrates for KSEA``
463 An arbitrary "quality score" is assigned to each substrate, as described in the PDF report produced by the tool. This score takes into account both FDR-adjusted p-value and the number of missing values for each substrate. Setting the minimum to zero retains all substrates, which may be a large number.
297 464
298 **Outputs** 465 **Outputs**
299 466 ===========
300 ``imputed_intensities (input_file.imputation_method-imputed_intensities)`` 467
301 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format. 468 Report dataset
302 469 *[input file].[imputation method]*-``imputed_report``
303 ``imputed_QN_LT_intensities (input_file.imputation_method-imputed_QN_LT_intensities)`` 470
304 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format. 471 Summary report for normalization, imputation, and **ANOVA**, in PDF format.
305 472
306 ``report_file (input_file.imputation_method-imputed_report)`` 473 Imputed intensities
307 Summary report for normalization, imputation, and **ANOVA**, in PDF format. 474 *[input file].[imputation method]*-``imputed_intensities``
308 475
309 ``anova_ksea_metadata (input_file.imputation_method-imputed_anova_ksea_metadata)`` 476 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, in tabular format.
310 Phosphopeptide metadata including ANOVA significance and KSEA enrichments. 477
311 478 Imputed quantile-normalized log-transformed intensities
312 ``ksea_sqlite (input_file.imputation_method-imputed_ksea_sqlite)`` 479 *[input file].[imputation method]*-``imputed_QN_LT_intensities``
313 SQLite database for ad-hoc report creation. 480
481 Phosphopeptide MS intensities where missing values have been **imputed** by the chosen method, quantile-normalized (**QN**), and log10-transformed (**LT**), in tabular format.
482
483 ANOVA KSEA metadata
484 *[input file].[imputation method]*-``imputed_anova_ksea_metadata``
485 Phosphopeptide metadata including ANOVA significance and KSEA enrichments.
486
487 KSEA SQLite database
488 *[input file].[imputation method]*-``imputed_ksea_sqlite``
489 An SQLite database that is usable for *ad hoc* report creation.
314 490
315 **Algorithm** 491 **Algorithm**
316 492 =============
317 The KSEA algorithm used here is as in the KSEAapp package as reported in [Wiredja 2017]. 493
318 The code is adapted from "Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool. 494 The KSEA algorithm used here is as in the KSEAapp package as reported in `[Wiredja 2017] <https://doi.org/10.1093/bioinformatics/btx415>`_.
495 The code is adapted from `"Danica D. Wiredja (2017). KSEAapp: Kinase-Substrate Enrichment Analysis. R package version 0.99.0." <https://cran.r-project.org/package=KSEAapp>`_ to work with output from the "MaxQuant Phosphopeptide Preprocessing" Galaxy tool and the multiple kinase-substrate databases that the latter tool searches.
319 496
320 **Authors** 497 **Authors**
498 ===========
321 499
322 ``Larry C. Cheng`` 500 ``Larry C. Cheng``
323 (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script. 501 (`ORCiD 0000-0002-6922-6433 <https://orcid.org/0000-0002-6922-6433>`_) wrote the original script.
324 502
325 ``Arthur C. Eschenlauer`` 503 ``Arthur C. Eschenlauer``
335 <citations> 513 <citations>
336 <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 --> 514 <!-- Cheng_2018 "Phosphopeptide Enrichment ..." PMID: 30124664 -->
337 <citation type="doi">10.3791/57996</citation> 515 <citation type="doi">10.3791/57996</citation>
338 <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 --> 516 <!-- Wiredja_2017 "The KSEA App ..." PMID: 28655153 -->
339 <citation type="doi">10.1093/bioinformatics/btx415</citation> 517 <citation type="doi">10.1093/bioinformatics/btx415</citation>
518 <citation type="bibtex">@Manual{,
519 title = {KSEAapp: Kinase-Substrate Enrichment Analysis},
520 author = {Danica D. Wiredja},
521 year = {2017},
522 note = {R package version 0.99.0},
523 }</citation>
340 </citations> 524 </citations>
341 </tool> 525 </tool>