Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_anova_script.Rmd @ 22:61adb8801b73 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 0c7ca054e77e042c8a584c9903073da064df7d8b
author | eschen42 |
---|---|
date | Thu, 30 Jun 2022 16:16:32 +0000 |
parents | 2c5f1a2fe16a |
children | 8582a9797c18 |
comparison
equal
deleted
inserted
replaced
21:f3deca1a3c84 | 22:61adb8801b73 |
---|---|
1 --- | 1 --- |
2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" | 2 title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA" |
3 author: "Larry Cheng; Art Eschenlauer" | 3 author: |
4 date: "May 28, 2018; Mar 16, 2022" | 4 - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]" |
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | |
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | |
7 date: | |
8 - "May 28, 2018" | |
9 - "; revised June 23, 2022" | |
5 output: | 10 output: |
6 pdf_document: | 11 pdf_document: |
7 toc: true | 12 toc: true |
8 latex_document: | 13 toc_depth: 3 |
9 toc: true | 14 keep_tex: true |
15 header-includes: | |
16 - \usepackage{longtable} | |
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | |
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | |
10 params: | 19 params: |
11 alphaFile: "test-data/alpha_levels.tabular" | 20 alphaFile: "test-data/alpha_levels.tabular" |
12 inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" | 21 inputFile: "test-data/test_input_for_anova.tabular" |
13 firstDataColumn: "^Intensity[^_]" | 22 preprocDb: "test-data/test_input_for_anova.sqlite" |
14 imputationMethod: !r c("group-median", "median", "mean", "random")[4] | 23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] |
15 meanPercentile: 1 | 24 show_toc: true |
16 sdPercentile: 1.0 | 25 firstDataColumn: "^Intensity[^_]" |
17 regexSampleNames: "\\.\\d+[A-Z]$" | 26 imputationMethod: !r c("group-median", "median", "mean", "random")[1] |
18 regexSampleGrouping: "\\d+" | 27 meanPercentile: 1 |
19 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | 28 sdPercentile: 1.0 |
20 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | 29 regexSampleNames: "\\.\\d+[A-Z]$" |
21 show_toc: true | 30 regexSampleGrouping: "\\d+" |
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | |
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | |
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | |
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | |
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | |
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | |
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | |
38 kseaMinKinaseCount: 1 | |
39 intensityHeatmapRows: 75 | |
22 --- | 40 --- |
23 <!-- | 41 <!-- |
24 alphaFile: "test-data/alpha_levels.tabular" | 42 kseaCutoffStatistic: !r c("p.value", "FDR")[2] |
25 inputFile: "test-data/test_input_for_anova.tabular" | 43 kseaCutoffThreshold: !r c(0.05, 0.1)[1] |
26 inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" | 44 |
27 inputFile: "test-data/density_failure.preproc_tab.tabular" | 45 alphaFile: "test-data/alpha_levels.tabular" |
46 inputFile: "test-data/test_input_for_anova.tabular" | |
47 preprocDb: "test-data/test_input_for_anova.sqlite" | |
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
49 | |
50 alphaFile: "test-data/alpha_levels.tabular" | |
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
54 | |
55 alphaFile: "test-data/alpha_levels.tabular" | |
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | |
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | |
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
59 | |
60 alphaFile: "test-data/alpha_levels.tabular" | |
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | |
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | |
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
64 | |
65 inputFile: "test-data/density_failure.preproc_tab.tabular" | |
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | |
28 latex_document: default | 67 latex_document: default |
29 --> | 68 --> |
30 ```{r setup, include = FALSE} | 69 ```{r setup, include = FALSE} |
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging | |
71 options(tinytex.verbose = TRUE) | |
72 | |
31 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
32 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | 75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) |
33 | 76 |
34 # freeze the random number generator so the same results will be produced | 77 # freeze the random number generator so the same results will be produced |
35 # from run to run | 78 # from run to run |
36 set.seed(28571) | 79 set.seed(28571) |
80 | |
81 ### LIBRARIES | |
82 library(gplots) | |
83 library(DBI) | |
84 library(RSQLite) | |
85 # Suppress "Warning: no DISPLAY variable so Tk is not available" | |
86 suppressWarnings(suppressMessages(library(sqldf))) | |
87 | |
88 # required but not added to search list: | |
89 # - DBI | |
90 # - RSQLite | |
91 # - ggplot2 | |
92 # - knitr | |
93 # - latex2exp | |
94 # - preprocessCore | |
95 # - reshape2 | |
96 # - vioplot | |
37 | 97 |
38 ### CONSTANTS | 98 ### CONSTANTS |
39 | 99 |
40 const_parfin <- par("fin") | 100 const_parfin <- par("fin") |
41 const_boxplot_fill <- "grey94" | 101 const_boxplot_fill <- "grey94" |
42 const_stripchart_cex <- 0.5 | 102 const_stripchart_cex <- 0.5 |
43 const_stripsmall_cex <- | 103 const_stripsmall_cex <- |
44 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | 104 sqrt(const_stripchart_cex * const_stripchart_cex / 2) |
45 const_stripchart_jitter <- 0.3 | 105 const_stripchart_jitter <- 0.3 |
46 const_write_debug_files <- FALSE | 106 const_write_debug_files <- FALSE |
47 const_table_anchor <- "tbp" | 107 const_table_anchor_bp <- "bp" |
108 const_table_anchor_ht <- "ht" | |
109 const_table_anchor_p <- "p" | |
110 const_table_anchor_tbp <- "tbp" | |
111 | |
112 | |
113 const_ksea_astrsk_kinases <- 1 | |
114 const_ksea_nonastrsk_kinases <- 2 | |
115 const_ksea_all_kinases <- 3 | |
116 | |
117 const_log10_e <- log10(exp(1)) | |
48 | 118 |
49 ### FUNCTIONS | 119 ### FUNCTIONS |
50 | 120 |
51 #ANOVA filter function | 121 # from `demo(error.catching)` |
52 anova_func <- function(x, grouping_factor) { | 122 ##' Catch *and* save both errors and warnings, and in the case of |
53 x_aov <- aov(as.numeric(x) ~ grouping_factor) | 123 ##' a warning, also keep the computed result. |
54 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] | 124 ##' |
55 pvalue | 125 ##' @title tryCatch both warnings (with value) and errors |
56 } | 126 ##' @param expr an \R expression to evaluate |
127 ##' @return a list with 'value' and 'warning', where | |
128 ##' 'value' may be an error caught. | |
129 ##' @author Martin Maechler; | |
130 ##' Copyright (C) 2010-2012 The R Core Team | |
131 try_catch_w_e <- function(expr) { | |
132 wrn <- NULL | |
133 # warning handler | |
134 w_handler <- function(w) { | |
135 wrn <<- w | |
136 invokeRestart("muffleWarning") | |
137 } | |
138 list( | |
139 value = withCallingHandlers( | |
140 tryCatch( | |
141 expr, | |
142 error = function(e) e | |
143 ), | |
144 warning = w_handler | |
145 ), | |
146 warning = wrn | |
147 ) | |
148 } | |
149 | |
57 | 150 |
58 write_debug_file <- function(s) { | 151 write_debug_file <- function(s) { |
59 if (const_write_debug_files) { | 152 if (const_write_debug_files) { |
60 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | 153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) |
154 print(sprintf("DEBUG writing file %s", spath)) | |
61 write.table( | 155 write.table( |
62 s, | 156 s, |
63 file = s_path, | 157 file = s_path, |
64 sep = "\t", | 158 sep = "\t", |
65 col.names = TRUE, | 159 col.names = TRUE, |
67 quote = FALSE | 161 quote = FALSE |
68 ) | 162 ) |
69 } | 163 } |
70 } | 164 } |
71 | 165 |
72 latex_collapsed_vector <- function(collapse_string, v) { | 166 # ref: http://adv-r.had.co.nz/Environments.html |
167 # "When creating your own environment, note that you should set its parent | |
168 # environment to be the empty environment. This ensures you don't | |
169 # accidentally inherit objects from somewhere else." | |
170 # Caution: this prevents `with(my_env, expr)` from working when `expr` | |
171 # contains anything from the global environment, even operators! | |
172 # Hence, `x <- 1; get("x", new_env())` fails by design. | |
173 new_env <- function() { | |
174 new.env(parent = emptyenv()) | |
175 } | |
176 | |
177 ### numerical/statistical helper functions | |
178 | |
179 any_nan <- function(x) { | |
180 !any(x == "NaN") | |
181 } | |
182 | |
183 # determine standard deviation of quantile to impute | |
184 sd_finite <- function(x) { | |
185 ok <- is.finite(x) | |
186 sd(x[ok]) | |
187 } | |
188 | |
189 anova_func <- function(x, grouping_factor, one_way_f) { | |
190 subject <- data.frame( | |
191 intensity = x | |
192 ) | |
193 x_aov <- | |
194 one_way_f( | |
195 formula = intensity ~ grouping_factor, | |
196 data = subject | |
197 ) | |
198 pvalue <- | |
199 if (identical(one_way_f, aov)) | |
200 summary(x_aov)[[1]][["Pr(>F)"]][1] | |
201 else | |
202 pvalue <- x_aov$p.value | |
203 pvalue | |
204 } | |
205 | |
206 | |
207 ### LaTeX functions | |
208 | |
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
73 cat( | 211 cat( |
74 paste0( | 212 paste0( |
75 gsub("_", "\\\\_", v), | 213 v_sub, |
76 collapse = collapse_string | 214 collapse = collapse_string |
77 ) | 215 ) |
78 ) | 216 ) |
79 } | 217 } |
80 | 218 |
81 latex_itemized_collapsed <- function(collapse_string, v) { | 219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { |
82 cat("\\begin{itemize}\n\\item ") | 220 cat("\\begin{itemize}\n\\item ") |
83 latex_collapsed_vector(collapse_string, v) | 221 latex_collapsed_vector(collapse_string, v, underscore_whack) |
84 cat("\n\\end{itemize}\n") | 222 cat("\n\\end{itemize}\n") |
85 } | 223 } |
86 | 224 |
87 latex_itemized_list <- function(v) { | 225 latex_itemized_list <- function(v, underscore_whack = TRUE) { |
88 latex_itemized_collapsed("\n\\item ", v) | 226 latex_itemized_collapsed("\n\\item ", v, underscore_whack) |
89 } | 227 } |
90 | 228 |
91 latex_enumerated_collapsed <- function(collapse_string, v) { | 229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { |
92 cat("\\begin{enumerate}\n\\item ") | 230 cat("\\begin{enumerate}\n\\item ") |
93 latex_collapsed_vector(collapse_string, v) | 231 latex_collapsed_vector(collapse_string, v, underscore_whack) |
94 cat("\n\\end{enumerate}\n") | 232 cat("\n\\end{enumerate}\n") |
95 } | 233 } |
96 | 234 |
97 latex_enumerated_list <- function(v) { | 235 latex_enumerated_list <- function(v) { |
98 latex_enumerated_collapsed("\n\\item ", v) | 236 latex_enumerated_collapsed("\n\\item ", v) |
99 } | 237 } |
100 | 238 |
101 latex_table_row <- function(v) { | 239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { |
102 latex_collapsed_vector(" & ", v) | 240 latex_collapsed_vector(" & ", v, underscore_whack) |
241 cat(extra) | |
103 cat(" \\\\\n") | 242 cat(" \\\\\n") |
104 } | 243 } |
105 | 244 |
106 # Use this like print.data.frame, from which it is adapted: | 245 # Use this like print.data.frame, from which it is adapted: |
107 data_frame_latex <- | 246 data_frame_latex <- |
116 max = NULL, | 255 max = NULL, |
117 # string with justification of each column | 256 # string with justification of each column |
118 justification = NULL, | 257 justification = NULL, |
119 # TRUE to center on page | 258 # TRUE to center on page |
120 centered = TRUE, | 259 centered = TRUE, |
121 # optional capttion | 260 # optional caption |
122 caption = NULL, | 261 caption = NULL, |
123 # h(inline); b(bottom); t (top) or p (separate page) | 262 # h(inline); b(bottom); t (top) or p (separate page) |
124 anchor = "h" | 263 anchor = "h", |
264 # set underscore_whack to TRUE to escape underscores | |
265 underscore_whack = TRUE | |
125 ) { | 266 ) { |
126 if (is.null(justification)) | 267 if (is.null(justification)) |
127 justification <- | 268 justification <- |
128 Reduce( | 269 Reduce( |
129 f = paste, | 270 f = paste, |
142 n | 283 n |
143 ), | 284 ), |
144 "\n", | 285 "\n", |
145 sep = "" | 286 sep = "" |
146 ) | 287 ) |
147 } | 288 } else if (n == 0L) { |
148 else if (n == 0L) { | |
149 cat("0 rows for:\n") | 289 cat("0 rows for:\n") |
150 latex_itemized_list(names(x)) | 290 latex_itemized_list( |
151 } | 291 v = names(x), |
152 else { | 292 underscore_whack = underscore_whack |
293 ) | |
294 } else { | |
153 if (is.null(max)) | 295 if (is.null(max)) |
154 max <- getOption("max.print", 99999L) | 296 max <- getOption("max.print", 99999L) |
155 if (!is.finite(max)) | 297 if (!is.finite(max)) |
156 stop("invalid 'max' / getOption(\"max.print\"): ", | 298 stop("invalid 'max' / getOption(\"max.print\"): ", |
157 max) | 299 max) |
176 justification, | 318 justification, |
177 "}\n", | 319 "}\n", |
178 sep = "" | 320 sep = "" |
179 ) | 321 ) |
180 ) | 322 ) |
323 # ref: https://tex.stackexchange.com/a/50353 | |
324 # Describes use of \rule{0pt}{3ex} | |
181 if (!is.null(caption)) | 325 if (!is.null(caption)) |
182 cat(" \\hline\\hline\n") | 326 cat("\\B \\\\ \\hline\\hline\n") |
183 latex_table_row(colnames(m)) | 327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 |
328 latex_table_row( | |
329 v = colnames(m), | |
330 extra = "\\T\\B", | |
331 underscore_whack = underscore_whack | |
332 ) | |
184 cat("\\hline\n") | 333 cat("\\hline\n") |
185 for (i in seq_len(length(m[, 1]))) { | 334 for (i in seq_len(length(m[, 1]))) { |
186 latex_table_row(m[i, ]) | 335 latex_table_row( |
336 v = m[i, ], | |
337 underscore_whack = underscore_whack | |
338 ) | |
187 } | 339 } |
188 cat( | 340 cat( |
189 paste( | 341 paste( |
190 " \\end{tabular}", | 342 " \\end{tabular}", |
191 "\\end{table}", | 343 "\\end{table}", |
197 n - n0, "rows ]\n") | 349 n - n0, "rows ]\n") |
198 } | 350 } |
199 invisible(x) | 351 invisible(x) |
200 } | 352 } |
201 | 353 |
202 ``` | 354 hypersub <- |
203 | 355 function(s) { |
204 ## Purpose | 356 hyper <- tolower(s) |
205 | 357 hyper <- gsub("[^a-z0-9]+", "-", hyper) |
206 Perform imputation of missing values, quantile normalization, and ANOVA. | 358 hyper <- gsub("[-]+", "-", hyper) |
359 hyper <- sub("^[-]", "", hyper) | |
360 hyper <- sub("[-]$", "", hyper) | |
361 return(hyper) | |
362 } | |
363 | |
364 subsection_header <- | |
365 function(s) { | |
366 hyper <- hypersub(s) | |
367 cat( | |
368 sprintf( | |
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | |
370 hyper, s, hyper | |
371 ) | |
372 ) | |
373 } | |
374 | |
375 subsubsection_header <- | |
376 function(s) { | |
377 hyper <- hypersub(s) | |
378 cat( | |
379 sprintf( | |
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
381 hyper, s, hyper | |
382 ) | |
383 ) | |
384 } | |
385 | |
386 ### SQLite functions | |
387 | |
388 ddl_exec <- function(db, sql) { | |
389 discard <- DBI::dbExecute(conn = db, statement = sql) | |
390 if (FALSE && discard != 0) { | |
391 need_newpage <- TRUE | |
392 if (need_newpage) { | |
393 need_newpage <<- FALSE | |
394 cat("\\newpage\n") | |
395 } | |
396 o_file <- stdout() | |
397 cat("\n\\begin{verbatim}\n") | |
398 cat(sql, file = o_file) | |
399 cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) | |
400 cat("\n\\end{verbatim}\n") | |
401 } | |
402 } | |
403 | |
404 dml_no_rows_exec <- function(db, sql) { | |
405 discard <- DBI::dbExecute(conn = db, statement = sql) | |
406 if (discard != 0) { | |
407 need_newpage <- TRUE | |
408 if (need_newpage) { | |
409 need_newpage <<- FALSE | |
410 cat("\\newpage\n") | |
411 } | |
412 cat("\n\\begin{verbatim}\n") | |
413 o_file <- stdout() | |
414 cat(sql, file = o_file) | |
415 cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) | |
416 cat("\n\\end{verbatim}\n") | |
417 } | |
418 } | |
419 | |
420 ### KSEA functions and helpers | |
421 | |
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | |
423 # - maximum log2(FC) | |
424 ksea_scores <- function( | |
425 | |
426 # For human data, typically, ksdata = KSEAapp::ksdata | |
427 ksdata, | |
428 | |
429 # Input data file having columns: | |
430 # - Protein : abbreviated protein name | |
431 # - Gene : HUGO gene name | |
432 # - Peptide : peptide sequence without indications of phosphorylation | |
433 # - Reside.Both : position(s) of phosphorylation within Gene sequence | |
434 # - First letter designates AA that is modified | |
435 # - Numbers indicate position within Gene | |
436 # - Multiple values are separated by semicolons | |
437 # - p : p-value | |
438 # - FC : fold-change | |
439 px, | |
440 | |
441 # A binary input of TRUE or FALSE, indicating whether or not to include | |
442 # NetworKIN predictions | |
443 networkin, | |
444 | |
445 # A numeric value between 1 and infinity setting the minimum NetworKIN | |
446 # score (can be left out if networkin = FALSE) | |
447 networkin_cutoff | |
448 | |
449 ) { | |
450 if (length(grep(";", px$Residue.Both)) == 0) { | |
451 # There are no Residue.Both entries having semicolons, so new is | |
452 # simply px except two columns are renamed and a column is added | |
453 # for log2(abs(fold-change)) | |
454 new <- px | |
455 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
456 new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) | |
457 new <- new[complete.cases(new$log2_fc), ] | |
458 } else { | |
459 # Split each row having semicolons in Residue.Both into rows that are | |
460 # duplicated in all respects except that each row has a single | |
461 # member of the set "split-on-semicolon-Residue.Both" | |
462 px_double <- px[grep(";", px$Residue.Both), ] | |
463 residues <- as.character(px_double$Residue.Both) | |
464 residues <- as.matrix(residues, ncol = 1) | |
465 split <- strsplit(residues, split = ";") | |
466 # x gets count of residues in each row, | |
467 # i.e., 1 + count of semicolons | |
468 x <- sapply(split, length) | |
469 # Here is the set of split rows | |
470 px_single <- data.frame( | |
471 Protein = rep(px_double$Protein, x), | |
472 Gene = rep(px_double$Gene, x), | |
473 Peptide = rep(px_double$Peptide, x), | |
474 Residue.Both = unlist(split), | |
475 p = rep(px_double$p, x), | |
476 FC = rep(px_double$FC, x) | |
477 ) | |
478 # new first gets the split rows | |
479 new <- px[-grep(";", px$Residue.Both), ] | |
480 # to new, append the rows that didn't need splitting in the first place | |
481 new <- rbind(new, px_single) | |
482 # map Gene to SUB_GENE | |
483 # map Residue.Both to SUB_MOD_RSD | |
484 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
485 # Eliminate any non-positive values to prevent introduction of | |
486 # infinite or NaN values | |
487 new[(0 <= new$log2_fc), "log2_fc"] <- NA | |
488 # Because of preceding step, there is no need for abs in the next line | |
489 new$log2_fc <- log2(as.numeric(as.character(new$FC))) | |
490 # Convert any illegal values from NaN to NA | |
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA | |
492 # Eliminate rows having missing values (e.g., non-imputed data) | |
493 new <- new[complete.cases(new$log2_fc), ] | |
494 } | |
495 if (networkin == TRUE) { | |
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | |
497 # PhosphoSitePlus data *because its networkin_score is set to Inf* | |
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | |
499 ksdata_filtered <- ksdata_filtered[ | |
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | |
501 } else { | |
502 # Otherwise, simply use PhosphSitePlus rows | |
503 ksdata_filtered <- ksdata[ | |
504 grep("PhosphoSitePlus", ksdata$Source), ] | |
505 } | |
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
507 # colnames of ksdata_filtered: | |
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | |
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | |
510 # "SITE_...7_AA" "networkin_score" "Source" | |
511 # colnames of new: | |
512 # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" | |
513 # Equivalent to: | |
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | |
515 # FROM ksdata_filtered a | |
516 # INNER JOIN new b | |
517 # ON a.SUB_GENE = b.SUB_GENE | |
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | |
519 ksdata_dataset <- base::merge(ksdata_filtered, new) | |
520 # colnames of ksdata_dataset: | |
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | |
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | |
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | |
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | |
525 # Re-order dataset; prior to accounting for isoforms | |
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | |
527 # Extract non-isoform accession in UniProtKB | |
528 ksdata_dataset$uniprot_no_isoform <- sapply( | |
529 ksdata_dataset$KIN_ACC_ID, | |
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | |
531 ) | |
532 # Discard previous results while selecting interesting columns ... | |
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | |
534 # Column names are now: | |
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | |
536 # "FC" "log2_fc" "Source" | |
537 # Make column names human-readable | |
538 colnames(ksdata_dataset_abbrev) <- c( | |
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | |
540 "FC", "log2FC", "Source" | |
541 ) | |
542 # SELECT * FROM ksdata_dataset_abbrev | |
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | |
544 ksdata_dataset_abbrev <- | |
545 ksdata_dataset_abbrev[ | |
546 order( | |
547 ksdata_dataset_abbrev$Kinase.Gene, | |
548 ksdata_dataset_abbrev$Substrate.Gene, | |
549 ksdata_dataset_abbrev$Substrate.Mod, | |
550 ksdata_dataset_abbrev$p), | |
551 ] | |
552 # First aggregation step to account for multiply phosphorylated peptides | |
553 # and differing peptide sequences; the goal here is to combine results | |
554 # for all measurements of the same substrate. | |
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
556 # `Source`, avg(log2FC) AS log2FC | |
557 # FROM ksdata_dataset_abbrev | |
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
559 # `Source` | |
560 # ORDER BY `Kinase.Gene`; | |
561 # in two steps: | |
562 # (1) compute average log_2(fold-change) | |
563 ksdata_dataset_abbrev <- aggregate( | |
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | |
565 data = ksdata_dataset_abbrev, | |
566 FUN = mean | |
567 ) | |
568 # (2) order by Kinase.Gene | |
569 ksdata_dataset_abbrev <- | |
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | |
571 # SELECT `Kinase.Gene`, count(*) | |
572 # FROM ksdata_dataset_abbrev | |
573 # GROUP BY `Kinase.Gene`; | |
574 # in two steps: | |
575 # (1) Extract the list of Kinase.Gene names | |
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | |
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | |
578 # named by Kinase.Gene | |
579 kinase_list <- as.matrix(table(kinase_list)) | |
580 # Second aggregation step to account for all substrates per kinase | |
581 # CREATE TABLE mean_fc | |
582 # AS | |
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | |
584 # FROM ksdata_dataset_abbrev | |
585 # GROUP BY `Kinase.Gene` | |
586 mean_fc <- aggregate( | |
587 log2FC ~ Kinase.Gene, | |
588 data = ksdata_dataset_abbrev, | |
589 FUN = mean | |
590 ) | |
591 # mean_fc columns: "Kinase.Gene", "log2FC" | |
592 if (FALSE) { | |
593 # I need to re-think this; I was trying to find the most-represented | |
594 # peptide, but that horse has already left the barn | |
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
596 # FROM ksdata_dataset_abbrev | |
597 # GROUP BY `Kinase.Gene` | |
598 max_fc <- aggregate( | |
599 log2FC ~ Kinase.Gene, | |
600 data = ksdata_dataset_abbrev, | |
601 FUN = function(r) max(abs(r)) | |
602 ) | |
603 } | |
604 | |
605 # Create column 3: mS | |
606 mean_fc$m_s <- mean_fc[, 2] | |
607 # Create column 4: Enrichment | |
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | |
609 # Create column 5: m, count of substrates | |
610 mean_fc$m <- kinase_list | |
611 # Create column 6: z-score | |
612 mean_fc$z_score <- ( | |
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | |
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | |
615 # Create column 7: p-value, deduced from z-score | |
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | |
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | |
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | |
619 | |
620 # Remove log2FC column, which is duplicated as mS | |
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | |
622 # Correct the column names which we had to hack because of the linter... | |
623 colnames(mean_fc) <- c( | |
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
625 ) | |
626 return(mean_fc) | |
627 } | |
628 | |
629 low_fdr_barplot <- function( | |
630 rslt, | |
631 i_cntrst, | |
632 i, | |
633 a_level, | |
634 b_level, | |
635 fold_change, | |
636 caption | |
637 ) { | |
638 rslt_score_list_i <- rslt$score_list[[i]] | |
639 if (!is.null(rslt_score_list_i)) { | |
640 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
641 k <- data.frame( | |
642 contrast = as.integer(i_cntrst), | |
643 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
644 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
645 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
646 mean_log2_fc = rslt_score_list_i$mS, | |
647 enrichment = rslt_score_list_i$Enrichment, | |
648 substrate_count = rslt_score_list_i$m, | |
649 z_score = rslt_score_list_i$z.score, | |
650 p_value = rslt_score_list_i$p.value, | |
651 fdr = rslt_score_list_i$FDR | |
652 ) | |
653 selector <- switch( | |
654 ksea_cutoff_statistic, | |
655 "FDR" = { | |
656 k$fdr | |
657 }, | |
658 "p.value" = { | |
659 k$p_value | |
660 }, | |
661 stop( | |
662 sprintf( | |
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
664 ksea_cutoff_statistic | |
665 ) | |
666 ) | |
667 ) | |
668 | |
669 k <- k[selector < ksea_cutoff_threshold, ] | |
670 | |
671 if (nrow(k) > 1) { | |
672 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | |
673 numeric_z_score <- as.numeric(k$z_score) | |
674 z_score_order <- order(numeric_z_score) | |
675 kinase_name <- k$kinase_gene | |
676 long_caption <- | |
677 sprintf( | |
678 "Kinase z-score, %s < %s, %s", | |
679 ksea_cutoff_statistic, | |
680 ksea_cutoff_threshold, | |
681 caption | |
682 ) | |
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | |
684 cat("\n\\clearpage\n") | |
685 barplot( | |
686 height = numeric_z_score[z_score_order], | |
687 border = NA, | |
688 xpd = FALSE, | |
689 cex.names = 1.0, | |
690 cex.axis = 1.0, | |
691 main = long_caption, | |
692 cex.main = my_cex_caption, | |
693 names.arg = kinase_name[z_score_order], | |
694 horiz = TRUE, | |
695 srt = 45, | |
696 las = 1) | |
697 par(op) | |
698 } | |
699 } | |
700 } | |
701 | |
702 # note that this adds elements to the global variable `ksea_asterisk_hash` | |
703 | |
704 low_fdr_print <- function( | |
705 rslt, | |
706 i_cntrst, | |
707 i, | |
708 a_level, | |
709 b_level, | |
710 fold_change, | |
711 caption | |
712 ) { | |
713 rslt_score_list_i <- rslt$score_list[[i]] | |
714 if (!is.null(rslt_score_list_i)) { | |
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
716 k <- contrast_ksea_scores <- data.frame( | |
717 contrast = as.integer(i_cntrst), | |
718 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
719 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
720 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
721 mean_log2_fc = rslt_score_list_i$mS, | |
722 enrichment = rslt_score_list_i$Enrichment, | |
723 substrate_count = rslt_score_list_i$m, | |
724 z_score = rslt_score_list_i$z.score, | |
725 p_value = rslt_score_list_i$p.value, | |
726 fdr = rslt_score_list_i$FDR | |
727 ) | |
728 | |
729 selector <- switch( | |
730 ksea_cutoff_statistic, | |
731 "FDR" = { | |
732 k$fdr | |
733 }, | |
734 "p.value" = { | |
735 k$p_value | |
736 }, | |
737 stop( | |
738 sprintf( | |
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
740 ksea_cutoff_statistic | |
741 ) | |
742 ) | |
743 ) | |
744 | |
745 k <- k[selector < ksea_cutoff_threshold, ] | |
746 # save kinase names to ksea_asterisk_hash | |
747 for (kinase_name in k$kinase_gene) { | |
748 ksea_asterisk_hash[[kinase_name]] <- 1 | |
749 } | |
750 | |
751 db_write_table_overwrite <- (i_cntrst < 2) | |
752 db_write_table_append <- !db_write_table_overwrite | |
753 RSQLite::dbWriteTable( | |
754 conn = db, | |
755 name = "contrast_ksea_scores", | |
756 value = contrast_ksea_scores, | |
757 append = db_write_table_append | |
758 ) | |
759 selector <- switch( | |
760 ksea_cutoff_statistic, | |
761 "FDR" = { | |
762 contrast_ksea_scores$fdr | |
763 }, | |
764 "p.value" = { | |
765 contrast_ksea_scores$p_value | |
766 }, | |
767 stop( | |
768 sprintf( | |
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
770 ksea_cutoff_statistic | |
771 ) | |
772 ) | |
773 ) | |
774 output_df <- contrast_ksea_scores[ | |
775 selector < ksea_cutoff_threshold, | |
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
777 "z_score", "p_value", "fdr") | |
778 ] | |
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
780 output_df <- output_df[output_order, ] | |
781 colnames(output_df) <- | |
782 c( | |
783 colnames(output_df)[1], | |
784 colnames(output_df)[2], | |
785 "enrichment", | |
786 "m_s", | |
787 "z_score", | |
788 "p_value", | |
789 "fdr" | |
790 ) | |
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
794 output_df$m_s <- sprintf("%d", output_df$m_s) | |
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
796 output_ncol <- ncol(output_df) | |
797 colnames(output_df) <- | |
798 c( | |
799 "Kinase", | |
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
801 "Enrichment", | |
802 "Substrates", | |
803 "z-score", | |
804 "p-value", | |
805 "FDR" | |
806 ) | |
807 selector <- switch( | |
808 ksea_cutoff_statistic, | |
809 "FDR" = { | |
810 rslt$score_list[[i]]$FDR | |
811 }, | |
812 "p.value" = { | |
813 rslt$score_list[[i]]$p.value | |
814 }, | |
815 stop( | |
816 sprintf( | |
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
818 ksea_cutoff_statistic | |
819 ) | |
820 ) | |
821 ) | |
822 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
825 data_frame_latex( | |
826 x = output_df, | |
827 justification = "l c c c c c c", | |
828 centered = TRUE, | |
829 caption = sprintf( | |
830 "\\text{%s}, %s < %s", | |
831 math_caption, | |
832 ksea_cutoff_statistic, | |
833 ksea_cutoff_threshold | |
834 ), | |
835 anchor = const_table_anchor_p | |
836 ) | |
837 } else { | |
838 cat( | |
839 sprintf( | |
840 "\\break | |
841 No kinases had | |
842 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
843 for contrast %s\\hfill\\break\n", | |
844 ksea_cutoff_statistic, | |
845 ksea_cutoff_threshold, | |
846 caption | |
847 ) | |
848 ) | |
849 } | |
850 } | |
851 } | |
852 | |
853 # create_breaks is a helper for ksea_heatmap | |
854 create_breaks <- function(merged_scores) { | |
855 if (min(merged_scores, na.rm = TRUE) < -1.6) { | |
856 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
857 breaks_neg <- | |
858 append( | |
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), | |
860 breaks_neg | |
861 ) | |
862 breaks_neg <- sort(unique(breaks_neg)) | |
863 } else { | |
864 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
865 } | |
866 if (max(merged_scores, na.rm = TRUE) > 1.6) { | |
867 breaks_pos <- seq(0, 1.6, length.out = 30) | |
868 breaks_pos <- | |
869 append( | |
870 breaks_pos, | |
871 seq(1.6, max(merged_scores, na.rm = TRUE), | |
872 length.out = 10) | |
873 ) | |
874 breaks_pos <- sort(unique(breaks_pos)) | |
875 } else { | |
876 breaks_pos <- seq(0, 1.6, length.out = 30) | |
877 } | |
878 breaks_all <- unique(append(breaks_neg, breaks_pos)) | |
879 mycol_neg <- | |
880 gplots::colorpanel(n = length(breaks_neg), | |
881 low = "blue", | |
882 high = "white") | |
883 mycol_pos <- | |
884 gplots::colorpanel(n = length(breaks_pos) - 1, | |
885 low = "white", | |
886 high = "red") | |
887 mycol <- unique(append(mycol_neg, mycol_pos)) | |
888 color_breaks <- list(breaks_all, mycol) | |
889 return(color_breaks) | |
890 } | |
891 | |
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | |
893 draw_kseaapp_summary_heatmap <- function( | |
894 x, | |
895 sample_cluster, | |
896 merged_asterisk, | |
897 my_cex_row, | |
898 color_breaks, | |
899 margins, | |
900 ... | |
901 ) { | |
902 merged_scores <- x | |
903 if (!is.matrix(x)) { | |
904 cat( | |
905 paste0( | |
906 "No plot because \\texttt{typeof(x)} is '", | |
907 typeof(x), | |
908 "' rather than 'matrix'.\n\n" | |
909 ) | |
910 ) | |
911 } else if (nrow(x) < 2) { | |
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n") | |
913 cat("\\begin{verbatim}\n") | |
914 str(x) | |
915 cat("\\end{verbatim}\n") | |
916 } else if (ncol(x) < 2) { | |
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | |
918 cat("\\begin{verbatim}\n") | |
919 str(x) | |
920 cat("\\end{verbatim}\n") | |
921 } else { | |
922 gplots::heatmap.2( | |
923 x = merged_scores, | |
924 Colv = sample_cluster, | |
925 scale = "none", | |
926 cellnote = merged_asterisk, | |
927 notecol = "white", | |
928 cexCol = 0.9, | |
929 # Heuristically assign size of row labels | |
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), | |
931 srtCol = 45, | |
932 srtRow = 45, | |
933 notecex = 3 * my_cex_row, | |
934 col = color_breaks[[2]], | |
935 density.info = "none", | |
936 trace = "none", | |
937 breaks = color_breaks[[1]], | |
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | |
939 lhei = c(0.4, 8.0, 1.1), | |
940 lwid = c(0.5, 3), | |
941 key = FALSE, | |
942 margins = margins, | |
943 ... | |
944 ) | |
945 } | |
946 } | |
947 | |
948 # Adapted from KSEAapp::KSEA.Heatmap | |
949 ksea_heatmap <- function( | |
950 # the data frame outputs from the KSEA.Scores() function, in list format | |
951 score_list, | |
952 # a character vector of all the sample names for heatmap annotation: | |
953 # - the names must be in the same order as the data in score_list | |
954 # - please avoid long names, as they may get cropped in the final image | |
955 sample_labels, | |
956 # character string of either "p.value" or "FDR" indicating the data column | |
957 # to use for marking statistically significant scores | |
958 stats, | |
959 # a numeric value between 0 and infinity indicating the min. number of | |
960 # substrates a kinase must have to be included in the heatmap | |
961 m_cutoff, | |
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
963 # for indicating significant kinases in the heatmap | |
964 p_cutoff = | |
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | |
966 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
967 # hierarchical clustering of the sample columns | |
968 sample_cluster, | |
969 # a binary input of TRUE or FALSE, indicating whether or not to export | |
970 # the heatmap as a .png image into the working directory | |
971 export = FALSE, | |
972 # bottom and right margins; adjust as needed if contrast names are too long | |
973 margins = c(6, 20), | |
974 # print which kinases? | |
975 # - Mandatory argument, must be one of const_ksea_.*_kinases | |
976 which_kinases, | |
977 # additional arguments to gplots::heatmap.2, such as: | |
978 # - main: main title of plot | |
979 # - xlab: x-axis label | |
980 # - ylab: y-axis label | |
981 ... | |
982 ) { | |
983 filter_m <- function(dataset, m_cutoff) { | |
984 filtered <- dataset[(dataset$m >= m_cutoff), ] | |
985 return(filtered) | |
986 } | |
987 score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) | |
988 for (i in seq_len(length(score_list_m))) { | |
989 names <- colnames(score_list_m[[i]])[c(2:7)] | |
990 colnames(score_list_m[[i]])[c(2:7)] <- | |
991 paste(names, i, sep = ".") | |
992 } | |
993 master <- | |
994 Reduce( | |
995 f = function(...) { | |
996 base::merge(..., by = "Kinase.Gene", all = FALSE) | |
997 }, | |
998 x = score_list_m | |
999 ) | |
1000 | |
1001 row.names(master) <- master$Kinase.Gene | |
1002 columns <- as.character(colnames(master)) | |
1003 merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) | |
1004 colnames(merged_scores) <- sample_labels | |
1005 merged_stats <- as.matrix(master[, grep(stats, columns)]) | |
1006 asterisk <- function(mtrx, p_cutoff) { | |
1007 new <- data.frame() | |
1008 for (i in seq_len(nrow(mtrx))) { | |
1009 for (j in seq_len(ncol(mtrx))) { | |
1010 my_value <- mtrx[i, j] | |
1011 if (!is.na(my_value) && my_value < p_cutoff) { | |
1012 new[i, j] <- "*" | |
1013 } else { | |
1014 new[i, j] <- "" | |
1015 } | |
1016 } | |
1017 } | |
1018 return(new) | |
1019 } | |
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | |
1021 | |
1022 # begin hack to print only significant rows | |
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | |
1024 all_rows <- rownames(merged_stats) | |
1025 names(asterisk_rows) <- all_rows | |
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | |
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | |
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] | |
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] | |
1030 # end hack to print only significant rows | |
1031 | |
1032 row_list <- list() | |
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | |
1034 row_list[[const_ksea_all_kinases]] <- all_rows | |
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | |
1036 | |
1037 i <- which_kinases | |
1038 my_row_names <- row_list[[i]] | |
1039 scrs <- merged_scores[my_row_names, ] | |
1040 stts <- merged_stats[my_row_names, ] | |
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | |
1042 | |
1043 color_breaks <- create_breaks(scrs) | |
1044 plot_height <- nrow(scrs) ^ 0.55 | |
1045 plot_width <- ncol(scrs) ^ 0.7 | |
1046 my_cex_row <- 0.25 * 16 / plot_height | |
1047 if (export == "TRUE") { | |
1048 png( | |
1049 "KSEA.Merged.Heatmap.png", | |
1050 width = plot_width * 300, | |
1051 height = 2 * plot_height * 300, | |
1052 res = 300, | |
1053 pointsize = 14 | |
1054 ) | |
1055 } | |
1056 draw_kseaapp_summary_heatmap( | |
1057 x = scrs, | |
1058 sample_cluster = sample_cluster, | |
1059 merged_asterisk = merged_asterisk, | |
1060 my_cex_row = my_cex_row, | |
1061 color_breaks = color_breaks, | |
1062 margins = margins | |
1063 ) | |
1064 if (export == "TRUE") { | |
1065 dev.off() | |
1066 } | |
1067 return(my_row_names) | |
1068 } | |
1069 | |
1070 # helper for heatmaps of phosphopeptide intensities | |
1071 | |
1072 draw_intensity_heatmap <- | |
1073 function( | |
1074 m, # matrix with rownames already formatted | |
1075 cutoff, # cutoff used by hm_heading_function | |
1076 hm_heading_function, # construct and cat heading from m and cutoff | |
1077 hm_main_title, # main title for plot (drawn below heading) | |
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | |
1079 max_peptide_count # experimental: | |
1080 = intensity_hm_rows, # values of 50 and 75 worked well | |
1081 ... # passthru parameters for heatmap | |
1082 ) { | |
1083 peptide_count <- 0 | |
1084 # emit the heading for the heatmap | |
1085 if (hm_heading_function(m, cutoff)) { | |
1086 peptide_count <- min(max_peptide_count, nrow(m)) | |
1087 if (nrow(m) > 1) { | |
1088 m_margin <- m[peptide_count:1, ] | |
1089 # Margin setting was heuristically derived | |
1090 margins <- | |
1091 c(0.5, # col | |
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | |
1093 ) | |
1094 } | |
1095 if (nrow(m) > 1) { | |
1096 tryCatch( | |
1097 { | |
1098 old_oma <- par("oma") | |
1099 par(cex.main = 0.6) | |
1100 # Heuristically determined character size adjustment formula | |
1101 char_contractor <- | |
1102 250000 / ( | |
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
1104 ) | |
1105 heatmap( | |
1106 m[peptide_count:1, ], | |
1107 Rowv = if (suppress_row_dendrogram) NA else NULL, | |
1108 Colv = NA, | |
1109 cexRow = char_contractor, | |
1110 cexCol = char_contractor * 50 / max_peptide_count, | |
1111 scale = "row", | |
1112 margins = margins, | |
1113 main = | |
1114 "Unimputed, unnormalized log(intensities)", | |
1115 xlab = "", | |
1116 las = 1, | |
1117 ... | |
1118 ) | |
1119 }, | |
1120 error = function(e) { | |
1121 cat( | |
1122 sprintf( | |
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", | |
1124 e$message | |
1125 ) | |
1126 ) | |
1127 }, | |
1128 finally = par(old_oma) | |
1129 ) | |
1130 } | |
1131 } | |
1132 return(peptide_count) | |
1133 } | |
1134 ``` | |
1135 | |
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
1137 cat("\\listoftables\n") | |
1138 ``` | |
1139 # Purpose | |
1140 | |
1141 To perform for phosphopeptides: | |
1142 | |
1143 - imputation of missing values, | |
1144 - quantile normalization, | |
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | |
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | |
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | |
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org) | |
1149 - NetworKIN [http://networkin.science/](http://networkin.science/) | |
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | |
207 | 1151 |
208 ```{r include = FALSE} | 1152 ```{r include = FALSE} |
1153 | |
1154 ### GLOBAL VARIABLES | |
1155 | |
1156 # parameters for KSEA | |
1157 | |
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount | |
1161 | |
1162 ksea_heatmap_titles <- list() | |
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | |
1164 sprintf( | |
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s", | |
1166 ksea_cutoff_statistic, | |
1167 ksea_cutoff_threshold | |
1168 ) | |
1169 ksea_heatmap_titles[[const_ksea_all_kinases]] <- | |
1170 "Summary figure for all contrasts and all kinases" | |
1171 ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- | |
1172 sprintf( | |
1173 "Summary for all kinases not enriched at %s < %s in any contrast", | |
1174 ksea_cutoff_statistic, | |
1175 ksea_cutoff_threshold | |
1176 ) | |
1177 # hash to hold names of significantly enriched kinases | |
1178 ksea_asterisk_hash <- new_env() | |
1179 | |
1180 # READ PARAMETERS (mostly) | |
1181 | |
1182 intensity_hm_rows <- params$intensityHeatmapRows | |
209 # Input Filename | 1183 # Input Filename |
210 input_file <- params$inputFile | 1184 input_file <- params$inputFile |
211 | 1185 |
212 # First data column - ideally, this could be detected via regexSampleNames, | 1186 # First data column - ideally, this could be detected via regexSampleNames, |
213 # but for now leave it as is. | 1187 # but for now leave it as is. |
218 } | 1192 } |
219 | 1193 |
220 # False discovery rate adjustment for ANOVA | 1194 # False discovery rate adjustment for ANOVA |
221 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | 1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 |
222 val_fdr <- | 1196 val_fdr <- |
223 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "") | 1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") |
224 | 1198 |
225 if ( | 1199 if ( |
226 ncol(val_fdr) != 1 || | 1200 ncol(val_fdr) != 1 || |
227 sum(!is.numeric(val_fdr[, 1])) || | 1201 sum(!is.numeric(val_fdr[, 1])) || |
228 sum(val_fdr[, 1] < 0) || | 1202 sum(val_fdr[, 1] < 0) || |
233 val_fdr <- val_fdr[, 1] | 1207 val_fdr <- val_fdr[, 1] |
234 | 1208 |
235 #Imputed Data filename | 1209 #Imputed Data filename |
236 imputed_data_filename <- params$imputedDataFilename | 1210 imputed_data_filename <- params$imputedDataFilename |
237 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | 1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile |
238 | 1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata |
239 #ANOVA data filename | 1213 |
240 ``` | 1214 ``` |
241 | 1215 |
242 ```{r echo = FALSE} | 1216 ```{r echo = FALSE} |
243 # Imputation method, should be one of | 1217 # Imputation method, should be one of |
244 # "random", "group-median", "median", or "mean" | 1218 # "random", "group-median", "median", or "mean" |
253 | 1227 |
254 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | 1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" |
255 regex_sample_names <- params$regexSampleNames | 1229 regex_sample_names <- params$regexSampleNames |
256 | 1230 |
257 # Regular expression to extract Sample Grouping from Sample Name; | 1231 # Regular expression to extract Sample Grouping from Sample Name; |
258 # if error occurs, compare sample_factor_levels and sample_name_matches | 1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches |
259 # to see if groupings/pairs line up | 1233 # to see if groupings/pairs line up |
260 # e.g., "(\\d+)" | 1234 # e.g., "(\\d+)" |
261 regex_sample_grouping <- params$regexSampleGrouping | 1235 regex_sample_grouping <- params$regexSampleGrouping |
262 | 1236 |
1237 one_way_all_categories_fname <- params$oneWayManyCategories | |
1238 one_way_all_categories <- try_catch_w_e( | |
1239 match.fun(one_way_all_categories_fname)) | |
1240 if (!is.function(one_way_all_categories$value)) { | |
1241 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
1242 write(one_way_all_categories$value$message, stderr()) | |
1243 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1244 stop("Cannot continue. Goodbye.") | |
1245 } | |
1246 one_way_all_categories <- one_way_all_categories$value | |
1247 | |
1248 one_way_two_categories_fname <- params$oneWayManyCategories | |
1249 one_way_two_categories <- try_catch_w_e( | |
1250 match.fun(one_way_two_categories_fname)) | |
1251 if (!is.function(one_way_two_categories$value)) { | |
1252 cat("fatal error for parameter oneWayTwoCategories: \n") | |
1253 cat(one_way_two_categories$value$message, fill = TRUE) | |
1254 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1255 stop("Cannot continue. Goodbye.") | |
1256 } | |
1257 one_way_two_categories <- one_way_two_categories$value | |
1258 | |
1259 preproc_db <- params$preprocDb | |
1260 ksea_app_prep_db <- params$kseaAppPrepDb | |
1261 result <- file.copy( | |
1262 from = preproc_db, | |
1263 to = ksea_app_prep_db, | |
1264 overwrite = TRUE | |
1265 ) | |
1266 if (!result) { | |
1267 write( | |
1268 sprintf( | |
1269 "fatal error copying initial database '%s' to output '%s'", | |
1270 preproc_db, | |
1271 ksea_app_prep_db, | |
1272 ), | |
1273 stderr() | |
1274 ) | |
1275 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
1276 stop("Cannot continue. Goodbye.") | |
1277 } | |
263 ``` | 1278 ``` |
264 | 1279 |
265 ```{r echo = FALSE} | 1280 ```{r echo = FALSE} |
266 ### READ DATA | 1281 ### READ DATA |
267 | |
268 library(data.table) | |
269 | 1282 |
270 # read.table reads a file in table format and creates a data frame from it. | 1283 # read.table reads a file in table format and creates a data frame from it. |
271 # - note that `quote = ""` means that quotation marks are treated literally. | 1284 # - note that `quote = ""` means that quotation marks are treated literally. |
272 full_data <- read.table( | 1285 full_data <- read.table( |
273 file = input_file, | 1286 file = input_file, |
274 sep = "\t", | 1287 sep = "\t", |
275 header = T, | 1288 header = TRUE, |
276 quote = "", | 1289 quote = "", |
277 check.names = FALSE | 1290 check.names = FALSE |
278 ) | 1291 ) |
279 ``` | 1292 ``` |
280 | 1293 |
281 ## Extract Sample Names and Factor Levels | 1294 # Extract Sample Names and Treatment Levels |
282 | 1295 |
283 Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2. | 1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. |
284 | 1297 |
285 ```{r echo = FALSE, results = 'asis'} | 1298 ```{r echo = FALSE, results = 'asis'} |
286 | 1299 |
287 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | 1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) |
288 | 1301 |
295 } | 1308 } |
296 | 1309 |
297 cat( | 1310 cat( |
298 sprintf( | 1311 sprintf( |
299 paste( | 1312 paste( |
300 "\n\nPeptide-intensity data for each sample is", | 1313 "\n\nThe input data file has peptide-intensity data for each sample", |
301 "in one of columns %d through %d.\n\n" | 1314 "in one of columns %d through %d.\n\n" |
302 ), | 1315 ), |
303 min(data_column_indices), | 1316 min(data_column_indices), |
304 max(data_column_indices) | 1317 max(data_column_indices) |
305 ) | 1318 ) |
306 ) | 1319 ) |
307 | 1320 |
308 # Write column names as a LaTeX enumerated list. | 1321 # Write column names as a LaTeX enumerated list. |
309 column_name_df <- data.frame( | 1322 column_name_df <- data.frame( |
310 column = seq_len(length(colnames(full_data))), | 1323 column = seq_len(length(colnames(full_data))), |
311 name = colnames(full_data) | 1324 name = paste0("\\verb@", colnames(full_data), "@") |
312 ) | 1325 ) |
313 data_frame_latex( | 1326 data_frame_latex( |
314 x = column_name_df, | 1327 x = column_name_df, |
315 justification = "l l", | 1328 justification = "l l", |
316 centered = TRUE, | 1329 centered = TRUE, |
317 caption = "Input data column name", | 1330 caption = "Input data column names", |
318 anchor = const_table_anchor | 1331 anchor = const_table_anchor_bp, |
1332 underscore_whack = FALSE | |
319 ) | 1333 ) |
320 | 1334 |
321 ``` | 1335 ``` |
322 | 1336 |
323 ```{r echo = FALSE, results = 'asis'} | 1337 ```{r echo = FALSE, results = 'asis'} |
324 #```{r echo = FALSE, results = 'asis'} | |
325 quant_data <- full_data[first_data_column:length(full_data)] | 1338 quant_data <- full_data[first_data_column:length(full_data)] |
326 quant_data[quant_data == 0] <- NA | 1339 quant_data[quant_data == 0] <- NA |
327 rownames(quant_data) <- full_data$Phosphopeptide | 1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide |
328 # Get factors -> group replicates (as indicated by terminal letter) | 1341 # Extract factors and trt-replicates using regular expressions. |
329 # by the preceding digits; | 1342 # Typically: |
330 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" | 1343 # regex_sample_names is "\\.\\d+[A-Z]$" |
331 # get factors -> | 1344 # regex_sample_grouping is "\\d+" |
332 # group runs (samples) by ignoring terminal [A-Z] in sample names | 1345 # This would distinguish trt-replicates by terminal letter [A-Z] |
333 # e.g. | 1346 # in sample names and group them into trts by the preceding digits. |
1347 # e.g.: | |
334 # group .1A .1B .1C into group 1; | 1348 # group .1A .1B .1C into group 1; |
335 # group .2A .2B .2C, into group 2; | 1349 # group .2A .2B .2C, into group 2; |
336 # etc. | 1350 # etc. |
337 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) | 1351 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) |
338 sample_name_matches <- regmatches(names(quant_data), m) | 1352 sample_name_matches <- regmatches(names(quant_data), m) |
339 colnames(quant_data) <- sample_name_matches | 1353 colnames(quant_data) <- sample_name_matches |
340 | 1354 |
341 write_debug_file(quant_data) | 1355 write_debug_file(quant_data) |
342 | 1356 |
343 m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | 1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) |
344 sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2)) | 1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) |
345 number_of_samples <- length(sample_name_matches) | 1359 number_of_samples <- length(sample_name_matches) |
346 sample_factor_df <- data.frame( | 1360 sample_treatment_df <- data.frame( |
347 sample = sample_name_matches, | 1361 level = sample_treatment_levels, |
348 level = sample_factor_levels | 1362 sample = sample_name_matches |
349 ) | 1363 ) |
350 data_frame_latex( | 1364 data_frame_latex( |
351 x = sample_factor_df, | 1365 x = sample_treatment_df, |
352 justification = "c c", | 1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", |
353 centered = TRUE, | 1367 centered = TRUE, |
354 caption = "Factor level", | 1368 caption = "Treatment levels", |
355 anchor = const_table_anchor | 1369 anchor = const_table_anchor_tbp, |
356 ) | 1370 underscore_whack = FALSE |
357 ``` | 1371 ) |
1372 ``` | |
1373 | |
358 ```{r echo = FALSE, results = 'asis'} | 1374 ```{r echo = FALSE, results = 'asis'} |
359 cat("\\newpage\n") | 1375 cat("\\newpage\n") |
360 ``` | 1376 ``` |
361 | 1377 |
362 ### Are the log-transformed sample distributions similar? | 1378 ## Are the log-transformed sample distributions similar? |
363 | 1379 |
364 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
365 | 1381 |
366 quant_data[quant_data == 0] <- NA #replace 0 with NA | 1382 quant_data[quant_data == 0] <- NA #replace 0 with NA |
367 quant_data_log <- log10(quant_data) | 1383 quant_data_log <- log10(quant_data) |
392 cat("\n\n\n") | 1408 cat("\n\n\n") |
393 | 1409 |
394 ``` | 1410 ``` |
395 | 1411 |
396 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | 1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
397 library(ggplot2) | |
398 if (nrow(quant_data_log) > 1) { | 1413 if (nrow(quant_data_log) > 1) { |
399 quant_data_log_stack <- stack(quant_data_log) | 1414 quant_data_log_stack <- stack(quant_data_log) |
400 ggplot( | 1415 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + |
401 quant_data_log_stack, | 1416 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + |
402 aes(x = values) | 1417 ggplot2::ylab("Probability density") + |
403 ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | 1418 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) |
404 ylab("Probability density") + | |
405 geom_density( | |
406 aes(group = ind, colour = ind), | |
407 na.rm = TRUE | |
408 ) | |
409 } else { | 1419 } else { |
410 cat("No density plot because there are too few peptides.\n\n") | 1420 cat("No density plot because there are too few peptides.\n\n") |
411 } | 1421 } |
412 ``` | 1422 ``` |
413 | 1423 |
414 ### Globally, are peptide intensities are approximately unimodal? | 1424 ## Globally, are peptide intensities are approximately unimodal? |
415 | 1425 |
416 <!-- | 1426 <!-- |
417 # bquote could be used as an alternative to latex2exp::TeX below particularly | 1427 # bquote could be used as an alternative to latex2exp::TeX below particularly |
418 # and when plotting math expressions generally, at the expense of mastering | 1428 # and when plotting math expressions generally, at the expense of mastering |
419 # another syntax, which hardly seems worthwhile when I need to use TeX | 1429 # another syntax, which hardly seems worthwhile when I need to use TeX |
442 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | 1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), |
443 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
444 ) | 1454 ) |
445 ``` | 1455 ``` |
446 | 1456 |
447 ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values: | 1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values |
448 | 1458 |
449 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | 1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} |
450 # determine quantile | 1460 # determine quantile |
451 q1 <- quantile(logvalues, probs = mean_percentile)[1] | 1461 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
452 | 1462 |
453 # determine standard deviation of quantile to impute | |
454 sd_finite <- function(x) { | |
455 ok <- is.finite(x) | |
456 sd(x[ok]) | |
457 } | |
458 # 1 = row of matrix (ie, phosphopeptide) | 1463 # 1 = row of matrix (ie, phosphopeptide) |
459 sds <- apply(quant_data_log, 1, sd_finite) | 1464 sds <- apply(quant_data_log, 1, sd_finite) |
460 if (sum(!is.na(sds)) > 2) { | 1465 if (sum(!is.na(sds)) > 2) { |
461 plot( | 1466 plot( |
462 density(sds, na.rm = T) | 1467 density(sds, na.rm = TRUE) |
463 , main = "Smoothed estimated probability density vs. std. deviation" | 1468 , main = "Smoothed estimated probability density vs. std. deviation" |
464 , sub = "(probability estimation made with Gaussian smoothing)" | 1469 , sub = "(probability estimation made with Gaussian smoothing)" |
465 , ylab = "Probability density" | 1470 , ylab = "Probability density" |
466 ) | 1471 ) |
467 } else { | 1472 } else { |
488 ```{r echo = FALSE} | 1493 ```{r echo = FALSE} |
489 | 1494 |
490 # prep for trt-median based imputation | 1495 # prep for trt-median based imputation |
491 | 1496 |
492 ``` | 1497 ``` |
493 ## Impute Missing Values | 1498 # Impute Missing Values |
494 | 1499 |
495 ```{r echo = FALSE} | 1500 ```{r echo = FALSE} |
496 | 1501 |
497 imp_smry_potential_before <- length(logvalues) | 1502 imp_smry_pot_peptides_before <- nrow(quant_data_log) |
498 imp_smry_missing_before <- number_to_impute | 1503 imp_smry_missing_values_before <- number_to_impute |
499 imp_smry_pct_missing <- pct_missing_values | 1504 imp_smry_pct_missing <- pct_missing_values |
500 | 1505 |
501 ``` | 1506 ``` |
502 | 1507 |
503 ```{r echo = FALSE} | 1508 ```{r echo = FALSE} |
504 #Determine number of cells to impute | 1509 #Determine number of cells to impute |
505 | 1510 |
506 ``` | 1511 ``` |
507 ```{r echo = FALSE} | 1512 ```{r echo = FALSE} |
508 | 1513 |
509 #Impute data | |
510 quant_data_imp <- quant_data | |
511 | |
512 # Identify which values are missing and need to be imputed | 1514 # Identify which values are missing and need to be imputed |
513 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) | 1515 ind <- which(is.na(quant_data), arr.ind = TRUE) |
514 | 1516 |
515 ``` | 1517 ``` |
516 ```{r echo = FALSE, results = 'asis'} | 1518 ```{r echo = FALSE, results = 'asis'} |
517 | 1519 |
518 # Apply imputation | 1520 # Apply imputation |
519 switch( | 1521 switch( |
520 imputation_method | 1522 imputation_method |
521 , "group-median" = { | 1523 , "group-median" = { |
1524 quant_data_imp <- quant_data | |
522 imputation_method_description <- | 1525 imputation_method_description <- |
523 paste("Substitute missing value with", | 1526 paste("Substitute missing value with", |
524 "median peptide intensity for sample group.\n" | 1527 "median peptide-intensity for sample group.\n" |
525 ) | 1528 ) |
526 sample_level_integers <- as.integer(sample_factor_levels) | 1529 sample_level_integers <- as.integer(sample_treatment_levels) |
527 for (i in seq_len(length(levels(sample_factor_levels)))) { | 1530 # Take the accurate ln(x+1) because the data are log-normally distributed |
1531 # and because median can involve an average of two measurements. | |
1532 quant_data_imp <- log1p(quant_data_imp) | |
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) { | |
1534 # Determine the columns for this factor-level | |
528 level_cols <- i == sample_level_integers | 1535 level_cols <- i == sample_level_integers |
529 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) | 1536 # Extract those columns |
530 quant_data_imp[ind, level_cols] <- | 1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] |
531 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] | 1538 # assign to ind the row-column pairs corresponding to each NA |
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | |
1540 # No group-median exists if there is only one sample | |
1541 # a given ppep has no measurement; otherwise, proceed. | |
1542 if (ncol(lvlsbst) > 1) { | |
1543 the_centers <- | |
1544 apply(lvlsbst, 1, median, na.rm = TRUE) | |
1545 for (j in seq_len(nrow(lvlsbst))) { | |
1546 for (k in seq_len(ncol(lvlsbst))) { | |
1547 if (is.na(lvlsbst[j, k])) { | |
1548 lvlsbst[j, k] <- the_centers[j] | |
1549 } | |
1550 } | |
1551 } | |
1552 quant_data_imp[, level_cols] <- lvlsbst | |
1553 } | |
532 } | 1554 } |
1555 # Take the accurate e^x - 1 to match scaling of original input. | |
1556 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
533 good_rows <- !is.na(rowMeans(quant_data_imp)) | 1557 good_rows <- !is.na(rowMeans(quant_data_imp)) |
534 } | 1558 } |
535 , "median" = { | 1559 , "median" = { |
1560 quant_data_imp <- quant_data | |
536 imputation_method_description <- | 1561 imputation_method_description <- |
537 paste("Substitute missing value with", | 1562 paste("Substitute missing value with", |
538 "median peptide intensity across all sample classes.\n" | 1563 "median peptide-intensity across all sample classes.\n" |
539 ) | 1564 ) |
540 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] | 1565 # Take the accurate ln(x+1) because the data are log-normally distributed |
541 good_rows <- !is.na(rowMeans(quant_data_imp)) | 1566 # and because median can involve an average of two measurements. |
1567 quant_data_imp <- log1p(quant_data_imp) | |
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | |
1569 # Take the accurate e^x - 1 to match scaling of original input. | |
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
1571 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
542 } | 1572 } |
543 , "mean" = { | 1573 , "mean" = { |
1574 quant_data_imp <- quant_data | |
544 imputation_method_description <- | 1575 imputation_method_description <- |
545 paste("Substitute missing value with", | 1576 paste("Substitute missing value with", |
546 "mean peptide intensity across all sample classes.\n" | 1577 "geometric-mean peptide intensity across all sample classes.\n" |
547 ) | 1578 ) |
548 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] | 1579 # Take the accurate ln(x+1) because the data are log-normally distributed, |
549 good_rows <- !is.na(rowMeans(quant_data_imp)) | 1580 # so arguments to mean should be previously transformed. |
1581 # this will have to be | |
1582 quant_data_imp <- log1p(quant_data_imp) | |
1583 # Assign to NA cells the mean for the row | |
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | |
1585 # Take the accurate e^x - 1 to match scaling of original input. | |
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
1587 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
550 } | 1588 } |
551 , "random" = { | 1589 , "random" = { |
552 m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd | 1590 quant_data_imp <- quant_data |
1591 m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd | |
553 # If you want results to be reproducible, you will want to call | 1592 # If you want results to be reproducible, you will want to call |
554 # base::set.seed before calling stats::rnorm | 1593 # base::set.seed before calling stats::rnorm |
555 imputation_method_description <- | 1594 imputation_method_description <- |
556 paste("Substitute each missing value with random intensity", | 1595 paste("Substitute each missing value with random intensity", |
557 sprintf( | 1596 sprintf( |
563 100 * mean_percentile)) | 1602 100 * mean_percentile)) |
564 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", | 1603 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", |
565 sd_percentile)) | 1604 sd_percentile)) |
566 quant_data_imp[ind] <- | 1605 quant_data_imp[ind] <- |
567 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) | 1606 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) |
568 good_rows <- !is.na(rowMeans(quant_data_imp)) | 1607 quant_data_imp_ln <- log1p(quant_data_imp) |
1608 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
569 } | 1609 } |
570 ) | 1610 ) |
1611 quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e | |
571 | 1612 |
572 if (length(good_rows) < 1) { | 1613 if (length(good_rows) < 1) { |
573 print("ERROR: Cannot impute data; there are no good rows!") | 1614 print("ERROR: Cannot impute data; there are no good rows!") |
574 return(-1) | 1615 return(-1) |
575 } | 1616 } |
579 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) | 1620 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) |
580 ``` | 1621 ``` |
581 | 1622 |
582 ```{r echo = FALSE} | 1623 ```{r echo = FALSE} |
583 | 1624 |
584 imp_smry_potential_after <- sum(good_rows) | 1625 imp_smry_pot_peptides_after <- sum(good_rows) |
585 imp_smry_rejected_after <- sum(!good_rows) | 1626 imp_smry_rejected_after <- sum(!good_rows) |
586 imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ])) | 1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) |
587 ``` | 1628 ``` |
588 ```{r echo = FALSE, results = 'asis'} | 1629 ```{r echo = FALSE, results = 'asis'} |
589 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | 1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf |
590 tabular_lines <- paste( | 1631 tabular_lines_fmt <- paste( |
591 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | 1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) |
592 " \\caption{Imputation Results}", | 1633 " \\caption{Imputation Results}", |
593 " \\centering", # \centering centers the table on the page | 1634 " \\centering", # \centering centers the table on the page |
594 " \\begin{tabular}{l c c c}", | 1635 " \\begin{tabular}{l c c c}", |
595 " \\hline\\hline", | 1636 " \\hline\\hline", |
604 "\\end{table}", | 1645 "\\end{table}", |
605 sep = "\n" | 1646 sep = "\n" |
606 ) | 1647 ) |
607 tabular_lines <- | 1648 tabular_lines <- |
608 sprintf( | 1649 sprintf( |
609 tabular_lines, | 1650 tabular_lines_fmt, |
610 imp_smry_potential_before, | 1651 imp_smry_pot_peptides_before, |
611 imp_smry_missing_before, | 1652 imp_smry_missing_values_before, |
612 imp_smry_pct_missing, | 1653 imp_smry_pct_missing, |
613 "%", | 1654 "%", |
614 imp_smry_potential_after, | 1655 imp_smry_pot_peptides_after, |
615 imp_smry_missing_after, | 1656 imp_smry_missing_values_after, |
616 imp_smry_rejected_after | 1657 imp_smry_rejected_after |
617 ) | 1658 ) |
618 cat(tabular_lines) | 1659 cat(tabular_lines) |
619 ``` | 1660 ``` |
620 ```{r echo = FALSE} | 1661 ```{r echo = FALSE} |
622 | 1663 |
623 # Zap rows where imputation was ineffective | 1664 # Zap rows where imputation was ineffective |
624 full_data <- full_data [good_rows, ] | 1665 full_data <- full_data [good_rows, ] |
625 quant_data <- quant_data [good_rows, ] | 1666 quant_data <- quant_data [good_rows, ] |
626 | 1667 |
1668 quant_data_imp <- quant_data_imp[good_rows, ] | |
627 write_debug_file(quant_data_imp) | 1669 write_debug_file(quant_data_imp) |
628 | |
629 quant_data_imp <- quant_data_imp[good_rows, ] | |
630 quant_data_imp_good_rows <- quant_data_imp | 1670 quant_data_imp_good_rows <- quant_data_imp |
631 | 1671 |
632 write_debug_file(quant_data_imp_good_rows) | 1672 write_debug_file(quant_data_imp_good_rows) |
633 ``` | 1673 ``` |
634 | 1674 |
646 as.matrix( | 1686 as.matrix( |
647 log10(quant_data_imp[!is.na(quant_data)]) | 1687 log10(quant_data_imp[!is.na(quant_data)]) |
648 ) | 1688 ) |
649 ) | 1689 ) |
650 | 1690 |
651 if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) { | 1691 if (sum(!is.na(d_original)) > 2) { |
1692 d_original <- density(d_original) | |
1693 } else { | |
1694 can_plot_before_after_imp <- FALSE | |
1695 } | |
1696 if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { | |
652 d_combined <- density(d_combined) | 1697 d_combined <- density(d_combined) |
653 d_original <- density(d_original) | |
654 } else { | 1698 } else { |
655 can_plot_before_after_imp <- FALSE | 1699 can_plot_before_after_imp <- FALSE |
656 } | 1700 } |
657 | 1701 |
658 if (sum(is.na(quant_data)) > 0) { | 1702 if (sum(is.na(quant_data)) > 0) { |
661 as.numeric( | 1705 as.numeric( |
662 as.matrix( | 1706 as.matrix( |
663 log10(quant_data_imp[is.na(quant_data)]) | 1707 log10(quant_data_imp[is.na(quant_data)]) |
664 ) | 1708 ) |
665 ) | 1709 ) |
666 if (sum(!is.na(d_combined)) > 2) { | 1710 if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { |
667 d_imputed <- (density(d_imputed)) | 1711 d_imputed <- (density(d_imputed)) |
668 } else { | 1712 } else { |
669 can_plot_before_after_imp <- FALSE | 1713 can_plot_before_after_imp <- FALSE |
670 } | 1714 } |
671 } else { | 1715 } else { |
674 } | 1718 } |
675 | 1719 |
676 ``` | 1720 ``` |
677 | 1721 |
678 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
1723 zero_sd_rownames <- | |
1724 rownames(quant_data_imp)[ | |
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | |
1726 ] | |
1727 | |
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | |
1729 stop("All peptides have zero standard deviation. Cannot continue.") | |
1730 } | |
1731 if (length(zero_sd_rownames) > 0) { | |
1732 cat( | |
1733 sprintf("%d peptides with zero variance were removed from statistical consideration", | |
1734 length(zero_sd_rownames) | |
1735 ) | |
1736 ) | |
1737 zap_named_rows <- function(df, nms) { | |
1738 return(df[!(row.names(df) %in% nms), ]) | |
1739 } | |
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | |
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | |
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames) | |
1743 } | |
1744 | |
679 if (sum(is.na(quant_data)) > 0) { | 1745 if (sum(is.na(quant_data)) > 0) { |
680 cat("\\leavevmode\\newpage\n") | 1746 cat("\\leavevmode\\newpage\n") |
681 # data visualization | 1747 # data visualization |
682 old_par <- par( | 1748 old_par <- par( |
683 mai = par("mai") + c(0.5, 0, 0, 0) | 1749 mai = par("mai") + c(0.5, 0, 0, 0) |
684 ) | 1750 ) |
1751 # Copy quant data to x | |
685 x <- quant_data | 1752 x <- quant_data |
686 x <- blue_dots <- x / x | 1753 # x gets to have values of: |
687 blue_dots <- log10(blue_dots * quant_data) | 1754 # - NA for observed values |
1755 # - 1 for missing values | |
688 x[is.na(x)] <- 0 | 1756 x[is.na(x)] <- 0 |
689 x[x == 1] <- NA | 1757 x[x > 1] <- NA |
690 x[x == 0] <- 1 | 1758 x[x == 0] <- 1 |
691 quant_data_imp_log10 <- | 1759 # Log-transform imputed data |
692 log10(quant_data_imp) | 1760 # update variable because rows may have been eliminated from quant_data_imp |
1761 quant_data_imp_log10 <- log10(quant_data_imp) | |
693 | 1762 |
694 write_debug_file(quant_data_imp_log10) | 1763 write_debug_file(quant_data_imp_log10) |
695 | 1764 |
1765 # Set blue_dots to log of quant data or NA for NA quant data | |
1766 blue_dots <- log10(quant_data) | |
1767 # Set red_dots to log of imputed data or NA for observed quant data | |
696 red_dots <- quant_data_imp_log10 * x | 1768 red_dots <- quant_data_imp_log10 * x |
1769 | |
697 count_red <- sum(!is.na(red_dots)) | 1770 count_red <- sum(!is.na(red_dots)) |
698 count_blue <- sum(!is.na(blue_dots)) | 1771 count_blue <- sum(!is.na(blue_dots)) |
699 ylim_save <- ylim <- c( | 1772 ylim_save <- ylim <- c( |
700 min(red_dots, blue_dots, na.rm = TRUE), | 1773 min(red_dots, blue_dots, na.rm = TRUE), |
701 max(red_dots, blue_dots, na.rm = TRUE) | 1774 max(red_dots, blue_dots, na.rm = TRUE) |
713 boxplot( | 1786 boxplot( |
714 blue_dots | 1787 blue_dots |
715 , las = 1 # "always horizontal" | 1788 , las = 1 # "always horizontal" |
716 , col = const_boxplot_fill | 1789 , col = const_boxplot_fill |
717 , ylim = ylim | 1790 , ylim = ylim |
718 , main = "Peptide intensities before and after imputation" | 1791 , main = "Peptide intensities after eliminating unusable peptides" |
719 , sub = boxplot_sub | 1792 , sub = boxplot_sub |
720 , xlab = "Sample" | 1793 , xlab = "Sample" |
721 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
722 ) | 1795 ) |
723 | 1796 |
744 col = "red", # Color of the symbol | 1817 col = "red", # Color of the symbol |
745 vertical = TRUE, # Vertical mode | 1818 vertical = TRUE, # Vertical mode |
746 add = TRUE # Add it over | 1819 add = TRUE # Add it over |
747 ) | 1820 ) |
748 | 1821 |
749 } else { | 1822 } |
750 # violin plot | 1823 if (TRUE) { |
1824 # show measured values in blue on left half-violin plot | |
751 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | 1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") |
752 vioplot::vioplot( | 1826 vioplot::vioplot( |
753 x = lapply(blue_dots, function(x) x[!is.na(x)]), | 1827 x = lapply(blue_dots, function(x) x[!is.na(x)]), |
754 col = "lightblue1", | 1828 col = "lightblue1", |
755 side = "left", | 1829 side = "left", |
758 main = "Distributions of observed and imputed data", | 1832 main = "Distributions of observed and imputed data", |
759 sub = "Light blue = observed data; Pink = imputed data", | 1833 sub = "Light blue = observed data; Pink = imputed data", |
760 xlab = "Sample", | 1834 xlab = "Sample", |
761 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") |
762 ) | 1836 ) |
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
1838 cols_to_delete <- c() | |
1839 for (ix in seq_len(length(red_violins))) { | |
1840 if (length(red_violins[[ix]]) < 1) { | |
1841 cols_to_delete <- c(cols_to_delete, ix) | |
1842 } | |
1843 } | |
1844 # destroy any unimputable columns | |
1845 if (!is.null(cols_to_delete)) { | |
1846 red_violins <- red_violins[-cols_to_delete] | |
1847 } | |
1848 # plot imputed values in red on right half-violin plot | |
763 vioplot::vioplot( | 1849 vioplot::vioplot( |
764 x = lapply(red_dots, function(x) x[!is.na(x)]), | 1850 x = red_violins, |
765 col = "lightpink1", | 1851 col = "lightpink1", |
766 side = "right", | 1852 side = "right", |
767 plotCentre = "line", | 1853 plotCentre = "line", |
768 add = T | 1854 add = TRUE |
769 ) | 1855 ) |
770 } | 1856 } |
771 | 1857 |
772 par(old_par) | 1858 par(old_par) |
773 | 1859 |
774 # density plot | 1860 # density plot |
775 cat("\\leavevmode\n\n\n\n\n\n\n") | 1861 cat("\\leavevmode\n\n\n\n\n\n\n") |
776 if (can_plot_before_after_imp) { | 1862 if (can_plot_before_after_imp) { |
777 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) | 1863 ylim <- c( |
1864 0, | |
1865 max( | |
1866 if (is.list(d_combined)) d_combined$y else d_combined, | |
1867 if (is.list(d_original)) d_original$y else d_original, | |
1868 if (is.list(d_imputed)) d_imputed$y else d_imputed, | |
1869 na.rm = TRUE | |
1870 ) | |
1871 ) | |
778 plot( | 1872 plot( |
779 d_combined, | 1873 d_combined, |
780 ylim = ylim, | 1874 ylim = ylim, |
781 sub = | 1875 sub = |
782 paste( | 1876 paste( |
797 } | 1891 } |
798 cat("\\leavevmode\\newpage\n") | 1892 cat("\\leavevmode\\newpage\n") |
799 } | 1893 } |
800 ``` | 1894 ``` |
801 | 1895 |
802 ## Perform Quantile Normalization | 1896 # Perform Quantile Normalization |
1897 | |
1898 The excellent `normalize.quantiles` function from | |
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | |
1900 performs "quantile normalization" as described Bolstad *et al.* (2003), | |
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | |
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | |
1903 i.e., it assumes that the goal is to detect | |
1904 subtle differences among grossly similar samples (having similar distributions) | |
1905 by equailzing intra-quantile quantitations. | |
1906 Unfortunately, one software library upon which it depends | |
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | |
1908 that requires that a specific, non-concurrent version of the library be | |
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | |
1910 ``` | |
1911 conda install bioconductor-preprocesscore openblas=0.3.3 | |
1912 ``` | |
1913 | |
803 | 1914 |
804 <!-- | 1915 <!-- |
805 # Apply quantile normalization using preprocessCore::normalize.quantiles | 1916 # Apply quantile normalization using preprocessCore::normalize.quantiles |
806 # --- | 1917 # --- |
807 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | 1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html |
812 # ``` | 1923 # ``` |
813 # conda installation (necessary because of a bug in recent openblas): | 1924 # conda installation (necessary because of a bug in recent openblas): |
814 # conda install bioconductor-preprocesscore openblas=0.3.3 | 1925 # conda install bioconductor-preprocesscore openblas=0.3.3 |
815 # ... | 1926 # ... |
816 # --- | 1927 # --- |
817 # normalize.quantiles {preprocessCore} -- Quantile Normalization | 1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization |
818 # | 1929 # |
819 # Description: | 1930 # Description: |
820 # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. | 1931 # Using a normalization based upon quantiles, this function normalizes a |
1932 # matrix of probe level intensities. | |
1933 # | |
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | |
1935 # assumption that the data is missing at random. | |
821 # | 1936 # |
822 # Usage: | 1937 # Usage: |
823 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | 1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) |
824 # | 1939 # |
825 # Arguments: | 1940 # Arguments: |
857 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
858 # http://bmbolstad.com/misc/normalize/normalize.html | 1973 # http://bmbolstad.com/misc/normalize/normalize.html |
859 # ... | 1974 # ... |
860 --> | 1975 --> |
861 ```{r echo = FALSE, results = 'asis'} | 1976 ```{r echo = FALSE, results = 'asis'} |
862 library(preprocessCore) | |
863 | 1977 |
864 if (nrow(quant_data_imp) > 0) { | 1978 if (nrow(quant_data_imp) > 0) { |
865 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) | 1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles( |
1980 as.matrix(quant_data_imp), keep.names = TRUE | |
1981 ) | |
866 } else { | 1982 } else { |
867 quant_data_imp_qn <- as.matrix(quant_data_imp) | 1983 quant_data_imp_qn <- as.matrix(quant_data_imp) |
868 } | 1984 } |
869 | 1985 |
870 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | 1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
871 names(quant_data_imp_qn) <- names(quant_data_imp) | |
872 | 1987 |
873 write_debug_file(quant_data_imp_qn) | 1988 write_debug_file(quant_data_imp_qn) |
874 | 1989 |
875 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | 1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
876 rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp) | |
877 | 1991 |
878 write_debug_file(quant_data_imp_qn_log) | 1992 write_debug_file(quant_data_imp_qn_log) |
879 | 1993 |
880 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | 1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
881 | 1995 |
882 any_nan <- function(x) { | |
883 !any(x == "NaN") | |
884 } | |
885 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | 1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) |
886 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] | 1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls |
1998 | |
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | |
887 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | 2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
888 rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel] | |
889 | 2001 |
890 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) | 2002 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) |
891 rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp) | |
892 | 2003 |
893 write_debug_file(quant_data_imp_qn_ls) | 2004 write_debug_file(quant_data_imp_qn_ls) |
894 write_debug_file(quant_data_imp_qn_ls2) | 2005 write_debug_file(quant_data_imp_qn_ls2) |
895 | 2006 |
896 #output quantile normalized data | 2007 # Create data.frame used by ANOVA analysis |
897 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | 2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
898 write.table( | |
899 data_table_imp_qn_lt, | |
900 file = imp_qn_lt_data_filenm, | |
901 sep = "\t", | |
902 col.names = TRUE, | |
903 row.names = FALSE, | |
904 quote = FALSE | |
905 ) | |
906 | |
907 ``` | 2009 ``` |
908 | 2010 |
909 <!-- ACE insertion begin --> | 2011 <!-- ACE insertion begin --> |
910 ### Checking that normalized, imputed, log-transformed sample distributions are similar: | 2012 ## Are normalized, imputed, log-transformed sample distributions similar? |
911 | 2013 |
912 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | 2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
913 | 2015 |
914 # Save unimputed quant_data_log for plotting below | 2016 # Save unimputed quant_data_log for plotting below |
915 unimputed_quant_data_log <- quant_data_log | 2017 unimputed_quant_data_log <- quant_data_log |
916 | 2018 |
917 # log10 transform (after preparing for zero values, | 2019 # log10 transform (after preparing for zero values, |
918 # which should never happen...) | 2020 # which should never happen...) |
919 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 | 2021 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 |
920 quant_data_log <- log10(quant_data_imp_qn) | 2022 quant_data_log <- log10(quant_data_imp_qn) |
921 | |
922 # Output with imputed, un-normalized data | |
923 | |
924 data_table_imputed <- cbind(full_data[1:9], quant_data_imp) | |
925 write.table( | |
926 data_table_imputed | |
927 , file = imputed_data_filename | |
928 , sep = "\t" | |
929 , col.names = TRUE | |
930 , row.names = FALSE | |
931 , quote = FALSE | |
932 ) | |
933 | 2023 |
934 how_many_peptides <- nrow(quant_data_log) | 2024 how_many_peptides <- nrow(quant_data_log) |
935 | 2025 |
936 if ((how_many_peptides) > 0) { | 2026 if ((how_many_peptides) > 0) { |
937 cat( | 2027 cat( |
968 ``` | 2058 ``` |
969 | 2059 |
970 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | 2060 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} |
971 if (nrow(quant_data_log) > 1) { | 2061 if (nrow(quant_data_log) > 1) { |
972 quant_data_log_stack <- stack(quant_data_log) | 2062 quant_data_log_stack <- stack(quant_data_log) |
973 ggplot( | 2063 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + |
974 quant_data_log_stack, | 2064 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + |
975 aes(x = values) | 2065 ggplot2::ylab("Probability density") + |
976 ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | 2066 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) |
977 ylab("Probability density") + | |
978 geom_density( | |
979 aes(group = ind, colour = ind), | |
980 na.rm = TRUE | |
981 ) | |
982 } else { | 2067 } else { |
983 cat("No density plot because there are fewer than two peptides to plot.\n\n") | 2068 cat("No density plot because there are fewer than two peptides to plot.\n\n") |
984 } | 2069 } |
985 ``` | 2070 ``` |
986 ```{r echo = FALSE, results = 'asis'} | 2071 ```{r echo = FALSE, results = 'asis'} |
987 cat("\\leavevmode\\newpage\n") | 2072 cat("\\leavevmode\\newpage\n") |
988 ``` | 2073 ``` |
989 | 2074 |
990 ## Perform ANOVA Filters | 2075 # ANOVA Analysis |
991 | 2076 |
992 ```{r, echo = FALSE} | 2077 ```{r, echo = FALSE} |
993 # Make new data frame containing only Phosphopeptides | 2078 # Make new data frame containing only Phosphopeptides |
994 # to connect preANOVA to ANOVA (connect_df) | 2079 # to connect preANOVA to ANOVA (connect_df) |
995 connect_df <- data.frame( | 2080 connect_df <- data.frame( |
998 ) | 2083 ) |
999 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | 2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
1000 ``` | 2085 ``` |
1001 | 2086 |
1002 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | 2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
1003 count_of_factor_levels <- length(levels(sample_factor_levels)) | 2088 count_of_treatment_levels <- length(levels(sample_treatment_levels)) |
1004 if (count_of_factor_levels < 2) { | 2089 if (count_of_treatment_levels < 2) { |
1005 nuke_control_sequences <- | 2090 nuke_control_sequences <- |
1006 function(s) { | 2091 function(s) { |
1007 s <- gsub("[\\]", "xyzzy_plugh", s) | 2092 s <- gsub("[\\]", "xyzzy_plugh", s) |
1008 s <- gsub("[$]", "\\\\$", s) | 2093 s <- gsub("[$]", "\\\\$", s) |
1009 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | 2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) |
1052 ) | 2137 ) |
1053 | 2138 |
1054 cat("\n\n\n") | 2139 cat("\n\n\n") |
1055 cat("Sample group assignments are:\n", | 2140 cat("Sample group assignments are:\n", |
1056 "\\begin{quote}\n", | 2141 "\\begin{quote}\n", |
1057 paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"), | 2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), |
1058 "\n\\end{quote}\n\n") | 2143 "\n\\end{quote}\n\n") |
1059 | 2144 |
1060 } else { | 2145 } else { |
2146 | |
1061 p_value_data_anova_ps <- | 2147 p_value_data_anova_ps <- |
1062 apply( | 2148 apply( |
1063 quant_data_imp_qn_log, | 2149 quant_data_imp_qn_log, |
1064 1, | 2150 1, |
1065 anova_func, | 2151 anova_func, |
1066 grouping_factor = sample_factor_levels | 2152 grouping_factor = sample_treatment_levels, |
2153 one_way_f = one_way_all_categories | |
1067 ) | 2154 ) |
1068 | 2155 |
1069 p_value_data_anova_ps_fdr <- | 2156 p_value_data_anova_ps_fdr <- |
1070 p.adjust(p_value_data_anova_ps, method = "fdr") | 2157 p.adjust(p_value_data_anova_ps, method = "fdr") |
1071 p_value_data <- data.frame( | 2158 p_value_data <- data.frame( |
1079 # output ANOVA file to constructed filename, | 2166 # output ANOVA file to constructed filename, |
1080 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
1081 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" |
1082 | 2169 |
1083 # Re-output datasets to include p-values | 2170 # Re-output datasets to include p-values |
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | |
1084 write.table( | 2172 write.table( |
1085 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp), | 2173 cbind(metadata_plus_p, quant_data_imp), |
1086 file = imputed_data_filename, | 2174 file = imputed_data_filename, |
1087 sep = "\t", | 2175 sep = "\t", |
1088 col.names = TRUE, | 2176 col.names = TRUE, |
1089 row.names = FALSE, | 2177 row.names = FALSE, |
1090 quote = FALSE | 2178 quote = FALSE |
1091 ) | 2179 ) |
1092 | 2180 |
1093 write.table( | 2181 write.table( |
1094 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log), | 2182 cbind(metadata_plus_p, quant_data_imp_qn_log), |
1095 file = imp_qn_lt_data_filenm, | 2183 file = imp_qn_lt_data_filenm, |
1096 sep = "\t", | 2184 sep = "\t", |
1097 col.names = TRUE, | 2185 col.names = TRUE, |
1098 row.names = FALSE, | 2186 row.names = FALSE, |
1099 quote = FALSE | 2187 quote = FALSE |
1133 # <!-- ACE insertion start --> | 2221 # <!-- ACE insertion start --> |
1134 | 2222 |
1135 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | 2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { |
1136 if (first_page_suppress == 1) { | 2224 if (first_page_suppress == 1) { |
1137 first_page_suppress <- 0 | 2225 first_page_suppress <- 0 |
1138 } | 2226 } else { |
1139 else { | |
1140 cat("\\newpage\n") | 2227 cat("\\newpage\n") |
1141 } | 2228 } |
1142 cat(sprintf( | 2229 if (nrow(filtered_data_filtered) > 1) { |
1143 "Intensities for %d peptides whose adjusted p-value < %0.2f:\n", | 2230 subsection_header(sprintf( |
1144 nrow(filtered_data_filtered), | 2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", |
1145 cutoff | 2232 nrow(filtered_data_filtered), |
1146 )) | 2233 cutoff |
2234 )) | |
2235 } else { | |
2236 subsection_header(sprintf( | |
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | |
2238 rownames(filtered_data_filtered)[1], | |
2239 cutoff | |
2240 )) | |
2241 } | |
1147 cat("\n\n\n") | 2242 cat("\n\n\n") |
1148 cat("\n\n\n") | 2243 cat("\n\n\n") |
1149 | 2244 |
1150 old_oma <- par("oma") | 2245 old_oma <- par("oma") |
1151 old_par <- par( | 2246 old_par <- par( |
1156 fin = c(9, 7.25) | 2251 fin = c(9, 7.25) |
1157 ) | 2252 ) |
1158 # ref: https://r-charts.com/distribution/add-points-boxplot/ | 2253 # ref: https://r-charts.com/distribution/add-points-boxplot/ |
1159 # Vertical plot | 2254 # Vertical plot |
1160 colnames(filtered_data_filtered) <- sample_name_matches | 2255 colnames(filtered_data_filtered) <- sample_name_matches |
1161 boxplot( | 2256 tryCatch( |
1162 filtered_data_filtered, | 2257 boxplot( |
1163 main = "Imputed, normalized intensities", # no line plot | 2258 filtered_data_filtered, |
1164 las = 1, | 2259 main = "Imputed, normalized intensities", # no line plot |
1165 col = const_boxplot_fill, | 2260 las = 1, |
1166 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | 2261 col = const_boxplot_fill, |
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
2263 ), | |
2264 error = function(e) print(e) | |
1167 ) | 2265 ) |
1168 par(old_par) | 2266 par(old_par) |
1169 } else { | 2267 } else { |
1170 cat(sprintf( | 2268 cat(sprintf( |
1171 "%s < %0.2f\n\n\n\n\n", | 2269 "%s < %0.2f\n\n\n\n\n", |
1173 cutoff | 2271 cutoff |
1174 )) | 2272 )) |
1175 } | 2273 } |
1176 | 2274 |
1177 if (nrow(filtered_data_filtered) > 0) { | 2275 if (nrow(filtered_data_filtered) > 0) { |
1178 #Add Phosphopeptide column to anova_filtered table | 2276 # Add Phosphopeptide column to anova_filtered table |
1179 anova_filtered_merge <- merge( | 2277 # The assumption here is that the first intensity is unique; |
2278 # this is a hokey assumption but almost definitely will | |
2279 # be true in the real world unless there is a computation | |
2280 # error upstream. | |
2281 anova_filtered_merge <- base::merge( | |
1180 x = connect_df, | 2282 x = connect_df, |
1181 y = filtered_data_filtered, | 2283 y = filtered_data_filtered, |
1182 by.x = "Intensity", | 2284 by.x = "Intensity", |
1183 by.y = 1 | 2285 by.y = 1 |
1184 ) | 2286 ) |
1185 anova_filtered_merge_order <- rownames(filtered_p) | 2287 anova_filtered_merge_order <- rownames(filtered_p) |
2288 | |
2289 anova_filtered <- data.frame( | |
2290 ppep = anova_filtered_merge$Phosphopeptide, | |
2291 intense = anova_filtered_merge$Intensity, | |
2292 data = anova_filtered_merge[, 2:number_of_samples + 1] | |
2293 ) | |
2294 colnames(anova_filtered) <- | |
2295 c("Phosphopeptide", colnames(filtered_data_filtered)) | |
2296 | |
2297 # Merge qualitative columns into the ANOVA data | |
2298 output_table <- data.frame(anova_filtered$Phosphopeptide) | |
2299 output_table <- base::merge( | |
2300 x = output_table, | |
2301 y = data_table_imp_qn_lt, | |
2302 by.x = "anova_filtered.Phosphopeptide", | |
2303 by.y = "Phosphopeptide" | |
2304 ) | |
2305 | |
2306 # Produce heatmap to visualize significance and the effect of imputation | |
1186 | 2307 |
1187 anova_filtered_merge_format <- sapply( | 2308 anova_filtered_merge_format <- sapply( |
1188 X = filtered_p$fdr_adjusted_anova_p | 2309 X = filtered_p$fdr_adjusted_anova_p |
1189 , | 2310 , |
1190 FUN = function(x) { | 2311 FUN = function(x) { |
1193 else | 2314 else |
1194 paste0("(%0.4e) %s") | 2315 paste0("(%0.4e) %s") |
1195 } | 2316 } |
1196 ) | 2317 ) |
1197 | 2318 |
1198 anova_filtered <- data.table( | 2319 cat_hm_heading <- function(m, cutoff) { |
1199 anova_filtered_merge$Phosphopeptide | 2320 cat("\\newpage\n") |
1200 , | 2321 if (nrow(m) > intensity_hm_rows) { |
1201 anova_filtered_merge$Intensity | 2322 subsection_header( |
1202 , | 2323 paste( |
1203 anova_filtered_merge[, 2:number_of_samples + 1] | 2324 sprintf("Heatmap for the %d most-significant peptides", |
1204 ) | 2325 intensity_hm_rows), |
1205 colnames(anova_filtered) <- | 2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff) |
1206 c("Phosphopeptide", colnames(filtered_data_filtered)) | 2327 ) |
1207 | 2328 ) |
1208 # Merge qualitative columns into the ANOVA data | 2329 } else { |
1209 output_table <- data.frame(anova_filtered$Phosphopeptide) | 2330 if (nrow(m) == 1) { |
1210 output_table <- merge( | 2331 return(FALSE) |
1211 x = output_table, | 2332 } else { |
1212 y = data_table_imp_qn_lt, | 2333 subsection_header( |
1213 by.x = "anova_filtered.Phosphopeptide", | 2334 paste( |
1214 by.y = "Phosphopeptide" | 2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)), |
1215 ) | 2336 sprintf("adjusted p-value < %0.2f\n", cutoff) |
1216 | 2337 ) |
1217 # Produce heatmap to visualize significance and the effect of imputation | 2338 ) |
2339 } | |
2340 } | |
2341 cat("\n\n\n") | |
2342 cat("\n\n\n") | |
2343 return(TRUE) | |
2344 } | |
2345 | |
2346 # construct matrix with appropriate rownames | |
1218 m <- | 2347 m <- |
1219 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | 2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) |
1220 m_nan_rows <- rowSums( | |
1221 matrix( | |
1222 as.integer(is.na(m)), | |
1223 nrow = nrow(m) | |
1224 ) | |
1225 ) | |
1226 m <- m[!m_nan_rows, , drop = FALSE] | |
1227 if (nrow(m) > 0) { | 2349 if (nrow(m) > 0) { |
1228 rownames_m <- rownames(m) | 2350 rownames_m <- rownames(m) |
1229 rownames(m) <- sapply( | 2351 rownames(m) <- sapply( |
1230 X = seq_len(nrow(m)) | 2352 X = seq_len(nrow(m)) |
1231 , | 2353 , |
1235 filtered_p$fdr_adjusted_anova_p[i], | 2357 filtered_p$fdr_adjusted_anova_p[i], |
1236 rownames_m[i] | 2358 rownames_m[i] |
1237 ) | 2359 ) |
1238 } | 2360 } |
1239 ) | 2361 ) |
1240 how_many_peptides <- min(50, nrow(m)) | 2362 } |
1241 number_of_peptides_found <- how_many_peptides | 2363 # draw the heading and heatmap |
1242 if (nrow(m) > 1) { | 2364 if (nrow(m) > 0) { |
1243 m_margin <- m[how_many_peptides:1, ] | 2365 number_of_peptides_found <- |
1244 margins <- | 2366 draw_intensity_heatmap( |
1245 c(max(nchar(colnames(m_margin))) * 10 / 16 # col | 2367 m = m, |
1246 , max(nchar(rownames(m_margin))) * 5 / 16 # row | 2368 cutoff = cutoff, |
1247 ) | 2369 hm_heading_function = cat_hm_heading, |
1248 } | 2370 hm_main_title = "Unimputed, unnormalized log(intensities)", |
1249 | 2371 suppress_row_dendrogram = FALSE |
1250 cat("\\newpage\n") | |
1251 if (nrow(m) > 50) { | |
1252 cat("Heatmap for the 50 most-significant peptides", | |
1253 sprintf( | |
1254 "whose adjusted p-value < %0.2f\n", | |
1255 cutoff) | |
1256 ) | |
1257 } else { | |
1258 if (nrow(m) == 1) { | |
1259 next | |
1260 } else { | |
1261 cat( | |
1262 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | |
1263 sprintf("adjusted p-value < %0.2f\n", cutoff) | |
1264 ) | 2372 ) |
1265 } | |
1266 } | |
1267 cat("\n\n\n") | |
1268 cat("\n\n\n") | |
1269 try( | |
1270 if (nrow(m) > 1) { | |
1271 old_oma <- par("oma") | |
1272 par(cex.main = 0.6) | |
1273 heatmap( | |
1274 m[how_many_peptides:1, ], | |
1275 Rowv = NA, | |
1276 Colv = NA, | |
1277 cexRow = 0.7, | |
1278 cexCol = 0.8, | |
1279 scale = "row", | |
1280 margins = margins, | |
1281 main = | |
1282 "Unimputed, unnormalized intensities", | |
1283 xlab = "", | |
1284 las = 1 #, fin = c(9, 5.5) | |
1285 ) | |
1286 } | |
1287 ) | |
1288 } | 2373 } |
1289 } | 2374 } |
1290 } | 2375 } |
1291 } | 2376 } |
1292 cat("\\leavevmode\n\n\n") | 2377 cat("\\leavevmode\n\n\n") |
1293 ``` | 2378 ``` |
2379 | |
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
2381 | |
2382 if (count_of_treatment_levels > 1) { | |
2383 # Prepare two-way contrasts with adjusted p-values | |
2384 # Strategy: | |
2385 # - use imputed, log-transformed data: | |
2386 # - remember this when computing log2(fold-change) | |
2387 # - each contrast is between a combination of trt levels | |
2388 # - for each contrast, compute samples that are members | |
2389 # - compute one-way test: | |
2390 # - use `oneway.test` (Welch test) if numbers of samples | |
2391 # are not equivalent between trt levels | |
2392 # - otherwise, aov is fine but offers no advantage | |
2393 # - adjust p-value, assuming that | |
2394 # (# of pppeps)*(# of contrasts) tests were performed | |
2395 | |
2396 # Each contrast is between a combination of trt levels | |
2397 m2 <- combn( | |
2398 x = seq_len(length(levels(sample_treatment_levels))), | |
2399 m = 2, | |
2400 simplify = TRUE | |
2401 ) | |
2402 contrast_count <- ncol(m2) | |
2403 | |
2404 # For each contrast, compute samples that are members | |
2405 # - local function to construct a data.frame for each contrast | |
2406 # - the contrast in the first "column" | |
2407 f_m2 <- | |
2408 function(cntrst, lvl1, lvl2) { | |
2409 return( | |
2410 data.frame( | |
2411 contrast = cntrst, | |
2412 level = sample_treatment_levels[ | |
2413 sample_treatment_levels %in% | |
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
2415 ], | |
2416 label = sample_name_matches[ | |
2417 sample_treatment_levels %in% | |
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
2419 ] | |
2420 ) | |
2421 ) | |
2422 } | |
2423 # - compute a df for each contrast | |
2424 sample_level_dfs <- lapply( | |
2425 X = 1:contrast_count, | |
2426 FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) | |
2427 ) | |
2428 # - compute a single df for all contrasts | |
2429 combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) | |
2430 | |
2431 # - dispose objects to free resources | |
2432 rm(sample_level_dfs) | |
2433 | |
2434 # - write the df to a DB for later join-per-contrast | |
2435 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
2436 | |
2437 RSQLite::dbWriteTable( | |
2438 conn = db, | |
2439 name = "contrast", | |
2440 value = combined_contrast_df, | |
2441 overwrite = TRUE | |
2442 ) | |
2443 | |
2444 # Create UK for insert | |
2445 ddl_exec(db, " | |
2446 CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx | |
2447 ON contrast(contrast, label); | |
2448 " | |
2449 ) | |
2450 # Create indexes for join | |
2451 ddl_exec(db, " | |
2452 -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label | |
2453 CREATE INDEX IF NOT EXISTS contrast__label__idx | |
2454 ON contrast(label); | |
2455 " | |
2456 ) | |
2457 ddl_exec(db, " | |
2458 -- index for joining two contrast_lvl_ppep_avg_quant on contrast | |
2459 CREATE INDEX IF NOT EXISTS contrast__contrast__idx | |
2460 ON contrast(contrast); | |
2461 " | |
2462 ) | |
2463 ddl_exec(db, " | |
2464 -- index for joining two contrast_lvl_ppep_avg_quant on phophospep | |
2465 CREATE INDEX IF NOT EXISTS contrast__level__idx | |
2466 ON contrast(level); | |
2467 " | |
2468 ) | |
2469 # - dispose objects to free resources | |
2470 rm(combined_contrast_df) | |
2471 | |
2472 # Use imputed, log-transformed data | |
2473 # - remember that this was donoe when computing log2(fold-change) | |
2474 # - melt data matrix for use in later join-per-contrast | |
2475 casted <- cbind( | |
2476 data.frame(vrbl = rownames(quant_data_imp_qn_log)), | |
2477 quant_data_imp_qn_log | |
2478 ) | |
2479 quant_data_imp_qn_log_melted <- reshape2::melt( | |
2480 casted, | |
2481 id.vars = "vrbl" | |
2482 ) | |
2483 colnames(quant_data_imp_qn_log_melted) <- | |
2484 c("phosphopep", "sample", "quant") | |
2485 # - dispose objects to free resources | |
2486 rm(casted) | |
2487 | |
2488 # - write the df to a DB for use in later join-per-contrast | |
2489 RSQLite::dbWriteTable( | |
2490 conn = db, | |
2491 name = "ppep_smpl_qnlt", | |
2492 value = quant_data_imp_qn_log_melted, | |
2493 overwrite = TRUE | |
2494 ) | |
2495 # Create UK for insert | |
2496 ddl_exec(db, " | |
2497 CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx | |
2498 ON ppep_smpl_qnlt(phosphopep, sample); | |
2499 " | |
2500 ) | |
2501 # Create index for join | |
2502 ddl_exec(db, " | |
2503 -- index for join in contrast_ppep_smpl_qnlt | |
2504 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx | |
2505 ON ppep_smpl_qnlt(sample); | |
2506 " | |
2507 ) | |
2508 ddl_exec(db, " | |
2509 -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep | |
2510 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx | |
2511 ON ppep_smpl_qnlt(phosphopep); | |
2512 " | |
2513 ) | |
2514 # - dispose objects to free resources | |
2515 rm(quant_data_imp_qn_log_melted) | |
2516 | |
2517 # - drop views if exist | |
2518 ddl_exec(db, " | |
2519 -- drop view dependent on contrast_lvl_ppep_avg_quant | |
2520 DROP VIEW IF EXISTS v_contrast_log2_fc; | |
2521 " | |
2522 ) | |
2523 ddl_exec(db, " | |
2524 -- drop table dependent on contrast_ppep_smpl_qnlt | |
2525 DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; | |
2526 " | |
2527 ) | |
2528 ddl_exec(db, " | |
2529 DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; | |
2530 " | |
2531 ) | |
2532 ddl_exec(db, " | |
2533 DROP VIEW IF EXISTS v_contrast_lvl_metadata; | |
2534 " | |
2535 ) | |
2536 ddl_exec(db, " | |
2537 -- drop view dependent on contrast_ppep_smpl_qnlt | |
2538 DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; | |
2539 " | |
2540 ) | |
2541 ddl_exec(db, " | |
2542 DROP VIEW IF EXISTS v_contrast_lvl_lvl; | |
2543 " | |
2544 ) | |
2545 ddl_exec(db, " | |
2546 -- drop view upon which other views depend | |
2547 DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; | |
2548 " | |
2549 ) | |
2550 # - create view | |
2551 dml_no_rows_exec(db, " | |
2552 -- view contrast_ppep_smpl_qnlt is used for each phopshopep to | |
2553 -- compute p-value for test of trt effect for two trt levels | |
2554 CREATE VIEW contrast_ppep_smpl_qnlt | |
2555 AS | |
2556 SELECT contrast, | |
2557 level, | |
2558 phosphopep, | |
2559 sample, | |
2560 quant | |
2561 FROM contrast AS c, | |
2562 ppep_smpl_qnlt AS q | |
2563 WHERE q.sample = c.label | |
2564 ORDER BY contrast, level, phosphopep | |
2565 ; | |
2566 " | |
2567 ) | |
2568 # - create simplification views | |
2569 dml_no_rows_exec(db, " | |
2570 CREATE VIEW v_contrast_lvl_metadata | |
2571 AS | |
2572 SELECT contrast, | |
2573 level, | |
2574 group_concat(label, ';') AS samples | |
2575 FROM contrast | |
2576 GROUP BY contrast, level | |
2577 /* view v_contrast_lvl_metadata is used | |
2578 to simplify creation of table contrast_lvl_lvl_metadata */ | |
2579 ; | |
2580 " | |
2581 ) | |
2582 dml_no_rows_exec(db, " | |
2583 CREATE VIEW v_contrast_lvl_ppep_avg_quant | |
2584 AS | |
2585 SELECT contrast, | |
2586 level, | |
2587 phosphopep, | |
2588 avg(quant) AS avg_quant | |
2589 FROM contrast_ppep_smpl_qnlt | |
2590 GROUP BY contrast, level, phosphopep | |
2591 /* view v_contrast_lvl_ppep_avg_quant is used | |
2592 to simplify view v_contrast_log2_fc */ | |
2593 ; | |
2594 " | |
2595 ) | |
2596 | |
2597 # - create contrast-metadata table | |
2598 dml_no_rows_exec(db, " | |
2599 CREATE TABLE contrast_lvl_lvl_metadata | |
2600 AS | |
2601 SELECT DISTINCT | |
2602 a.contrast AS ab_contrast, | |
2603 a.level AS a_level, | |
2604 b.level AS b_level, | |
2605 a.samples AS a_samples, | |
2606 b.samples AS b_samples, | |
2607 'log2(level_'||a.level||'/level_'||b.level||')' | |
2608 AS fc_description | |
2609 FROM v_contrast_lvl_metadata AS a, | |
2610 v_contrast_lvl_metadata AS b | |
2611 WHERE a.contrast = b.contrast | |
2612 AND a.level > b.level | |
2613 /* view v_contrast_lvl_lvl is used | |
2614 to simplify view v_contrast_log2_fc */ | |
2615 ; | |
2616 " | |
2617 ) | |
2618 # - create pseudo-materialized view table | |
2619 dml_no_rows_exec(db, " | |
2620 CREATE VIEW v_contrast_lvl_lvl | |
2621 AS | |
2622 SELECT DISTINCT | |
2623 a.contrast AS ab_contrast, | |
2624 a.level AS a_level, | |
2625 b.level AS b_level | |
2626 FROM contrast AS a, | |
2627 contrast AS b | |
2628 WHERE a.contrast = b.contrast | |
2629 AND a.level > b.level | |
2630 /* view v_contrast_lvl_lvl is used | |
2631 to simplify view v_contrast_log2_fc */ | |
2632 ; | |
2633 " | |
2634 ) | |
2635 | |
2636 # - create view to compute log2(fold-change) | |
2637 dml_no_rows_exec(db, " | |
2638 CREATE VIEW v_contrast_log2_fc | |
2639 AS | |
2640 SELECT ab.ab_contrast AS contrast, | |
2641 m.a_level AS a_level, | |
2642 c.avg_quant AS a_quant, | |
2643 m.a_samples AS a_samples, | |
2644 ab.b_level AS b_level, | |
2645 d.avg_quant AS b_quant, | |
2646 m.b_samples AS b_samples, | |
2647 m.fc_description AS fc_description, | |
2648 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc, | |
2649 d.phosphopep AS phosphopep | |
2650 FROM contrast_lvl_lvl_metadata AS m, | |
2651 v_contrast_lvl_ppep_avg_quant AS d, | |
2652 v_contrast_lvl_lvl AS ab | |
2653 INNER JOIN v_contrast_lvl_ppep_avg_quant AS c | |
2654 ON c.contrast = ab.ab_contrast | |
2655 AND c.level = ab.a_level | |
2656 WHERE d.contrast = ab.ab_contrast | |
2657 AND m.ab_contrast = ab.ab_contrast | |
2658 AND d.level = ab.b_level | |
2659 AND d.phosphopep = c.phosphopep | |
2660 /* view to compute log2(fold-change) */ | |
2661 ; | |
2662 " | |
2663 ) | |
2664 | |
2665 # For each contrast, compute samples that are members | |
2666 # compute one-way test: | |
2667 # - use `oneway.test` (Welch test) if numbers of samples | |
2668 # are not equivalent between trt levels | |
2669 # - otherwise, aov is fine but offers no advantage | |
2670 for (contrast in contrast_count:2) { | |
2671 invisible(contrast) | |
2672 } | |
2673 for (contrast in 1:contrast_count) { | |
2674 contrast_df <- sqldf::sqldf( | |
2675 x = paste0(" | |
2676 SELECT level, phosphopep, sample, quant | |
2677 FROM contrast_ppep_smpl_qnlt | |
2678 WHERE contrast = ", contrast, " | |
2679 ORDER BY phosphopep, level, sample | |
2680 "), | |
2681 connection = db | |
2682 ) | |
2683 contrast_cast <- reshape2::dcast( | |
2684 data = contrast_df, | |
2685 formula = phosphopep ~ sample, | |
2686 value.var = "quant" | |
2687 ) | |
2688 contrast_cast_ncol <- ncol(contrast_cast) | |
2689 contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] | |
2690 contrast_cast_samples <- colnames(contrast_cast_data) | |
2691 | |
2692 # - order grouping_factor by order of sample columns of contrast_cast_data | |
2693 grouping_factor <- sqldf::sqldf( | |
2694 x = paste0(" | |
2695 SELECT sample, level | |
2696 FROM contrast_ppep_smpl_qnlt | |
2697 WHERE contrast = ", contrast, " | |
2698 ORDER BY phosphopep, level, sample | |
2699 LIMIT ", contrast_cast_ncol - 1 | |
2700 ), | |
2701 connection = db | |
2702 ) | |
2703 rownames(grouping_factor) <- grouping_factor$sample | |
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE] | |
2705 | |
2706 # - run the two-level (one-way) test | |
2707 p_value_data_contrast_ps <- | |
2708 apply( | |
2709 X = contrast_cast_data, | |
2710 MARGIN = 1, # apply to rows | |
2711 FUN = anova_func, | |
2712 grouping_factor = | |
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 | |
2714 one_way_f = one_way_two_categories, # anova_func arg3 | |
2715 simplify = TRUE # TRUE is the default for simplify | |
2716 ) | |
2717 contrast_data_adj_p_values <- p.adjust( | |
2718 p = p_value_data_contrast_ps, | |
2719 method = "fdr", | |
2720 n = length(p_value_data_contrast_ps) # this is the default, length(p) | |
2721 ) | |
2722 # - compute the fold-change | |
2723 contrast_p_df <- | |
2724 data.frame( | |
2725 contrast = contrast, | |
2726 phosphopep = contrast_cast$phosphopep, | |
2727 p_value_raw = p_value_data_contrast_ps, | |
2728 p_value_adj = contrast_data_adj_p_values | |
2729 ) | |
2730 db_write_table_overwrite <- (contrast < 2) | |
2731 db_write_table_append <- !db_write_table_overwrite | |
2732 RSQLite::dbWriteTable( | |
2733 conn = db, | |
2734 name = "contrast_ppep_p_val", | |
2735 value = contrast_p_df, | |
2736 append = db_write_table_append | |
2737 ) | |
2738 # Create UK for insert | |
2739 ddl_exec(db, " | |
2740 CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx | |
2741 ON contrast_ppep_p_val(phosphopep, contrast); | |
2742 " | |
2743 ) | |
2744 } | |
2745 # Perhaps this could be done more elegantly using unique keys | |
2746 # or creating the tables before saving data to them, but this | |
2747 # is fast and, if the database exists on disk rather than in | |
2748 # memory, it doesn't stress memory. | |
2749 dml_no_rows_exec(db, " | |
2750 CREATE TEMP table contrast_log2_fc | |
2751 AS | |
2752 SELECT * | |
2753 FROM v_contrast_log2_fc | |
2754 ORDER BY contrast, phosphopep | |
2755 ; | |
2756 " | |
2757 ) | |
2758 dml_no_rows_exec(db, " | |
2759 CREATE TEMP table ppep_p_val | |
2760 AS | |
2761 SELECT p_value_raw, | |
2762 p_value_adj, | |
2763 contrast AS p_val_contrast, | |
2764 phosphopep AS p_val_ppep | |
2765 FROM contrast_ppep_p_val | |
2766 ORDER BY contrast, phosphopep | |
2767 ; | |
2768 " | |
2769 ) | |
2770 dml_no_rows_exec(db, " | |
2771 DROP TABLE IF EXISTS contrast_log2_fc_p_val | |
2772 ; | |
2773 " | |
2774 ) | |
2775 dml_no_rows_exec(db, " | |
2776 CREATE TABLE contrast_log2_fc_p_val | |
2777 AS | |
2778 SELECT a.*, | |
2779 b.p_value_raw, | |
2780 b.p_value_adj, | |
2781 b.p_val_contrast, | |
2782 b.p_val_ppep | |
2783 FROM contrast_log2_fc a, ppep_p_val b | |
2784 WHERE a.rowid = b.rowid | |
2785 AND a.phosphopep = b.p_val_ppep | |
2786 ; | |
2787 " | |
2788 ) | |
2789 # Create UK | |
2790 ddl_exec(db, " | |
2791 CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx | |
2792 ON contrast_log2_fc_p_val(phosphopep, contrast); | |
2793 " | |
2794 ) | |
2795 # Create indices for future queries | |
2796 ddl_exec(db, " | |
2797 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx | |
2798 ON contrast_log2_fc_p_val(contrast); | |
2799 " | |
2800 ) | |
2801 ddl_exec(db, " | |
2802 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx | |
2803 ON contrast_log2_fc_p_val(phosphopep); | |
2804 " | |
2805 ) | |
2806 ddl_exec(db, " | |
2807 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx | |
2808 ON contrast_log2_fc_p_val(p_value_raw); | |
2809 " | |
2810 ) | |
2811 ddl_exec(db, " | |
2812 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx | |
2813 ON contrast_log2_fc_p_val(p_value_adj); | |
2814 " | |
2815 ) | |
2816 dml_no_rows_exec(db, " | |
2817 DROP VIEW IF EXISTS v_contrast_log2_fc_p_val | |
2818 ; | |
2819 " | |
2820 ) | |
2821 dml_no_rows_exec(db, " | |
2822 CREATE VIEW v_contrast_log2_fc_p_val | |
2823 AS | |
2824 SELECT contrast, | |
2825 a_level, | |
2826 a_samples, | |
2827 b_level, | |
2828 b_samples, | |
2829 a_quant, | |
2830 b_quant, | |
2831 fc_description, | |
2832 log2_fc, | |
2833 p_value_raw, | |
2834 p_value_adj, | |
2835 phosphopep | |
2836 FROM contrast_log2_fc_p_val | |
2837 ORDER BY contrast, phosphopep | |
2838 ; | |
2839 " | |
2840 ) | |
2841 ddl_exec(db, " | |
2842 DROP TABLE IF EXISTS kseaapp_metadata | |
2843 ; | |
2844 " | |
2845 ) | |
2846 dml_no_rows_exec(db, " | |
2847 CREATE TABLE kseaapp_metadata | |
2848 AS | |
2849 WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( | |
2850 SELECT DISTINCT | |
2851 deppep.seq, | |
2852 ppep.seq, | |
2853 GeneName||';', | |
2854 UniProtID||';', | |
2855 PhosphoResidue||';' | |
2856 FROM | |
2857 ppep, deppep, mrgfltr_metadata | |
2858 WHERE | |
2859 mrgfltr_metadata.ppep_id = ppep.id | |
2860 AND | |
2861 ppep.deppep_id = deppep.id | |
2862 ) | |
2863 SELECT | |
2864 ppep AS `ppep`, | |
2865 SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, | |
2866 SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, | |
2867 deppep AS `Peptide`, | |
2868 REPLACE( | |
2869 REPLACE( | |
2870 SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), | |
2871 'p', | |
2872 '' | |
2873 ), | |
2874 ', ', | |
2875 ';' | |
2876 ) AS `Residue.Both` | |
2877 FROM extended | |
2878 ; | |
2879 " | |
2880 ) | |
2881 # Create indexes for join | |
2882 ddl_exec(db, " | |
2883 CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx | |
2884 ON kseaapp_metadata(ppep); | |
2885 " | |
2886 ) | |
2887 ddl_exec(db, " | |
2888 DROP VIEW IF EXISTS v_kseaapp_contrast | |
2889 ; | |
2890 " | |
2891 ) | |
2892 dml_no_rows_exec(db, " | |
2893 CREATE VIEW v_kseaapp_contrast | |
2894 AS | |
2895 SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` | |
2896 FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b | |
2897 WHERE b.ppep = a.phosphopep | |
2898 ; | |
2899 " | |
2900 ) | |
2901 ddl_exec(db, " | |
2902 DROP VIEW IF EXISTS v_kseaapp_input | |
2903 ; | |
2904 " | |
2905 ) | |
2906 dml_no_rows_exec(db, " | |
2907 CREATE VIEW v_kseaapp_input | |
2908 AS | |
2909 SELECT v.contrast, | |
2910 v.phosphopep, | |
2911 m.`Protein`, | |
2912 m.`Gene`, | |
2913 m.`Peptide`, | |
2914 m.`Residue.Both`, | |
2915 v.p_value_raw AS `p`, | |
2916 v.log2_fc AS `FC` | |
2917 FROM kseaapp_metadata AS m, | |
2918 v_contrast_log2_fc_p_val AS v | |
2919 WHERE m.ppep = v.phosphopep | |
2920 AND NOT m.`Gene` = 'No_Gene_Name' | |
2921 AND NOT v.log2_fc = 0 | |
2922 ; | |
2923 " | |
2924 ) | |
2925 } | |
2926 ``` | |
2927 | |
2928 ```{r echo = FALSE, results = 'asis'} | |
2929 cat("\\newpage\n") | |
2930 ``` | |
2931 | |
2932 # KSEA Analysis | |
2933 | |
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | |
2935 | |
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | |
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | |
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | |
2939 | |
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | |
2941 | |
2942 $$ | |
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | |
2944 $$ | |
2945 | |
2946 and fold-enrichment is computed as: | |
2947 | |
2948 $$ | |
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | |
2950 $$ | |
2951 | |
2952 where: | |
2953 | |
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | |
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | |
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | |
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | |
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | |
2959 | |
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | |
2961 | |
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | |
2963 | |
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | |
2965 | |
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | |
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | |
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | |
2969 | |
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
2971 | |
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
2973 | |
2974 # -- eliminate the table that's about to be defined | |
2975 ddl_exec(db, " | |
2976 DROP TABLE IF EXISTS site_metadata; | |
2977 ") | |
2978 | |
2979 # -- define the site_metadata table | |
2980 ddl_exec(db, " | |
2981 CREATE TABLE site_metadata( | |
2982 id INTEGER PRIMARY KEY | |
2983 , site_type_id INTEGER REFERENCES site_type(id) | |
2984 , full TEXT UNIQUE ON CONFLICT IGNORE | |
2985 , abbrev TEXT | |
2986 , pattern TEXT | |
2987 , motif TEXT | |
2988 ); | |
2989 ") | |
2990 | |
2991 # -- populate the table with initial values | |
2992 ddl_exec(db, " | |
2993 INSERT INTO site_metadata(full, abbrev, site_type_id) | |
2994 SELECT DISTINCT kinase_map, kinase_map, site_type_id | |
2995 FROM ppep_gene_site | |
2996 ORDER BY kinase_map; | |
2997 ") | |
2998 | |
2999 # -- drop bogus KSData view if exists | |
3000 ddl_exec(db, " | |
3001 DROP VIEW IF EXISTS ks_data_v; | |
3002 ") | |
3003 | |
3004 # -- create view to serve as an impostor for KSEAapp::KSData | |
3005 ddl_exec(db, " | |
3006 CREATE VIEW IF NOT EXISTS ks_data_v | |
3007 AS | |
3008 SELECT | |
3009 'NA' AS KINASE, | |
3010 'NA' AS KIN_ACC_ID, | |
3011 kinase_map AS GENE, | |
3012 'NA' AS KIN_ORGANISM, | |
3013 'NA' AS SUBSTRATE, | |
3014 0 AS SUB_GENE_ID, | |
3015 'NA' AS SUB_ACC_ID, | |
3016 gene_names AS SUB_GENE, | |
3017 'NA' AS SUB_ORGANISM, | |
3018 phospho_peptide AS SUB_MOD_RSD, | |
3019 0 AS SITE_GROUP_ID, | |
3020 'NA' AS 'SITE_7AA', | |
3021 2 AS networkin_score, | |
3022 type_name AS Source | |
3023 FROM ppep_gene_site_view; | |
3024 ") | |
3025 | |
3026 contrast_metadata_df <- | |
3027 sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) | |
3028 rslt <- new_env() | |
3029 rslt$score_list <- list() | |
3030 rslt$name_list <- list() | |
3031 rslt$longname_list <- list() | |
3032 | |
3033 ddl_exec(db, " | |
3034 DROP TABLE IF EXISTS contrast_ksea_scores; | |
3035 " | |
3036 ) | |
3037 | |
3038 next_index <- 0 | |
3039 err_na_subscr_df_const <- | |
3040 "missing values are not allowed in subscripted assignments of data frames" | |
3041 | |
3042 for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { | |
3043 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
3047 contrast_longlabel <- ( | |
3048 sprintf( | |
3049 "Trt %s {%s} -> Trt %s {%s}", | |
3050 contrast_metadata_df[i_cntrst, "b_level"], | |
3051 gsub( | |
3052 pattern = ";", | |
3053 replacement = ", ", | |
3054 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
3055 fixed = TRUE | |
3056 ), | |
3057 contrast_metadata_df[i_cntrst, "a_level"], | |
3058 gsub( | |
3059 pattern = ";", | |
3060 replacement = ", ", | |
3061 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
3062 fixed = TRUE | |
3063 ) | |
3064 ) | |
3065 ) | |
3066 kseaapp_input <- | |
3067 sqldf::sqldf( | |
3068 x = sprintf(" | |
3069 SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` | |
3070 FROM v_kseaapp_input | |
3071 WHERE contrast = %d | |
3072 ", | |
3073 i_cntrst | |
3074 ), | |
3075 connection = db | |
3076 ) | |
3077 | |
3078 pseudo_ksdata <- dbReadTable(db, "ks_data_v") | |
3079 | |
3080 # This hack is because SQL table has the log2-transformed values | |
3081 kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] | |
3082 main_title <- ( | |
3083 sprintf( | |
3084 "Change from treatment %s to treatment %s", | |
3085 contrast_metadata_df[i_cntrst, "b_level"], | |
3086 contrast_metadata_df[i_cntrst, "a_level"] | |
3087 ) | |
3088 ) | |
3089 sub_title <- contrast_longlabel | |
3090 tryCatch( | |
3091 expr = { | |
3092 ksea_scores_rslt <- | |
3093 ksea_scores( | |
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData, | |
3095 px = kseaapp_input, | |
3096 networkin = TRUE, | |
3097 networkin_cutoff = 2 | |
3098 ) | |
3099 | |
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
3101 next_index <- 1 + next_index | |
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt | |
3103 rslt$name_list[[next_index]] <- contrast_label | |
3104 rslt$longname_list[[next_index]] <- contrast_longlabel | |
3105 low_fdr_print( | |
3106 rslt = rslt, | |
3107 i_cntrst = i_cntrst, | |
3108 i = next_index, | |
3109 a_level = cntrst_a_level, | |
3110 b_level = cntrst_b_level, | |
3111 fold_change = cntrst_fold_change, | |
3112 caption = contrast_longlabel | |
3113 ) | |
3114 } | |
3115 }, | |
3116 error = function(e) str(e) | |
3117 ) | |
3118 } | |
3119 | |
3120 plotted_kinases <- NULL | |
3121 if (length(rslt$score_list) > 1) { | |
3122 for (i in seq_len(length(ksea_heatmap_titles))) { | |
3123 hdr <- ksea_heatmap_titles[[i]] | |
3124 which_kinases <- i | |
3125 | |
3126 cat("\\clearpage\n\\begin{center}\n") | |
3127 if (i == const_ksea_astrsk_kinases) { | |
3128 subsection_header(hdr) | |
3129 } else { | |
3130 subsection_header(hdr) | |
3131 } | |
3132 cat("\\end{center}\n") | |
3133 | |
3134 plotted_kinases <- ksea_heatmap( | |
3135 # the data frame outputs from the KSEA.Scores() function, in list format | |
3136 score_list = rslt$score_list, | |
3137 # a character vector of all the sample names for heatmap annotation: | |
3138 # - the names must be in the same order as the data in score_list | |
3139 # - please avoid long names, as they may get cropped in the final image | |
3140 sample_labels = rslt$name_list, | |
3141 # character string of either "p.value" or "FDR" indicating the data column | |
3142 # to use for marking statistically significant scores | |
3143 stats = c("p.value", "FDR")[2], | |
3144 # a numeric value between 0 and infinity indicating the min. number of | |
3145 # substrates a kinase must have to be included in the heatmap | |
3146 m_cutoff = 1, | |
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
3148 # for indicating significant kinases in the heatmap | |
3149 p_cutoff = 0.05, | |
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
3151 # hierarchical clustering of the sample columns | |
3152 sample_cluster = TRUE, | |
3153 # a binary input of TRUE or FALSE, indicating whether or not to export | |
3154 # the heatmap as a .png image into the working directory | |
3155 export = FALSE, | |
3156 # additional arguments to gplots::heatmap.2, such as: | |
3157 # - main: main title of plot | |
3158 # - xlab: x-axis label | |
3159 # - ylab: y-axis label | |
3160 xlab = "Contrast", | |
3161 ylab = "Kinase", | |
3162 # print which kinases: | |
3163 # - 1 : all kinases | |
3164 # - 2 : significant kinases | |
3165 # - 3 : non-significant kinases | |
3166 which_kinases = which_kinases | |
3167 ) | |
3168 cat("\\begin{center}\n") | |
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") | |
3170 cat("\\end{center}\n") | |
3171 } # end for (i in ... | |
3172 } # end if (length ... | |
3173 | |
3174 for (i_cntrst in seq_len(length(rslt$score_list))) { | |
3175 next_index <- i_cntrst | |
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
3180 contrast_longlabel <- ( | |
3181 sprintf( | |
3182 "Trt %s {%s} -> Trt %s {%s}", | |
3183 contrast_metadata_df[i_cntrst, "b_level"], | |
3184 gsub( | |
3185 pattern = ";", | |
3186 replacement = ", ", | |
3187 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
3188 fixed = TRUE | |
3189 ), | |
3190 contrast_metadata_df[i_cntrst, "a_level"], | |
3191 gsub( | |
3192 pattern = ";", | |
3193 replacement = ", ", | |
3194 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
3195 fixed = TRUE | |
3196 ) | |
3197 ) | |
3198 ) | |
3199 main_title <- ( | |
3200 sprintf( | |
3201 "Change from treatment %s to treatment %s", | |
3202 contrast_metadata_df[i_cntrst, "b_level"], | |
3203 contrast_metadata_df[i_cntrst, "a_level"] | |
3204 ) | |
3205 ) | |
3206 sub_title <- contrast_longlabel | |
3207 tryCatch( | |
3208 expr = { | |
3209 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
3210 | |
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
3212 low_fdr_barplot( | |
3213 rslt = rslt, | |
3214 i_cntrst = i_cntrst, | |
3215 i = next_index, | |
3216 a_level = cntrst_a_level, | |
3217 b_level = cntrst_b_level, | |
3218 fold_change = cntrst_fold_change, | |
3219 caption = contrast_longlabel | |
3220 ) | |
3221 } | |
3222 }, | |
3223 error = function(e) str(e) | |
3224 ) | |
3225 } | |
3226 ``` | |
3227 | |
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3229 | |
3230 # Use enriched kinases to find enriched kinase-substrate pairs | |
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
3232 all_enriched_substrates <- sqldf(" | |
3233 SELECT | |
3234 gene AS kinase, | |
3235 ppep, | |
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label | |
3237 FROM ( | |
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
3239 FROM pseudo_ksdata | |
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3241 ) | |
3242 GROUP BY ppep | |
3243 ") | |
3244 | |
3245 # helper used to label per-kinase substrate enrichment figure | |
3246 cat_enriched_heading <- function(m, cut_args) { | |
3247 cutoff <- cut_args$cutoff | |
3248 kinase <- cut_args$kinase | |
3249 statistic <- cut_args$statistic | |
3250 threshold <- cut_args$threshold | |
3251 cat("\\newpage\n") | |
3252 if (nrow(m) > intensity_hm_rows) { | |
3253 subsection_header( | |
3254 paste( | |
3255 sprintf( | |
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,", | |
3257 intensity_hm_rows, | |
3258 nrow(m), | |
3259 kinase | |
3260 ), | |
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | |
3262 ) | |
3263 ) | |
3264 } else { | |
3265 if (nrow(m) == 1) { | |
3266 return(FALSE) | |
3267 } else { | |
3268 subsection_header( | |
3269 paste( | |
3270 sprintf( | |
3271 "%d enriched %s-substrates,", | |
3272 nrow(m), | |
3273 kinase | |
3274 ), | |
3275 sprintf( | |
3276 " KSEA %s < %0.2f\n", | |
3277 statistic, | |
3278 threshold | |
3279 ) | |
3280 ) | |
3281 ) | |
3282 } | |
3283 } | |
3284 cat("\n\n\n") | |
3285 cat("\n\n\n") | |
3286 return(TRUE) | |
3287 } | |
3288 | |
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | |
3290 if (FALSE) | |
3291 for (kinase_name in sort(enriched_kinases$kinase)) { | |
3292 enriched_substrates <- | |
3293 all_enriched_substrates[ | |
3294 all_enriched_substrates$kinase == kinase_name, | |
3295 , | |
3296 drop = FALSE | |
3297 ] | |
3298 # Get the intensity values for the heatmap | |
3299 enriched_intensities <- | |
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | |
3301 # Remove rows having too many NA values to be relevant | |
3302 na_counter <- is.na(enriched_intensities) | |
3303 na_counts <- apply(na_counter, 1, sum) | |
3304 enriched_intensities <- | |
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] | |
3306 # Rename the rows with the display-name for the heatmap | |
3307 rownames(enriched_intensities) <- | |
3308 sapply( | |
3309 X = rownames(enriched_intensities), | |
3310 FUN = function(rn) { | |
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"] | |
3312 } | |
3313 ) | |
3314 # Format as matrix for heatmap | |
3315 m <- as.matrix(enriched_intensities) | |
3316 # Draw the heading and heatmap | |
3317 if (nrow(m) > 0) { | |
3318 cut_args <- new_env() | |
3319 cut_args$cutoff <- cutoff | |
3320 cut_args$kinase <- kinase_name | |
3321 cut_args$statistic <- ksea_cutoff_statistic | |
3322 cut_args$threshold <- ksea_cutoff_threshold | |
3323 number_of_peptides_found <- | |
3324 draw_intensity_heatmap( | |
3325 m = m, | |
3326 cutoff = cut_args, | |
3327 hm_heading_function = cat_enriched_heading, | |
3328 hm_main_title | |
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | |
3330 suppress_row_dendrogram = FALSE | |
3331 ) | |
3332 } | |
3333 } | |
3334 | |
3335 # Write output tabular files | |
3336 | |
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases | |
3338 | |
3339 kinase_ppep_label <- sqldf(" | |
3340 WITH | |
3341 t(ppep, label) AS | |
3342 ( | |
3343 SELECT DISTINCT | |
3344 SUB_MOD_RSD AS ppep, | |
3345 group_concat(gene, '; ') AS label | |
3346 FROM pseudo_ksdata | |
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3348 GROUP BY ppep | |
3349 ), | |
3350 k(kinase, ppep_join) AS | |
3351 ( | |
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
3353 FROM pseudo_ksdata | |
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3355 ) | |
3356 SELECT k.kinase, t.ppep, t.label | |
3357 FROM t, k | |
3358 WHERE t.ppep = k.ppep_join | |
3359 ORDER BY k.kinase, t.ppep | |
3360 ") | |
3361 | |
3362 # extract what we need from full_data | |
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | |
3364 colnames(impish)[1] <- "Phosphopeptide" | |
3365 data_table_imputed_sql <- " | |
3366 SELECT | |
3367 f.*, | |
3368 k.label AS KSEA_enrichments, | |
3369 q.* | |
3370 FROM | |
3371 metadata_plus_p f | |
3372 LEFT JOIN kinase_ppep_label k | |
3373 ON f.Phosphopeptide = k.ppep, | |
3374 impish q | |
3375 WHERE | |
3376 f.Phosphopeptide = q.Phosphopeptide | |
3377 " | |
3378 data_table_imputed <- sqldf(data_table_imputed_sql) | |
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
3380 data_table_imputed <- | |
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
3382 | |
3383 # Output with imputed, un-normalized data | |
3384 | |
3385 write.table( | |
3386 data_table_imputed | |
3387 , file = imputed_data_filename | |
3388 , sep = "\t" | |
3389 , col.names = TRUE | |
3390 , row.names = FALSE | |
3391 , quote = FALSE | |
3392 ) | |
3393 | |
3394 | |
3395 #output quantile normalized data | |
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | |
3397 colnames(impish)[1] <- "Phosphopeptide" | |
3398 data_table_imputed <- sqldf(data_table_imputed_sql) | |
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
3400 data_table_imputed <- | |
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
3402 write.table( | |
3403 data_table_imputed, | |
3404 file = imp_qn_lt_data_filenm, | |
3405 sep = "\t", | |
3406 col.names = TRUE, | |
3407 row.names = FALSE, | |
3408 quote = FALSE | |
3409 ) | |
3410 | |
3411 ppep_kinase <- sqldf(" | |
3412 SELECT DISTINCT k.ppep, k.kinase | |
3413 FROM ( | |
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | |
3415 FROM pseudo_ksdata | |
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
3417 ) k | |
3418 ORDER BY k.ppep, k.kinase | |
3419 ") | |
3420 | |
3421 RSQLite::dbWriteTable( | |
3422 conn = db, | |
3423 name = "ksea_enriched_ks", | |
3424 value = ppep_kinase, | |
3425 append = FALSE | |
3426 ) | |
3427 | |
3428 RSQLite::dbWriteTable( | |
3429 conn = db, | |
3430 name = "anova_signif", | |
3431 value = p_value_data, | |
3432 append = FALSE | |
3433 ) | |
3434 | |
3435 ddl_exec(db, " | |
3436 DROP VIEW IF EXISTS stats_metadata_v; | |
3437 " | |
3438 ) | |
3439 dml_no_rows_exec(db, " | |
3440 CREATE VIEW stats_metadata_v | |
3441 AS | |
3442 SELECT DISTINCT m.*, | |
3443 p.raw_anova_p, | |
3444 p.fdr_adjusted_anova_p, | |
3445 kek.kinase AS ksea_enrichments | |
3446 FROM | |
3447 mrgfltr_metadata_view m | |
3448 LEFT JOIN anova_signif p | |
3449 ON m.phospho_peptide = p.phosphopeptide | |
3450 LEFT JOIN ksea_enriched_ks kek | |
3451 ON m.phospho_peptide = kek.ppep | |
3452 ; | |
3453 " | |
3454 ) | |
3455 | |
3456 write.table( | |
3457 dbReadTable(db, "stats_metadata_v"), | |
3458 file = anova_ksea_mtdt_file, | |
3459 sep = "\t", | |
3460 col.names = TRUE, | |
3461 row.names = FALSE, | |
3462 quote = FALSE | |
3463 ) | |
3464 | |
3465 | |
3466 ``` | |
3467 | |
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
3469 cat("\\leavevmode\n\n\n") | |
3470 | |
3471 # write parameters to report | |
3472 | |
3473 param_unlist <- unlist(as.list(params)) | |
3474 param_df <- data.frame( | |
3475 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") | |
3477 ) | |
3478 | |
3479 data_frame_latex( | |
3480 x = param_df, | |
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
3482 centered = TRUE, | |
3483 caption = "Input parameters", | |
3484 anchor = const_table_anchor_bp, | |
3485 underscore_whack = FALSE | |
3486 ) | |
3487 | |
3488 # write parameters to SQLite output | |
3489 | |
3490 mqppep_anova_script_param_df <- data.frame( | |
3491 script = "mqppep_anova_script.Rmd", | |
3492 parameter = names(param_unlist), | |
3493 value = param_unlist | |
3494 ) | |
3495 ddl_exec(db, " | |
3496 DROP TABLE IF EXISTS script_parameter; | |
3497 " | |
3498 ) | |
3499 ddl_exec(db, " | |
3500 CREATE TABLE IF NOT EXISTS script_parameter( | |
3501 script TEXT, | |
3502 parameter TEXT, | |
3503 value ANY, | |
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
3505 ) | |
3506 ; | |
3507 " | |
3508 ) | |
3509 RSQLite::dbWriteTable( | |
3510 conn = db, | |
3511 name = "script_parameter", | |
3512 value = mqppep_anova_script_param_df, | |
3513 append = TRUE | |
3514 ) | |
3515 | |
3516 # We are done with output | |
3517 RSQLite::dbDisconnect(db) | |
3518 ``` | |
3519 <!-- | |
3520 There's gotta be a better way... | |
3521 | |
3522 loaded_packages_df <- sessioninfo::package_info("loaded") | |
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
3524 loaded_packages_df <- data.frame( | |
3525 package = loaded_packages_df$package, | |
3526 version = loaded_packages_df$loadedversion, | |
3527 date = loaded_packages_df$date | |
3528 ) | |
3529 data_frame_latex( | |
3530 x = loaded_packages_df, | |
3531 justification = "l | l l", | |
3532 centered = FALSE, | |
3533 caption = "Loaded R packages", | |
3534 anchor = const_table_anchor_bp | |
3535 ) | |
3536 --> |