comparison mqppep_anova_script.Rmd @ 22:61adb8801b73 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 0c7ca054e77e042c8a584c9903073da064df7d8b
author eschen42
date Thu, 30 Jun 2022 16:16:32 +0000
parents 2c5f1a2fe16a
children 8582a9797c18
comparison
equal deleted inserted replaced
21:f3deca1a3c84 22:61adb8801b73
1 --- 1 ---
2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" 2 title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
3 author: "Larry Cheng; Art Eschenlauer" 3 author:
4 date: "May 28, 2018; Mar 16, 2022" 4 - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
7 date:
8 - "May 28, 2018"
9 - "; revised June 23, 2022"
5 output: 10 output:
6 pdf_document: 11 pdf_document:
7 toc: true 12 toc: true
8 latex_document: 13 toc_depth: 3
9 toc: true 14 keep_tex: true
15 header-includes:
16 - \usepackage{longtable}
17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut
18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
10 params: 19 params:
11 alphaFile: "test-data/alpha_levels.tabular" 20 alphaFile: "test-data/alpha_levels.tabular"
12 inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" 21 inputFile: "test-data/test_input_for_anova.tabular"
13 firstDataColumn: "^Intensity[^_]" 22 preprocDb: "test-data/test_input_for_anova.sqlite"
14 imputationMethod: !r c("group-median", "median", "mean", "random")[4] 23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
15 meanPercentile: 1 24 show_toc: true
16 sdPercentile: 1.0 25 firstDataColumn: "^Intensity[^_]"
17 regexSampleNames: "\\.\\d+[A-Z]$" 26 imputationMethod: !r c("group-median", "median", "mean", "random")[1]
18 regexSampleGrouping: "\\d+" 27 meanPercentile: 1
19 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" 28 sdPercentile: 1.0
20 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" 29 regexSampleNames: "\\.\\d+[A-Z]$"
21 show_toc: true 30 regexSampleGrouping: "\\d+"
31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
36 kseaCutoffStatistic: !r c("p.value", "FDR")[2]
37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2]
38 kseaMinKinaseCount: 1
39 intensityHeatmapRows: 75
22 --- 40 ---
23 <!-- 41 <!--
24 alphaFile: "test-data/alpha_levels.tabular" 42 kseaCutoffStatistic: !r c("p.value", "FDR")[2]
25 inputFile: "test-data/test_input_for_anova.tabular" 43 kseaCutoffThreshold: !r c(0.05, 0.1)[1]
26 inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" 44
27 inputFile: "test-data/density_failure.preproc_tab.tabular" 45 alphaFile: "test-data/alpha_levels.tabular"
46 inputFile: "test-data/test_input_for_anova.tabular"
47 preprocDb: "test-data/test_input_for_anova.sqlite"
48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
49
50 alphaFile: "test-data/alpha_levels.tabular"
51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular"
52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite"
53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2]
54
55 alphaFile: "test-data/alpha_levels.tabular"
56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular"
57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite"
58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
59
60 alphaFile: "test-data/alpha_levels.tabular"
61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular"
62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite"
63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2]
64
65 inputFile: "test-data/density_failure.preproc_tab.tabular"
66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2]
28 latex_document: default 67 latex_document: default
29 --> 68 -->
30 ```{r setup, include = FALSE} 69 ```{r setup, include = FALSE}
70 #ref for debugging: https://yihui.org/tinytex/r/#debugging
71 options(tinytex.verbose = TRUE)
72
31 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
32 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) 75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
33 76
34 # freeze the random number generator so the same results will be produced 77 # freeze the random number generator so the same results will be produced
35 # from run to run 78 # from run to run
36 set.seed(28571) 79 set.seed(28571)
80
81 ### LIBRARIES
82 library(gplots)
83 library(DBI)
84 library(RSQLite)
85 # Suppress "Warning: no DISPLAY variable so Tk is not available"
86 suppressWarnings(suppressMessages(library(sqldf)))
87
88 # required but not added to search list:
89 # - DBI
90 # - RSQLite
91 # - ggplot2
92 # - knitr
93 # - latex2exp
94 # - preprocessCore
95 # - reshape2
96 # - vioplot
37 97
38 ### CONSTANTS 98 ### CONSTANTS
39 99
40 const_parfin <- par("fin") 100 const_parfin <- par("fin")
41 const_boxplot_fill <- "grey94" 101 const_boxplot_fill <- "grey94"
42 const_stripchart_cex <- 0.5 102 const_stripchart_cex <- 0.5
43 const_stripsmall_cex <- 103 const_stripsmall_cex <-
44 sqrt(const_stripchart_cex * const_stripchart_cex / 2) 104 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
45 const_stripchart_jitter <- 0.3 105 const_stripchart_jitter <- 0.3
46 const_write_debug_files <- FALSE 106 const_write_debug_files <- FALSE
47 const_table_anchor <- "tbp" 107 const_table_anchor_bp <- "bp"
108 const_table_anchor_ht <- "ht"
109 const_table_anchor_p <- "p"
110 const_table_anchor_tbp <- "tbp"
111
112
113 const_ksea_astrsk_kinases <- 1
114 const_ksea_nonastrsk_kinases <- 2
115 const_ksea_all_kinases <- 3
116
117 const_log10_e <- log10(exp(1))
48 118
49 ### FUNCTIONS 119 ### FUNCTIONS
50 120
51 #ANOVA filter function 121 # from `demo(error.catching)`
52 anova_func <- function(x, grouping_factor) { 122 ##' Catch *and* save both errors and warnings, and in the case of
53 x_aov <- aov(as.numeric(x) ~ grouping_factor) 123 ##' a warning, also keep the computed result.
54 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] 124 ##'
55 pvalue 125 ##' @title tryCatch both warnings (with value) and errors
56 } 126 ##' @param expr an \R expression to evaluate
127 ##' @return a list with 'value' and 'warning', where
128 ##' 'value' may be an error caught.
129 ##' @author Martin Maechler;
130 ##' Copyright (C) 2010-2012 The R Core Team
131 try_catch_w_e <- function(expr) {
132 wrn <- NULL
133 # warning handler
134 w_handler <- function(w) {
135 wrn <<- w
136 invokeRestart("muffleWarning")
137 }
138 list(
139 value = withCallingHandlers(
140 tryCatch(
141 expr,
142 error = function(e) e
143 ),
144 warning = w_handler
145 ),
146 warning = wrn
147 )
148 }
149
57 150
58 write_debug_file <- function(s) { 151 write_debug_file <- function(s) {
59 if (const_write_debug_files) { 152 if (const_write_debug_files) {
60 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) 153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
154 print(sprintf("DEBUG writing file %s", spath))
61 write.table( 155 write.table(
62 s, 156 s,
63 file = s_path, 157 file = s_path,
64 sep = "\t", 158 sep = "\t",
65 col.names = TRUE, 159 col.names = TRUE,
67 quote = FALSE 161 quote = FALSE
68 ) 162 )
69 } 163 }
70 } 164 }
71 165
72 latex_collapsed_vector <- function(collapse_string, v) { 166 # ref: http://adv-r.had.co.nz/Environments.html
167 # "When creating your own environment, note that you should set its parent
168 # environment to be the empty environment. This ensures you don't
169 # accidentally inherit objects from somewhere else."
170 # Caution: this prevents `with(my_env, expr)` from working when `expr`
171 # contains anything from the global environment, even operators!
172 # Hence, `x <- 1; get("x", new_env())` fails by design.
173 new_env <- function() {
174 new.env(parent = emptyenv())
175 }
176
177 ### numerical/statistical helper functions
178
179 any_nan <- function(x) {
180 !any(x == "NaN")
181 }
182
183 # determine standard deviation of quantile to impute
184 sd_finite <- function(x) {
185 ok <- is.finite(x)
186 sd(x[ok])
187 }
188
189 anova_func <- function(x, grouping_factor, one_way_f) {
190 subject <- data.frame(
191 intensity = x
192 )
193 x_aov <-
194 one_way_f(
195 formula = intensity ~ grouping_factor,
196 data = subject
197 )
198 pvalue <-
199 if (identical(one_way_f, aov))
200 summary(x_aov)[[1]][["Pr(>F)"]][1]
201 else
202 pvalue <- x_aov$p.value
203 pvalue
204 }
205
206
207 ### LaTeX functions
208
209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
73 cat( 211 cat(
74 paste0( 212 paste0(
75 gsub("_", "\\\\_", v), 213 v_sub,
76 collapse = collapse_string 214 collapse = collapse_string
77 ) 215 )
78 ) 216 )
79 } 217 }
80 218
81 latex_itemized_collapsed <- function(collapse_string, v) { 219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
82 cat("\\begin{itemize}\n\\item ") 220 cat("\\begin{itemize}\n\\item ")
83 latex_collapsed_vector(collapse_string, v) 221 latex_collapsed_vector(collapse_string, v, underscore_whack)
84 cat("\n\\end{itemize}\n") 222 cat("\n\\end{itemize}\n")
85 } 223 }
86 224
87 latex_itemized_list <- function(v) { 225 latex_itemized_list <- function(v, underscore_whack = TRUE) {
88 latex_itemized_collapsed("\n\\item ", v) 226 latex_itemized_collapsed("\n\\item ", v, underscore_whack)
89 } 227 }
90 228
91 latex_enumerated_collapsed <- function(collapse_string, v) { 229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
92 cat("\\begin{enumerate}\n\\item ") 230 cat("\\begin{enumerate}\n\\item ")
93 latex_collapsed_vector(collapse_string, v) 231 latex_collapsed_vector(collapse_string, v, underscore_whack)
94 cat("\n\\end{enumerate}\n") 232 cat("\n\\end{enumerate}\n")
95 } 233 }
96 234
97 latex_enumerated_list <- function(v) { 235 latex_enumerated_list <- function(v) {
98 latex_enumerated_collapsed("\n\\item ", v) 236 latex_enumerated_collapsed("\n\\item ", v)
99 } 237 }
100 238
101 latex_table_row <- function(v) { 239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
102 latex_collapsed_vector(" & ", v) 240 latex_collapsed_vector(" & ", v, underscore_whack)
241 cat(extra)
103 cat(" \\\\\n") 242 cat(" \\\\\n")
104 } 243 }
105 244
106 # Use this like print.data.frame, from which it is adapted: 245 # Use this like print.data.frame, from which it is adapted:
107 data_frame_latex <- 246 data_frame_latex <-
116 max = NULL, 255 max = NULL,
117 # string with justification of each column 256 # string with justification of each column
118 justification = NULL, 257 justification = NULL,
119 # TRUE to center on page 258 # TRUE to center on page
120 centered = TRUE, 259 centered = TRUE,
121 # optional capttion 260 # optional caption
122 caption = NULL, 261 caption = NULL,
123 # h(inline); b(bottom); t (top) or p (separate page) 262 # h(inline); b(bottom); t (top) or p (separate page)
124 anchor = "h" 263 anchor = "h",
264 # set underscore_whack to TRUE to escape underscores
265 underscore_whack = TRUE
125 ) { 266 ) {
126 if (is.null(justification)) 267 if (is.null(justification))
127 justification <- 268 justification <-
128 Reduce( 269 Reduce(
129 f = paste, 270 f = paste,
142 n 283 n
143 ), 284 ),
144 "\n", 285 "\n",
145 sep = "" 286 sep = ""
146 ) 287 )
147 } 288 } else if (n == 0L) {
148 else if (n == 0L) {
149 cat("0 rows for:\n") 289 cat("0 rows for:\n")
150 latex_itemized_list(names(x)) 290 latex_itemized_list(
151 } 291 v = names(x),
152 else { 292 underscore_whack = underscore_whack
293 )
294 } else {
153 if (is.null(max)) 295 if (is.null(max))
154 max <- getOption("max.print", 99999L) 296 max <- getOption("max.print", 99999L)
155 if (!is.finite(max)) 297 if (!is.finite(max))
156 stop("invalid 'max' / getOption(\"max.print\"): ", 298 stop("invalid 'max' / getOption(\"max.print\"): ",
157 max) 299 max)
176 justification, 318 justification,
177 "}\n", 319 "}\n",
178 sep = "" 320 sep = ""
179 ) 321 )
180 ) 322 )
323 # ref: https://tex.stackexchange.com/a/50353
324 # Describes use of \rule{0pt}{3ex}
181 if (!is.null(caption)) 325 if (!is.null(caption))
182 cat(" \\hline\\hline\n") 326 cat("\\B \\\\ \\hline\\hline\n")
183 latex_table_row(colnames(m)) 327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
328 latex_table_row(
329 v = colnames(m),
330 extra = "\\T\\B",
331 underscore_whack = underscore_whack
332 )
184 cat("\\hline\n") 333 cat("\\hline\n")
185 for (i in seq_len(length(m[, 1]))) { 334 for (i in seq_len(length(m[, 1]))) {
186 latex_table_row(m[i, ]) 335 latex_table_row(
336 v = m[i, ],
337 underscore_whack = underscore_whack
338 )
187 } 339 }
188 cat( 340 cat(
189 paste( 341 paste(
190 " \\end{tabular}", 342 " \\end{tabular}",
191 "\\end{table}", 343 "\\end{table}",
197 n - n0, "rows ]\n") 349 n - n0, "rows ]\n")
198 } 350 }
199 invisible(x) 351 invisible(x)
200 } 352 }
201 353
202 ``` 354 hypersub <-
203 355 function(s) {
204 ## Purpose 356 hyper <- tolower(s)
205 357 hyper <- gsub("[^a-z0-9]+", "-", hyper)
206 Perform imputation of missing values, quantile normalization, and ANOVA. 358 hyper <- gsub("[-]+", "-", hyper)
359 hyper <- sub("^[-]", "", hyper)
360 hyper <- sub("[-]$", "", hyper)
361 return(hyper)
362 }
363
364 subsection_header <-
365 function(s) {
366 hyper <- hypersub(s)
367 cat(
368 sprintf(
369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
370 hyper, s, hyper
371 )
372 )
373 }
374
375 subsubsection_header <-
376 function(s) {
377 hyper <- hypersub(s)
378 cat(
379 sprintf(
380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
381 hyper, s, hyper
382 )
383 )
384 }
385
386 ### SQLite functions
387
388 ddl_exec <- function(db, sql) {
389 discard <- DBI::dbExecute(conn = db, statement = sql)
390 if (FALSE && discard != 0) {
391 need_newpage <- TRUE
392 if (need_newpage) {
393 need_newpage <<- FALSE
394 cat("\\newpage\n")
395 }
396 o_file <- stdout()
397 cat("\n\\begin{verbatim}\n")
398 cat(sql, file = o_file)
399 cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
400 cat("\n\\end{verbatim}\n")
401 }
402 }
403
404 dml_no_rows_exec <- function(db, sql) {
405 discard <- DBI::dbExecute(conn = db, statement = sql)
406 if (discard != 0) {
407 need_newpage <- TRUE
408 if (need_newpage) {
409 need_newpage <<- FALSE
410 cat("\\newpage\n")
411 }
412 cat("\n\\begin{verbatim}\n")
413 o_file <- stdout()
414 cat(sql, file = o_file)
415 cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
416 cat("\n\\end{verbatim}\n")
417 }
418 }
419
420 ### KSEA functions and helpers
421
422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
423 # - maximum log2(FC)
424 ksea_scores <- function(
425
426 # For human data, typically, ksdata = KSEAapp::ksdata
427 ksdata,
428
429 # Input data file having columns:
430 # - Protein : abbreviated protein name
431 # - Gene : HUGO gene name
432 # - Peptide : peptide sequence without indications of phosphorylation
433 # - Reside.Both : position(s) of phosphorylation within Gene sequence
434 # - First letter designates AA that is modified
435 # - Numbers indicate position within Gene
436 # - Multiple values are separated by semicolons
437 # - p : p-value
438 # - FC : fold-change
439 px,
440
441 # A binary input of TRUE or FALSE, indicating whether or not to include
442 # NetworKIN predictions
443 networkin,
444
445 # A numeric value between 1 and infinity setting the minimum NetworKIN
446 # score (can be left out if networkin = FALSE)
447 networkin_cutoff
448
449 ) {
450 if (length(grep(";", px$Residue.Both)) == 0) {
451 # There are no Residue.Both entries having semicolons, so new is
452 # simply px except two columns are renamed and a column is added
453 # for log2(abs(fold-change))
454 new <- px
455 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
456 new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
457 new <- new[complete.cases(new$log2_fc), ]
458 } else {
459 # Split each row having semicolons in Residue.Both into rows that are
460 # duplicated in all respects except that each row has a single
461 # member of the set "split-on-semicolon-Residue.Both"
462 px_double <- px[grep(";", px$Residue.Both), ]
463 residues <- as.character(px_double$Residue.Both)
464 residues <- as.matrix(residues, ncol = 1)
465 split <- strsplit(residues, split = ";")
466 # x gets count of residues in each row,
467 # i.e., 1 + count of semicolons
468 x <- sapply(split, length)
469 # Here is the set of split rows
470 px_single <- data.frame(
471 Protein = rep(px_double$Protein, x),
472 Gene = rep(px_double$Gene, x),
473 Peptide = rep(px_double$Peptide, x),
474 Residue.Both = unlist(split),
475 p = rep(px_double$p, x),
476 FC = rep(px_double$FC, x)
477 )
478 # new first gets the split rows
479 new <- px[-grep(";", px$Residue.Both), ]
480 # to new, append the rows that didn't need splitting in the first place
481 new <- rbind(new, px_single)
482 # map Gene to SUB_GENE
483 # map Residue.Both to SUB_MOD_RSD
484 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
485 # Eliminate any non-positive values to prevent introduction of
486 # infinite or NaN values
487 new[(0 <= new$log2_fc), "log2_fc"] <- NA
488 # Because of preceding step, there is no need for abs in the next line
489 new$log2_fc <- log2(as.numeric(as.character(new$FC)))
490 # Convert any illegal values from NaN to NA
491 new[is.nan(new$log2_fc), "log2_fc"] <- NA
492 # Eliminate rows having missing values (e.g., non-imputed data)
493 new <- new[complete.cases(new$log2_fc), ]
494 }
495 if (networkin == TRUE) {
496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes
497 # PhosphoSitePlus data *because its networkin_score is set to Inf*
498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
499 ksdata_filtered <- ksdata_filtered[
500 (ksdata_filtered$networkin_score >= networkin_cutoff), ]
501 } else {
502 # Otherwise, simply use PhosphSitePlus rows
503 ksdata_filtered <- ksdata[
504 grep("PhosphoSitePlus", ksdata$Source), ]
505 }
506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
507 # colnames of ksdata_filtered:
508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
510 # "SITE_...7_AA" "networkin_score" "Source"
511 # colnames of new:
512 # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
513 # Equivalent to:
514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
515 # FROM ksdata_filtered a
516 # INNER JOIN new b
517 # ON a.SUB_GENE = b.SUB_GENE
518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
519 ksdata_dataset <- base::merge(ksdata_filtered, new)
520 # colnames of ksdata_dataset:
521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD"
523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
525 # Re-order dataset; prior to accounting for isoforms
526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
527 # Extract non-isoform accession in UniProtKB
528 ksdata_dataset$uniprot_no_isoform <- sapply(
529 ksdata_dataset$KIN_ACC_ID,
530 function(x) unlist(strsplit(as.character(x), split = "-"))[1]
531 )
532 # Discard previous results while selecting interesting columns ...
533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
534 # Column names are now:
535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
536 # "FC" "log2_fc" "Source"
537 # Make column names human-readable
538 colnames(ksdata_dataset_abbrev) <- c(
539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
540 "FC", "log2FC", "Source"
541 )
542 # SELECT * FROM ksdata_dataset_abbrev
543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
544 ksdata_dataset_abbrev <-
545 ksdata_dataset_abbrev[
546 order(
547 ksdata_dataset_abbrev$Kinase.Gene,
548 ksdata_dataset_abbrev$Substrate.Gene,
549 ksdata_dataset_abbrev$Substrate.Mod,
550 ksdata_dataset_abbrev$p),
551 ]
552 # First aggregation step to account for multiply phosphorylated peptides
553 # and differing peptide sequences; the goal here is to combine results
554 # for all measurements of the same substrate.
555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
556 # `Source`, avg(log2FC) AS log2FC
557 # FROM ksdata_dataset_abbrev
558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
559 # `Source`
560 # ORDER BY `Kinase.Gene`;
561 # in two steps:
562 # (1) compute average log_2(fold-change)
563 ksdata_dataset_abbrev <- aggregate(
564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
565 data = ksdata_dataset_abbrev,
566 FUN = mean
567 )
568 # (2) order by Kinase.Gene
569 ksdata_dataset_abbrev <-
570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
571 # SELECT `Kinase.Gene`, count(*)
572 # FROM ksdata_dataset_abbrev
573 # GROUP BY `Kinase.Gene`;
574 # in two steps:
575 # (1) Extract the list of Kinase.Gene names
576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
578 # named by Kinase.Gene
579 kinase_list <- as.matrix(table(kinase_list))
580 # Second aggregation step to account for all substrates per kinase
581 # CREATE TABLE mean_fc
582 # AS
583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
584 # FROM ksdata_dataset_abbrev
585 # GROUP BY `Kinase.Gene`
586 mean_fc <- aggregate(
587 log2FC ~ Kinase.Gene,
588 data = ksdata_dataset_abbrev,
589 FUN = mean
590 )
591 # mean_fc columns: "Kinase.Gene", "log2FC"
592 if (FALSE) {
593 # I need to re-think this; I was trying to find the most-represented
594 # peptide, but that horse has already left the barn
595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
596 # FROM ksdata_dataset_abbrev
597 # GROUP BY `Kinase.Gene`
598 max_fc <- aggregate(
599 log2FC ~ Kinase.Gene,
600 data = ksdata_dataset_abbrev,
601 FUN = function(r) max(abs(r))
602 )
603 }
604
605 # Create column 3: mS
606 mean_fc$m_s <- mean_fc[, 2]
607 # Create column 4: Enrichment
608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
609 # Create column 5: m, count of substrates
610 mean_fc$m <- kinase_list
611 # Create column 6: z-score
612 mean_fc$z_score <- (
613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
615 # Create column 7: p-value, deduced from z-score
616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
619
620 # Remove log2FC column, which is duplicated as mS
621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
622 # Correct the column names which we had to hack because of the linter...
623 colnames(mean_fc) <- c(
624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
625 )
626 return(mean_fc)
627 }
628
629 low_fdr_barplot <- function(
630 rslt,
631 i_cntrst,
632 i,
633 a_level,
634 b_level,
635 fold_change,
636 caption
637 ) {
638 rslt_score_list_i <- rslt$score_list[[i]]
639 if (!is.null(rslt_score_list_i)) {
640 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
641 k <- data.frame(
642 contrast = as.integer(i_cntrst),
643 a_level = rep.int(a_level, rslt_score_list_i_nrow),
644 b_level = rep.int(b_level, rslt_score_list_i_nrow),
645 kinase_gene = rslt_score_list_i$Kinase.Gene,
646 mean_log2_fc = rslt_score_list_i$mS,
647 enrichment = rslt_score_list_i$Enrichment,
648 substrate_count = rslt_score_list_i$m,
649 z_score = rslt_score_list_i$z.score,
650 p_value = rslt_score_list_i$p.value,
651 fdr = rslt_score_list_i$FDR
652 )
653 selector <- switch(
654 ksea_cutoff_statistic,
655 "FDR" = {
656 k$fdr
657 },
658 "p.value" = {
659 k$p_value
660 },
661 stop(
662 sprintf(
663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
664 ksea_cutoff_statistic
665 )
666 )
667 )
668
669 k <- k[selector < ksea_cutoff_threshold, ]
670
671 if (nrow(k) > 1) {
672 op <- par(mai = c(1, 1.5, 0.4, 0.4))
673 numeric_z_score <- as.numeric(k$z_score)
674 z_score_order <- order(numeric_z_score)
675 kinase_name <- k$kinase_gene
676 long_caption <-
677 sprintf(
678 "Kinase z-score, %s < %s, %s",
679 ksea_cutoff_statistic,
680 ksea_cutoff_threshold,
681 caption
682 )
683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
684 cat("\n\\clearpage\n")
685 barplot(
686 height = numeric_z_score[z_score_order],
687 border = NA,
688 xpd = FALSE,
689 cex.names = 1.0,
690 cex.axis = 1.0,
691 main = long_caption,
692 cex.main = my_cex_caption,
693 names.arg = kinase_name[z_score_order],
694 horiz = TRUE,
695 srt = 45,
696 las = 1)
697 par(op)
698 }
699 }
700 }
701
702 # note that this adds elements to the global variable `ksea_asterisk_hash`
703
704 low_fdr_print <- function(
705 rslt,
706 i_cntrst,
707 i,
708 a_level,
709 b_level,
710 fold_change,
711 caption
712 ) {
713 rslt_score_list_i <- rslt$score_list[[i]]
714 if (!is.null(rslt_score_list_i)) {
715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
716 k <- contrast_ksea_scores <- data.frame(
717 contrast = as.integer(i_cntrst),
718 a_level = rep.int(a_level, rslt_score_list_i_nrow),
719 b_level = rep.int(b_level, rslt_score_list_i_nrow),
720 kinase_gene = rslt_score_list_i$Kinase.Gene,
721 mean_log2_fc = rslt_score_list_i$mS,
722 enrichment = rslt_score_list_i$Enrichment,
723 substrate_count = rslt_score_list_i$m,
724 z_score = rslt_score_list_i$z.score,
725 p_value = rslt_score_list_i$p.value,
726 fdr = rslt_score_list_i$FDR
727 )
728
729 selector <- switch(
730 ksea_cutoff_statistic,
731 "FDR" = {
732 k$fdr
733 },
734 "p.value" = {
735 k$p_value
736 },
737 stop(
738 sprintf(
739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
740 ksea_cutoff_statistic
741 )
742 )
743 )
744
745 k <- k[selector < ksea_cutoff_threshold, ]
746 # save kinase names to ksea_asterisk_hash
747 for (kinase_name in k$kinase_gene) {
748 ksea_asterisk_hash[[kinase_name]] <- 1
749 }
750
751 db_write_table_overwrite <- (i_cntrst < 2)
752 db_write_table_append <- !db_write_table_overwrite
753 RSQLite::dbWriteTable(
754 conn = db,
755 name = "contrast_ksea_scores",
756 value = contrast_ksea_scores,
757 append = db_write_table_append
758 )
759 selector <- switch(
760 ksea_cutoff_statistic,
761 "FDR" = {
762 contrast_ksea_scores$fdr
763 },
764 "p.value" = {
765 contrast_ksea_scores$p_value
766 },
767 stop(
768 sprintf(
769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
770 ksea_cutoff_statistic
771 )
772 )
773 )
774 output_df <- contrast_ksea_scores[
775 selector < ksea_cutoff_threshold,
776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
777 "z_score", "p_value", "fdr")
778 ]
779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
780 output_df <- output_df[output_order, ]
781 colnames(output_df) <-
782 c(
783 colnames(output_df)[1],
784 colnames(output_df)[2],
785 "enrichment",
786 "m_s",
787 "z_score",
788 "p_value",
789 "fdr"
790 )
791 output_df$fdr <- sprintf("%0.4f", output_df$fdr)
792 output_df$p_value <- sprintf("%0.2e", output_df$p_value)
793 output_df$z_score <- sprintf("%0.2f", output_df$z_score)
794 output_df$m_s <- sprintf("%d", output_df$m_s)
795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
796 output_ncol <- ncol(output_df)
797 colnames(output_df) <-
798 c(
799 "Kinase",
800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
801 "Enrichment",
802 "Substrates",
803 "z-score",
804 "p-value",
805 "FDR"
806 )
807 selector <- switch(
808 ksea_cutoff_statistic,
809 "FDR" = {
810 rslt$score_list[[i]]$FDR
811 },
812 "p.value" = {
813 rslt$score_list[[i]]$p.value
814 },
815 stop(
816 sprintf(
817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
818 ksea_cutoff_statistic
819 )
820 )
821 )
822 if (sum(selector < ksea_cutoff_threshold) > 0) {
823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
825 data_frame_latex(
826 x = output_df,
827 justification = "l c c c c c c",
828 centered = TRUE,
829 caption = sprintf(
830 "\\text{%s}, %s < %s",
831 math_caption,
832 ksea_cutoff_statistic,
833 ksea_cutoff_threshold
834 ),
835 anchor = const_table_anchor_p
836 )
837 } else {
838 cat(
839 sprintf(
840 "\\break
841 No kinases had
842 \\(\\text{%s}_\\text{enrichment} < %s\\)
843 for contrast %s\\hfill\\break\n",
844 ksea_cutoff_statistic,
845 ksea_cutoff_threshold,
846 caption
847 )
848 )
849 }
850 }
851 }
852
853 # create_breaks is a helper for ksea_heatmap
854 create_breaks <- function(merged_scores) {
855 if (min(merged_scores, na.rm = TRUE) < -1.6) {
856 breaks_neg <- seq(-1.6, 0, length.out = 30)
857 breaks_neg <-
858 append(
859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
860 breaks_neg
861 )
862 breaks_neg <- sort(unique(breaks_neg))
863 } else {
864 breaks_neg <- seq(-1.6, 0, length.out = 30)
865 }
866 if (max(merged_scores, na.rm = TRUE) > 1.6) {
867 breaks_pos <- seq(0, 1.6, length.out = 30)
868 breaks_pos <-
869 append(
870 breaks_pos,
871 seq(1.6, max(merged_scores, na.rm = TRUE),
872 length.out = 10)
873 )
874 breaks_pos <- sort(unique(breaks_pos))
875 } else {
876 breaks_pos <- seq(0, 1.6, length.out = 30)
877 }
878 breaks_all <- unique(append(breaks_neg, breaks_pos))
879 mycol_neg <-
880 gplots::colorpanel(n = length(breaks_neg),
881 low = "blue",
882 high = "white")
883 mycol_pos <-
884 gplots::colorpanel(n = length(breaks_pos) - 1,
885 low = "white",
886 high = "red")
887 mycol <- unique(append(mycol_neg, mycol_pos))
888 color_breaks <- list(breaks_all, mycol)
889 return(color_breaks)
890 }
891
892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
893 draw_kseaapp_summary_heatmap <- function(
894 x,
895 sample_cluster,
896 merged_asterisk,
897 my_cex_row,
898 color_breaks,
899 margins,
900 ...
901 ) {
902 merged_scores <- x
903 if (!is.matrix(x)) {
904 cat(
905 paste0(
906 "No plot because \\texttt{typeof(x)} is '",
907 typeof(x),
908 "' rather than 'matrix'.\n\n"
909 )
910 )
911 } else if (nrow(x) < 2) {
912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
913 cat("\\begin{verbatim}\n")
914 str(x)
915 cat("\\end{verbatim}\n")
916 } else if (ncol(x) < 2) {
917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
918 cat("\\begin{verbatim}\n")
919 str(x)
920 cat("\\end{verbatim}\n")
921 } else {
922 gplots::heatmap.2(
923 x = merged_scores,
924 Colv = sample_cluster,
925 scale = "none",
926 cellnote = merged_asterisk,
927 notecol = "white",
928 cexCol = 0.9,
929 # Heuristically assign size of row labels
930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
931 srtCol = 45,
932 srtRow = 45,
933 notecex = 3 * my_cex_row,
934 col = color_breaks[[2]],
935 density.info = "none",
936 trace = "none",
937 breaks = color_breaks[[1]],
938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
939 lhei = c(0.4, 8.0, 1.1),
940 lwid = c(0.5, 3),
941 key = FALSE,
942 margins = margins,
943 ...
944 )
945 }
946 }
947
948 # Adapted from KSEAapp::KSEA.Heatmap
949 ksea_heatmap <- function(
950 # the data frame outputs from the KSEA.Scores() function, in list format
951 score_list,
952 # a character vector of all the sample names for heatmap annotation:
953 # - the names must be in the same order as the data in score_list
954 # - please avoid long names, as they may get cropped in the final image
955 sample_labels,
956 # character string of either "p.value" or "FDR" indicating the data column
957 # to use for marking statistically significant scores
958 stats,
959 # a numeric value between 0 and infinity indicating the min. number of
960 # substrates a kinase must have to be included in the heatmap
961 m_cutoff,
962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
963 # for indicating significant kinases in the heatmap
964 p_cutoff =
965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
966 # a binary input of TRUE or FALSE, indicating whether or not to perform
967 # hierarchical clustering of the sample columns
968 sample_cluster,
969 # a binary input of TRUE or FALSE, indicating whether or not to export
970 # the heatmap as a .png image into the working directory
971 export = FALSE,
972 # bottom and right margins; adjust as needed if contrast names are too long
973 margins = c(6, 20),
974 # print which kinases?
975 # - Mandatory argument, must be one of const_ksea_.*_kinases
976 which_kinases,
977 # additional arguments to gplots::heatmap.2, such as:
978 # - main: main title of plot
979 # - xlab: x-axis label
980 # - ylab: y-axis label
981 ...
982 ) {
983 filter_m <- function(dataset, m_cutoff) {
984 filtered <- dataset[(dataset$m >= m_cutoff), ]
985 return(filtered)
986 }
987 score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
988 for (i in seq_len(length(score_list_m))) {
989 names <- colnames(score_list_m[[i]])[c(2:7)]
990 colnames(score_list_m[[i]])[c(2:7)] <-
991 paste(names, i, sep = ".")
992 }
993 master <-
994 Reduce(
995 f = function(...) {
996 base::merge(..., by = "Kinase.Gene", all = FALSE)
997 },
998 x = score_list_m
999 )
1000
1001 row.names(master) <- master$Kinase.Gene
1002 columns <- as.character(colnames(master))
1003 merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
1004 colnames(merged_scores) <- sample_labels
1005 merged_stats <- as.matrix(master[, grep(stats, columns)])
1006 asterisk <- function(mtrx, p_cutoff) {
1007 new <- data.frame()
1008 for (i in seq_len(nrow(mtrx))) {
1009 for (j in seq_len(ncol(mtrx))) {
1010 my_value <- mtrx[i, j]
1011 if (!is.na(my_value) && my_value < p_cutoff) {
1012 new[i, j] <- "*"
1013 } else {
1014 new[i, j] <- ""
1015 }
1016 }
1017 }
1018 return(new)
1019 }
1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
1021
1022 # begin hack to print only significant rows
1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0
1024 all_rows <- rownames(merged_stats)
1025 names(asterisk_rows) <- all_rows
1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ]
1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ]
1030 # end hack to print only significant rows
1031
1032 row_list <- list()
1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
1034 row_list[[const_ksea_all_kinases]] <- all_rows
1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
1036
1037 i <- which_kinases
1038 my_row_names <- row_list[[i]]
1039 scrs <- merged_scores[my_row_names, ]
1040 stts <- merged_stats[my_row_names, ]
1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
1042
1043 color_breaks <- create_breaks(scrs)
1044 plot_height <- nrow(scrs) ^ 0.55
1045 plot_width <- ncol(scrs) ^ 0.7
1046 my_cex_row <- 0.25 * 16 / plot_height
1047 if (export == "TRUE") {
1048 png(
1049 "KSEA.Merged.Heatmap.png",
1050 width = plot_width * 300,
1051 height = 2 * plot_height * 300,
1052 res = 300,
1053 pointsize = 14
1054 )
1055 }
1056 draw_kseaapp_summary_heatmap(
1057 x = scrs,
1058 sample_cluster = sample_cluster,
1059 merged_asterisk = merged_asterisk,
1060 my_cex_row = my_cex_row,
1061 color_breaks = color_breaks,
1062 margins = margins
1063 )
1064 if (export == "TRUE") {
1065 dev.off()
1066 }
1067 return(my_row_names)
1068 }
1069
1070 # helper for heatmaps of phosphopeptide intensities
1071
1072 draw_intensity_heatmap <-
1073 function(
1074 m, # matrix with rownames already formatted
1075 cutoff, # cutoff used by hm_heading_function
1076 hm_heading_function, # construct and cat heading from m and cutoff
1077 hm_main_title, # main title for plot (drawn below heading)
1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram
1079 max_peptide_count # experimental:
1080 = intensity_hm_rows, # values of 50 and 75 worked well
1081 ... # passthru parameters for heatmap
1082 ) {
1083 peptide_count <- 0
1084 # emit the heading for the heatmap
1085 if (hm_heading_function(m, cutoff)) {
1086 peptide_count <- min(max_peptide_count, nrow(m))
1087 if (nrow(m) > 1) {
1088 m_margin <- m[peptide_count:1, ]
1089 # Margin setting was heuristically derived
1090 margins <-
1091 c(0.5, # col
1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row
1093 )
1094 }
1095 if (nrow(m) > 1) {
1096 tryCatch(
1097 {
1098 old_oma <- par("oma")
1099 par(cex.main = 0.6)
1100 # Heuristically determined character size adjustment formula
1101 char_contractor <-
1102 250000 / (
1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
1104 )
1105 heatmap(
1106 m[peptide_count:1, ],
1107 Rowv = if (suppress_row_dendrogram) NA else NULL,
1108 Colv = NA,
1109 cexRow = char_contractor,
1110 cexCol = char_contractor * 50 / max_peptide_count,
1111 scale = "row",
1112 margins = margins,
1113 main =
1114 "Unimputed, unnormalized log(intensities)",
1115 xlab = "",
1116 las = 1,
1117 ...
1118 )
1119 },
1120 error = function(e) {
1121 cat(
1122 sprintf(
1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n",
1124 e$message
1125 )
1126 )
1127 },
1128 finally = par(old_oma)
1129 )
1130 }
1131 }
1132 return(peptide_count)
1133 }
1134 ```
1135
1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
1137 cat("\\listoftables\n")
1138 ```
1139 # Purpose
1140
1141 To perform for phosphopeptides:
1142
1143 - imputation of missing values,
1144 - quantile normalization,
1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
1148 - The Human Proteome Database [http://hprd.org](http://hprd.org)
1149 - NetworKIN [http://networkin.science/](http://networkin.science/)
1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
207 1151
208 ```{r include = FALSE} 1152 ```{r include = FALSE}
1153
1154 ### GLOBAL VARIABLES
1155
1156 # parameters for KSEA
1157
1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic
1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold
1160 ksea_min_kinase_count <- params$kseaMinKinaseCount
1161
1162 ksea_heatmap_titles <- list()
1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
1164 sprintf(
1165 "Summary for all kinases enriched in one or more contrasts at %s < %s",
1166 ksea_cutoff_statistic,
1167 ksea_cutoff_threshold
1168 )
1169 ksea_heatmap_titles[[const_ksea_all_kinases]] <-
1170 "Summary figure for all contrasts and all kinases"
1171 ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
1172 sprintf(
1173 "Summary for all kinases not enriched at %s < %s in any contrast",
1174 ksea_cutoff_statistic,
1175 ksea_cutoff_threshold
1176 )
1177 # hash to hold names of significantly enriched kinases
1178 ksea_asterisk_hash <- new_env()
1179
1180 # READ PARAMETERS (mostly)
1181
1182 intensity_hm_rows <- params$intensityHeatmapRows
209 # Input Filename 1183 # Input Filename
210 input_file <- params$inputFile 1184 input_file <- params$inputFile
211 1185
212 # First data column - ideally, this could be detected via regexSampleNames, 1186 # First data column - ideally, this could be detected via regexSampleNames,
213 # but for now leave it as is. 1187 # but for now leave it as is.
218 } 1192 }
219 1193
220 # False discovery rate adjustment for ANOVA 1194 # False discovery rate adjustment for ANOVA
221 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
222 val_fdr <- 1196 val_fdr <-
223 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "") 1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
224 1198
225 if ( 1199 if (
226 ncol(val_fdr) != 1 || 1200 ncol(val_fdr) != 1 ||
227 sum(!is.numeric(val_fdr[, 1])) || 1201 sum(!is.numeric(val_fdr[, 1])) ||
228 sum(val_fdr[, 1] < 0) || 1202 sum(val_fdr[, 1] < 0) ||
233 val_fdr <- val_fdr[, 1] 1207 val_fdr <- val_fdr[, 1]
234 1208
235 #Imputed Data filename 1209 #Imputed Data filename
236 imputed_data_filename <- params$imputedDataFilename 1210 imputed_data_filename <- params$imputedDataFilename
237 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile 1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
238 1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata
239 #ANOVA data filename 1213
240 ``` 1214 ```
241 1215
242 ```{r echo = FALSE} 1216 ```{r echo = FALSE}
243 # Imputation method, should be one of 1217 # Imputation method, should be one of
244 # "random", "group-median", "median", or "mean" 1218 # "random", "group-median", "median", or "mean"
253 1227
254 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" 1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
255 regex_sample_names <- params$regexSampleNames 1229 regex_sample_names <- params$regexSampleNames
256 1230
257 # Regular expression to extract Sample Grouping from Sample Name; 1231 # Regular expression to extract Sample Grouping from Sample Name;
258 # if error occurs, compare sample_factor_levels and sample_name_matches 1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches
259 # to see if groupings/pairs line up 1233 # to see if groupings/pairs line up
260 # e.g., "(\\d+)" 1234 # e.g., "(\\d+)"
261 regex_sample_grouping <- params$regexSampleGrouping 1235 regex_sample_grouping <- params$regexSampleGrouping
262 1236
1237 one_way_all_categories_fname <- params$oneWayManyCategories
1238 one_way_all_categories <- try_catch_w_e(
1239 match.fun(one_way_all_categories_fname))
1240 if (!is.function(one_way_all_categories$value)) {
1241 write("fatal error for parameter oneWayManyCategories:", stderr())
1242 write(one_way_all_categories$value$message, stderr())
1243 if (sys.nframe() > 0) quit(save = "no", status = 1)
1244 stop("Cannot continue. Goodbye.")
1245 }
1246 one_way_all_categories <- one_way_all_categories$value
1247
1248 one_way_two_categories_fname <- params$oneWayManyCategories
1249 one_way_two_categories <- try_catch_w_e(
1250 match.fun(one_way_two_categories_fname))
1251 if (!is.function(one_way_two_categories$value)) {
1252 cat("fatal error for parameter oneWayTwoCategories: \n")
1253 cat(one_way_two_categories$value$message, fill = TRUE)
1254 if (sys.nframe() > 0) quit(save = "no", status = 1)
1255 stop("Cannot continue. Goodbye.")
1256 }
1257 one_way_two_categories <- one_way_two_categories$value
1258
1259 preproc_db <- params$preprocDb
1260 ksea_app_prep_db <- params$kseaAppPrepDb
1261 result <- file.copy(
1262 from = preproc_db,
1263 to = ksea_app_prep_db,
1264 overwrite = TRUE
1265 )
1266 if (!result) {
1267 write(
1268 sprintf(
1269 "fatal error copying initial database '%s' to output '%s'",
1270 preproc_db,
1271 ksea_app_prep_db,
1272 ),
1273 stderr()
1274 )
1275 if (sys.nframe() > 0) quit(save = "no", status = 1)
1276 stop("Cannot continue. Goodbye.")
1277 }
263 ``` 1278 ```
264 1279
265 ```{r echo = FALSE} 1280 ```{r echo = FALSE}
266 ### READ DATA 1281 ### READ DATA
267
268 library(data.table)
269 1282
270 # read.table reads a file in table format and creates a data frame from it. 1283 # read.table reads a file in table format and creates a data frame from it.
271 # - note that `quote = ""` means that quotation marks are treated literally. 1284 # - note that `quote = ""` means that quotation marks are treated literally.
272 full_data <- read.table( 1285 full_data <- read.table(
273 file = input_file, 1286 file = input_file,
274 sep = "\t", 1287 sep = "\t",
275 header = T, 1288 header = TRUE,
276 quote = "", 1289 quote = "",
277 check.names = FALSE 1290 check.names = FALSE
278 ) 1291 )
279 ``` 1292 ```
280 1293
281 ## Extract Sample Names and Factor Levels 1294 # Extract Sample Names and Treatment Levels
282 1295
283 Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2. 1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
284 1297
285 ```{r echo = FALSE, results = 'asis'} 1298 ```{r echo = FALSE, results = 'asis'}
286 1299
287 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) 1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
288 1301
295 } 1308 }
296 1309
297 cat( 1310 cat(
298 sprintf( 1311 sprintf(
299 paste( 1312 paste(
300 "\n\nPeptide-intensity data for each sample is", 1313 "\n\nThe input data file has peptide-intensity data for each sample",
301 "in one of columns %d through %d.\n\n" 1314 "in one of columns %d through %d.\n\n"
302 ), 1315 ),
303 min(data_column_indices), 1316 min(data_column_indices),
304 max(data_column_indices) 1317 max(data_column_indices)
305 ) 1318 )
306 ) 1319 )
307 1320
308 # Write column names as a LaTeX enumerated list. 1321 # Write column names as a LaTeX enumerated list.
309 column_name_df <- data.frame( 1322 column_name_df <- data.frame(
310 column = seq_len(length(colnames(full_data))), 1323 column = seq_len(length(colnames(full_data))),
311 name = colnames(full_data) 1324 name = paste0("\\verb@", colnames(full_data), "@")
312 ) 1325 )
313 data_frame_latex( 1326 data_frame_latex(
314 x = column_name_df, 1327 x = column_name_df,
315 justification = "l l", 1328 justification = "l l",
316 centered = TRUE, 1329 centered = TRUE,
317 caption = "Input data column name", 1330 caption = "Input data column names",
318 anchor = const_table_anchor 1331 anchor = const_table_anchor_bp,
1332 underscore_whack = FALSE
319 ) 1333 )
320 1334
321 ``` 1335 ```
322 1336
323 ```{r echo = FALSE, results = 'asis'} 1337 ```{r echo = FALSE, results = 'asis'}
324 #```{r echo = FALSE, results = 'asis'}
325 quant_data <- full_data[first_data_column:length(full_data)] 1338 quant_data <- full_data[first_data_column:length(full_data)]
326 quant_data[quant_data == 0] <- NA 1339 quant_data[quant_data == 0] <- NA
327 rownames(quant_data) <- full_data$Phosphopeptide 1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
328 # Get factors -> group replicates (as indicated by terminal letter) 1341 # Extract factors and trt-replicates using regular expressions.
329 # by the preceding digits; 1342 # Typically:
330 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" 1343 # regex_sample_names is "\\.\\d+[A-Z]$"
331 # get factors -> 1344 # regex_sample_grouping is "\\d+"
332 # group runs (samples) by ignoring terminal [A-Z] in sample names 1345 # This would distinguish trt-replicates by terminal letter [A-Z]
333 # e.g. 1346 # in sample names and group them into trts by the preceding digits.
1347 # e.g.:
334 # group .1A .1B .1C into group 1; 1348 # group .1A .1B .1C into group 1;
335 # group .2A .2B .2C, into group 2; 1349 # group .2A .2B .2C, into group 2;
336 # etc. 1350 # etc.
337 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) 1351 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE)
338 sample_name_matches <- regmatches(names(quant_data), m) 1352 sample_name_matches <- regmatches(names(quant_data), m)
339 colnames(quant_data) <- sample_name_matches 1353 colnames(quant_data) <- sample_name_matches
340 1354
341 write_debug_file(quant_data) 1355 write_debug_file(quant_data)
342 1356
343 m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) 1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
344 sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2)) 1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
345 number_of_samples <- length(sample_name_matches) 1359 number_of_samples <- length(sample_name_matches)
346 sample_factor_df <- data.frame( 1360 sample_treatment_df <- data.frame(
347 sample = sample_name_matches, 1361 level = sample_treatment_levels,
348 level = sample_factor_levels 1362 sample = sample_name_matches
349 ) 1363 )
350 data_frame_latex( 1364 data_frame_latex(
351 x = sample_factor_df, 1365 x = sample_treatment_df,
352 justification = "c c", 1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
353 centered = TRUE, 1367 centered = TRUE,
354 caption = "Factor level", 1368 caption = "Treatment levels",
355 anchor = const_table_anchor 1369 anchor = const_table_anchor_tbp,
356 ) 1370 underscore_whack = FALSE
357 ``` 1371 )
1372 ```
1373
358 ```{r echo = FALSE, results = 'asis'} 1374 ```{r echo = FALSE, results = 'asis'}
359 cat("\\newpage\n") 1375 cat("\\newpage\n")
360 ``` 1376 ```
361 1377
362 ### Are the log-transformed sample distributions similar? 1378 ## Are the log-transformed sample distributions similar?
363 1379
364 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
365 1381
366 quant_data[quant_data == 0] <- NA #replace 0 with NA 1382 quant_data[quant_data == 0] <- NA #replace 0 with NA
367 quant_data_log <- log10(quant_data) 1383 quant_data_log <- log10(quant_data)
392 cat("\n\n\n") 1408 cat("\n\n\n")
393 1409
394 ``` 1410 ```
395 1411
396 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} 1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
397 library(ggplot2)
398 if (nrow(quant_data_log) > 1) { 1413 if (nrow(quant_data_log) > 1) {
399 quant_data_log_stack <- stack(quant_data_log) 1414 quant_data_log_stack <- stack(quant_data_log)
400 ggplot( 1415 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
401 quant_data_log_stack, 1416 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
402 aes(x = values) 1417 ggplot2::ylab("Probability density") +
403 ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + 1418 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
404 ylab("Probability density") +
405 geom_density(
406 aes(group = ind, colour = ind),
407 na.rm = TRUE
408 )
409 } else { 1419 } else {
410 cat("No density plot because there are too few peptides.\n\n") 1420 cat("No density plot because there are too few peptides.\n\n")
411 } 1421 }
412 ``` 1422 ```
413 1423
414 ### Globally, are peptide intensities are approximately unimodal? 1424 ## Globally, are peptide intensities are approximately unimodal?
415 1425
416 <!-- 1426 <!--
417 # bquote could be used as an alternative to latex2exp::TeX below particularly 1427 # bquote could be used as an alternative to latex2exp::TeX below particularly
418 # and when plotting math expressions generally, at the expense of mastering 1428 # and when plotting math expressions generally, at the expense of mastering
419 # another syntax, which hardly seems worthwhile when I need to use TeX 1429 # another syntax, which hardly seems worthwhile when I need to use TeX
442 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), 1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
443 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") 1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
444 ) 1454 )
445 ``` 1455 ```
446 1456
447 ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values: 1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values
448 1458
449 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} 1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
450 # determine quantile 1460 # determine quantile
451 q1 <- quantile(logvalues, probs = mean_percentile)[1] 1461 q1 <- quantile(logvalues, probs = mean_percentile)[1]
452 1462
453 # determine standard deviation of quantile to impute
454 sd_finite <- function(x) {
455 ok <- is.finite(x)
456 sd(x[ok])
457 }
458 # 1 = row of matrix (ie, phosphopeptide) 1463 # 1 = row of matrix (ie, phosphopeptide)
459 sds <- apply(quant_data_log, 1, sd_finite) 1464 sds <- apply(quant_data_log, 1, sd_finite)
460 if (sum(!is.na(sds)) > 2) { 1465 if (sum(!is.na(sds)) > 2) {
461 plot( 1466 plot(
462 density(sds, na.rm = T) 1467 density(sds, na.rm = TRUE)
463 , main = "Smoothed estimated probability density vs. std. deviation" 1468 , main = "Smoothed estimated probability density vs. std. deviation"
464 , sub = "(probability estimation made with Gaussian smoothing)" 1469 , sub = "(probability estimation made with Gaussian smoothing)"
465 , ylab = "Probability density" 1470 , ylab = "Probability density"
466 ) 1471 )
467 } else { 1472 } else {
488 ```{r echo = FALSE} 1493 ```{r echo = FALSE}
489 1494
490 # prep for trt-median based imputation 1495 # prep for trt-median based imputation
491 1496
492 ``` 1497 ```
493 ## Impute Missing Values 1498 # Impute Missing Values
494 1499
495 ```{r echo = FALSE} 1500 ```{r echo = FALSE}
496 1501
497 imp_smry_potential_before <- length(logvalues) 1502 imp_smry_pot_peptides_before <- nrow(quant_data_log)
498 imp_smry_missing_before <- number_to_impute 1503 imp_smry_missing_values_before <- number_to_impute
499 imp_smry_pct_missing <- pct_missing_values 1504 imp_smry_pct_missing <- pct_missing_values
500 1505
501 ``` 1506 ```
502 1507
503 ```{r echo = FALSE} 1508 ```{r echo = FALSE}
504 #Determine number of cells to impute 1509 #Determine number of cells to impute
505 1510
506 ``` 1511 ```
507 ```{r echo = FALSE} 1512 ```{r echo = FALSE}
508 1513
509 #Impute data
510 quant_data_imp <- quant_data
511
512 # Identify which values are missing and need to be imputed 1514 # Identify which values are missing and need to be imputed
513 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) 1515 ind <- which(is.na(quant_data), arr.ind = TRUE)
514 1516
515 ``` 1517 ```
516 ```{r echo = FALSE, results = 'asis'} 1518 ```{r echo = FALSE, results = 'asis'}
517 1519
518 # Apply imputation 1520 # Apply imputation
519 switch( 1521 switch(
520 imputation_method 1522 imputation_method
521 , "group-median" = { 1523 , "group-median" = {
1524 quant_data_imp <- quant_data
522 imputation_method_description <- 1525 imputation_method_description <-
523 paste("Substitute missing value with", 1526 paste("Substitute missing value with",
524 "median peptide intensity for sample group.\n" 1527 "median peptide-intensity for sample group.\n"
525 ) 1528 )
526 sample_level_integers <- as.integer(sample_factor_levels) 1529 sample_level_integers <- as.integer(sample_treatment_levels)
527 for (i in seq_len(length(levels(sample_factor_levels)))) { 1530 # Take the accurate ln(x+1) because the data are log-normally distributed
1531 # and because median can involve an average of two measurements.
1532 quant_data_imp <- log1p(quant_data_imp)
1533 for (i in seq_len(length(levels(sample_treatment_levels)))) {
1534 # Determine the columns for this factor-level
528 level_cols <- i == sample_level_integers 1535 level_cols <- i == sample_level_integers
529 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) 1536 # Extract those columns
530 quant_data_imp[ind, level_cols] <- 1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE]
531 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] 1538 # assign to ind the row-column pairs corresponding to each NA
1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE)
1540 # No group-median exists if there is only one sample
1541 # a given ppep has no measurement; otherwise, proceed.
1542 if (ncol(lvlsbst) > 1) {
1543 the_centers <-
1544 apply(lvlsbst, 1, median, na.rm = TRUE)
1545 for (j in seq_len(nrow(lvlsbst))) {
1546 for (k in seq_len(ncol(lvlsbst))) {
1547 if (is.na(lvlsbst[j, k])) {
1548 lvlsbst[j, k] <- the_centers[j]
1549 }
1550 }
1551 }
1552 quant_data_imp[, level_cols] <- lvlsbst
1553 }
532 } 1554 }
1555 # Take the accurate e^x - 1 to match scaling of original input.
1556 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
533 good_rows <- !is.na(rowMeans(quant_data_imp)) 1557 good_rows <- !is.na(rowMeans(quant_data_imp))
534 } 1558 }
535 , "median" = { 1559 , "median" = {
1560 quant_data_imp <- quant_data
536 imputation_method_description <- 1561 imputation_method_description <-
537 paste("Substitute missing value with", 1562 paste("Substitute missing value with",
538 "median peptide intensity across all sample classes.\n" 1563 "median peptide-intensity across all sample classes.\n"
539 ) 1564 )
540 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] 1565 # Take the accurate ln(x+1) because the data are log-normally distributed
541 good_rows <- !is.na(rowMeans(quant_data_imp)) 1566 # and because median can involve an average of two measurements.
1567 quant_data_imp <- log1p(quant_data_imp)
1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]]
1569 # Take the accurate e^x - 1 to match scaling of original input.
1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1571 good_rows <- !is.nan(rowMeans(quant_data_imp))
542 } 1572 }
543 , "mean" = { 1573 , "mean" = {
1574 quant_data_imp <- quant_data
544 imputation_method_description <- 1575 imputation_method_description <-
545 paste("Substitute missing value with", 1576 paste("Substitute missing value with",
546 "mean peptide intensity across all sample classes.\n" 1577 "geometric-mean peptide intensity across all sample classes.\n"
547 ) 1578 )
548 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] 1579 # Take the accurate ln(x+1) because the data are log-normally distributed,
549 good_rows <- !is.na(rowMeans(quant_data_imp)) 1580 # so arguments to mean should be previously transformed.
1581 # this will have to be
1582 quant_data_imp <- log1p(quant_data_imp)
1583 # Assign to NA cells the mean for the row
1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]]
1585 # Take the accurate e^x - 1 to match scaling of original input.
1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp))
1587 good_rows <- !is.nan(rowMeans(quant_data_imp))
550 } 1588 }
551 , "random" = { 1589 , "random" = {
552 m1 <- median(sds, na.rm = T) * sd_percentile #sd to be used is the median sd 1590 quant_data_imp <- quant_data
1591 m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd
553 # If you want results to be reproducible, you will want to call 1592 # If you want results to be reproducible, you will want to call
554 # base::set.seed before calling stats::rnorm 1593 # base::set.seed before calling stats::rnorm
555 imputation_method_description <- 1594 imputation_method_description <-
556 paste("Substitute each missing value with random intensity", 1595 paste("Substitute each missing value with random intensity",
557 sprintf( 1596 sprintf(
563 100 * mean_percentile)) 1602 100 * mean_percentile))
564 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", 1603 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
565 sd_percentile)) 1604 sd_percentile))
566 quant_data_imp[ind] <- 1605 quant_data_imp[ind] <-
567 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) 1606 10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
568 good_rows <- !is.na(rowMeans(quant_data_imp)) 1607 quant_data_imp_ln <- log1p(quant_data_imp)
1608 good_rows <- !is.nan(rowMeans(quant_data_imp))
569 } 1609 }
570 ) 1610 )
1611 quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e
571 1612
572 if (length(good_rows) < 1) { 1613 if (length(good_rows) < 1) {
573 print("ERROR: Cannot impute data; there are no good rows!") 1614 print("ERROR: Cannot impute data; there are no good rows!")
574 return(-1) 1615 return(-1)
575 } 1616 }
579 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) 1620 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description)
580 ``` 1621 ```
581 1622
582 ```{r echo = FALSE} 1623 ```{r echo = FALSE}
583 1624
584 imp_smry_potential_after <- sum(good_rows) 1625 imp_smry_pot_peptides_after <- sum(good_rows)
585 imp_smry_rejected_after <- sum(!good_rows) 1626 imp_smry_rejected_after <- sum(!good_rows)
586 imp_smry_missing_after <- sum(is.na(quant_data_imp[good_rows, ])) 1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ]))
587 ``` 1628 ```
588 ```{r echo = FALSE, results = 'asis'} 1629 ```{r echo = FALSE, results = 'asis'}
589 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf 1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf
590 tabular_lines <- paste( 1631 tabular_lines_fmt <- paste(
591 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) 1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page)
592 " \\caption{Imputation Results}", 1633 " \\caption{Imputation Results}",
593 " \\centering", # \centering centers the table on the page 1634 " \\centering", # \centering centers the table on the page
594 " \\begin{tabular}{l c c c}", 1635 " \\begin{tabular}{l c c c}",
595 " \\hline\\hline", 1636 " \\hline\\hline",
604 "\\end{table}", 1645 "\\end{table}",
605 sep = "\n" 1646 sep = "\n"
606 ) 1647 )
607 tabular_lines <- 1648 tabular_lines <-
608 sprintf( 1649 sprintf(
609 tabular_lines, 1650 tabular_lines_fmt,
610 imp_smry_potential_before, 1651 imp_smry_pot_peptides_before,
611 imp_smry_missing_before, 1652 imp_smry_missing_values_before,
612 imp_smry_pct_missing, 1653 imp_smry_pct_missing,
613 "%", 1654 "%",
614 imp_smry_potential_after, 1655 imp_smry_pot_peptides_after,
615 imp_smry_missing_after, 1656 imp_smry_missing_values_after,
616 imp_smry_rejected_after 1657 imp_smry_rejected_after
617 ) 1658 )
618 cat(tabular_lines) 1659 cat(tabular_lines)
619 ``` 1660 ```
620 ```{r echo = FALSE} 1661 ```{r echo = FALSE}
622 1663
623 # Zap rows where imputation was ineffective 1664 # Zap rows where imputation was ineffective
624 full_data <- full_data [good_rows, ] 1665 full_data <- full_data [good_rows, ]
625 quant_data <- quant_data [good_rows, ] 1666 quant_data <- quant_data [good_rows, ]
626 1667
1668 quant_data_imp <- quant_data_imp[good_rows, ]
627 write_debug_file(quant_data_imp) 1669 write_debug_file(quant_data_imp)
628
629 quant_data_imp <- quant_data_imp[good_rows, ]
630 quant_data_imp_good_rows <- quant_data_imp 1670 quant_data_imp_good_rows <- quant_data_imp
631 1671
632 write_debug_file(quant_data_imp_good_rows) 1672 write_debug_file(quant_data_imp_good_rows)
633 ``` 1673 ```
634 1674
646 as.matrix( 1686 as.matrix(
647 log10(quant_data_imp[!is.na(quant_data)]) 1687 log10(quant_data_imp[!is.na(quant_data)])
648 ) 1688 )
649 ) 1689 )
650 1690
651 if (sum(!is.na(d_combined)) > 2 && sum(!is.na(d_original)) > 2) { 1691 if (sum(!is.na(d_original)) > 2) {
1692 d_original <- density(d_original)
1693 } else {
1694 can_plot_before_after_imp <- FALSE
1695 }
1696 if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) {
652 d_combined <- density(d_combined) 1697 d_combined <- density(d_combined)
653 d_original <- density(d_original)
654 } else { 1698 } else {
655 can_plot_before_after_imp <- FALSE 1699 can_plot_before_after_imp <- FALSE
656 } 1700 }
657 1701
658 if (sum(is.na(quant_data)) > 0) { 1702 if (sum(is.na(quant_data)) > 0) {
661 as.numeric( 1705 as.numeric(
662 as.matrix( 1706 as.matrix(
663 log10(quant_data_imp[is.na(quant_data)]) 1707 log10(quant_data_imp[is.na(quant_data)])
664 ) 1708 )
665 ) 1709 )
666 if (sum(!is.na(d_combined)) > 2) { 1710 if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) {
667 d_imputed <- (density(d_imputed)) 1711 d_imputed <- (density(d_imputed))
668 } else { 1712 } else {
669 can_plot_before_after_imp <- FALSE 1713 can_plot_before_after_imp <- FALSE
670 } 1714 }
671 } else { 1715 } else {
674 } 1718 }
675 1719
676 ``` 1720 ```
677 1721
678 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
1723 zero_sd_rownames <-
1724 rownames(quant_data_imp)[
1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0)
1726 ]
1727
1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) {
1729 stop("All peptides have zero standard deviation. Cannot continue.")
1730 }
1731 if (length(zero_sd_rownames) > 0) {
1732 cat(
1733 sprintf("%d peptides with zero variance were removed from statistical consideration",
1734 length(zero_sd_rownames)
1735 )
1736 )
1737 zap_named_rows <- function(df, nms) {
1738 return(df[!(row.names(df) %in% nms), ])
1739 }
1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames)
1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames)
1742 full_data <- zap_named_rows(full_data, zero_sd_rownames)
1743 }
1744
679 if (sum(is.na(quant_data)) > 0) { 1745 if (sum(is.na(quant_data)) > 0) {
680 cat("\\leavevmode\\newpage\n") 1746 cat("\\leavevmode\\newpage\n")
681 # data visualization 1747 # data visualization
682 old_par <- par( 1748 old_par <- par(
683 mai = par("mai") + c(0.5, 0, 0, 0) 1749 mai = par("mai") + c(0.5, 0, 0, 0)
684 ) 1750 )
1751 # Copy quant data to x
685 x <- quant_data 1752 x <- quant_data
686 x <- blue_dots <- x / x 1753 # x gets to have values of:
687 blue_dots <- log10(blue_dots * quant_data) 1754 # - NA for observed values
1755 # - 1 for missing values
688 x[is.na(x)] <- 0 1756 x[is.na(x)] <- 0
689 x[x == 1] <- NA 1757 x[x > 1] <- NA
690 x[x == 0] <- 1 1758 x[x == 0] <- 1
691 quant_data_imp_log10 <- 1759 # Log-transform imputed data
692 log10(quant_data_imp) 1760 # update variable because rows may have been eliminated from quant_data_imp
1761 quant_data_imp_log10 <- log10(quant_data_imp)
693 1762
694 write_debug_file(quant_data_imp_log10) 1763 write_debug_file(quant_data_imp_log10)
695 1764
1765 # Set blue_dots to log of quant data or NA for NA quant data
1766 blue_dots <- log10(quant_data)
1767 # Set red_dots to log of imputed data or NA for observed quant data
696 red_dots <- quant_data_imp_log10 * x 1768 red_dots <- quant_data_imp_log10 * x
1769
697 count_red <- sum(!is.na(red_dots)) 1770 count_red <- sum(!is.na(red_dots))
698 count_blue <- sum(!is.na(blue_dots)) 1771 count_blue <- sum(!is.na(blue_dots))
699 ylim_save <- ylim <- c( 1772 ylim_save <- ylim <- c(
700 min(red_dots, blue_dots, na.rm = TRUE), 1773 min(red_dots, blue_dots, na.rm = TRUE),
701 max(red_dots, blue_dots, na.rm = TRUE) 1774 max(red_dots, blue_dots, na.rm = TRUE)
713 boxplot( 1786 boxplot(
714 blue_dots 1787 blue_dots
715 , las = 1 # "always horizontal" 1788 , las = 1 # "always horizontal"
716 , col = const_boxplot_fill 1789 , col = const_boxplot_fill
717 , ylim = ylim 1790 , ylim = ylim
718 , main = "Peptide intensities before and after imputation" 1791 , main = "Peptide intensities after eliminating unusable peptides"
719 , sub = boxplot_sub 1792 , sub = boxplot_sub
720 , xlab = "Sample" 1793 , xlab = "Sample"
721 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
722 ) 1795 )
723 1796
744 col = "red", # Color of the symbol 1817 col = "red", # Color of the symbol
745 vertical = TRUE, # Vertical mode 1818 vertical = TRUE, # Vertical mode
746 add = TRUE # Add it over 1819 add = TRUE # Add it over
747 ) 1820 )
748 1821
749 } else { 1822 }
750 # violin plot 1823 if (TRUE) {
1824 # show measured values in blue on left half-violin plot
751 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") 1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
752 vioplot::vioplot( 1826 vioplot::vioplot(
753 x = lapply(blue_dots, function(x) x[!is.na(x)]), 1827 x = lapply(blue_dots, function(x) x[!is.na(x)]),
754 col = "lightblue1", 1828 col = "lightblue1",
755 side = "left", 1829 side = "left",
758 main = "Distributions of observed and imputed data", 1832 main = "Distributions of observed and imputed data",
759 sub = "Light blue = observed data; Pink = imputed data", 1833 sub = "Light blue = observed data; Pink = imputed data",
760 xlab = "Sample", 1834 xlab = "Sample",
761 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
762 ) 1836 )
1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)])
1838 cols_to_delete <- c()
1839 for (ix in seq_len(length(red_violins))) {
1840 if (length(red_violins[[ix]]) < 1) {
1841 cols_to_delete <- c(cols_to_delete, ix)
1842 }
1843 }
1844 # destroy any unimputable columns
1845 if (!is.null(cols_to_delete)) {
1846 red_violins <- red_violins[-cols_to_delete]
1847 }
1848 # plot imputed values in red on right half-violin plot
763 vioplot::vioplot( 1849 vioplot::vioplot(
764 x = lapply(red_dots, function(x) x[!is.na(x)]), 1850 x = red_violins,
765 col = "lightpink1", 1851 col = "lightpink1",
766 side = "right", 1852 side = "right",
767 plotCentre = "line", 1853 plotCentre = "line",
768 add = T 1854 add = TRUE
769 ) 1855 )
770 } 1856 }
771 1857
772 par(old_par) 1858 par(old_par)
773 1859
774 # density plot 1860 # density plot
775 cat("\\leavevmode\n\n\n\n\n\n\n") 1861 cat("\\leavevmode\n\n\n\n\n\n\n")
776 if (can_plot_before_after_imp) { 1862 if (can_plot_before_after_imp) {
777 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) 1863 ylim <- c(
1864 0,
1865 max(
1866 if (is.list(d_combined)) d_combined$y else d_combined,
1867 if (is.list(d_original)) d_original$y else d_original,
1868 if (is.list(d_imputed)) d_imputed$y else d_imputed,
1869 na.rm = TRUE
1870 )
1871 )
778 plot( 1872 plot(
779 d_combined, 1873 d_combined,
780 ylim = ylim, 1874 ylim = ylim,
781 sub = 1875 sub =
782 paste( 1876 paste(
797 } 1891 }
798 cat("\\leavevmode\\newpage\n") 1892 cat("\\leavevmode\\newpage\n")
799 } 1893 }
800 ``` 1894 ```
801 1895
802 ## Perform Quantile Normalization 1896 # Perform Quantile Normalization
1897
1898 The excellent `normalize.quantiles` function from
1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)*
1900 performs "quantile normalization" as described Bolstad *et al.* (2003),
1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)*
1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*,
1903 i.e., it assumes that the goal is to detect
1904 subtle differences among grossly similar samples (having similar distributions)
1905 by equailzing intra-quantile quantitations.
1906 Unfortunately, one software library upon which it depends
1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)*
1908 that requires that a specific, non-concurrent version of the library be
1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is:
1910 ```
1911 conda install bioconductor-preprocesscore openblas=0.3.3
1912 ```
1913
803 1914
804 <!-- 1915 <!--
805 # Apply quantile normalization using preprocessCore::normalize.quantiles 1916 # Apply quantile normalization using preprocessCore::normalize.quantiles
806 # --- 1917 # ---
807 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html 1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
812 # ``` 1923 # ```
813 # conda installation (necessary because of a bug in recent openblas): 1924 # conda installation (necessary because of a bug in recent openblas):
814 # conda install bioconductor-preprocesscore openblas=0.3.3 1925 # conda install bioconductor-preprocesscore openblas=0.3.3
815 # ... 1926 # ...
816 # --- 1927 # ---
817 # normalize.quantiles {preprocessCore} -- Quantile Normalization 1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization
818 # 1929 #
819 # Description: 1930 # Description:
820 # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. 1931 # Using a normalization based upon quantiles, this function normalizes a
1932 # matrix of probe level intensities.
1933 #
1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the
1935 # assumption that the data is missing at random.
821 # 1936 #
822 # Usage: 1937 # Usage:
823 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) 1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE)
824 # 1939 #
825 # Arguments: 1940 # Arguments:
857 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
858 # http://bmbolstad.com/misc/normalize/normalize.html 1973 # http://bmbolstad.com/misc/normalize/normalize.html
859 # ... 1974 # ...
860 --> 1975 -->
861 ```{r echo = FALSE, results = 'asis'} 1976 ```{r echo = FALSE, results = 'asis'}
862 library(preprocessCore)
863 1977
864 if (nrow(quant_data_imp) > 0) { 1978 if (nrow(quant_data_imp) > 0) {
865 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) 1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles(
1980 as.matrix(quant_data_imp), keep.names = TRUE
1981 )
866 } else { 1982 } else {
867 quant_data_imp_qn <- as.matrix(quant_data_imp) 1983 quant_data_imp_qn <- as.matrix(quant_data_imp)
868 } 1984 }
869 1985
870 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) 1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
871 names(quant_data_imp_qn) <- names(quant_data_imp)
872 1987
873 write_debug_file(quant_data_imp_qn) 1988 write_debug_file(quant_data_imp_qn)
874 1989
875 quant_data_imp_qn_log <- log10(quant_data_imp_qn) 1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn)
876 rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp)
877 1991
878 write_debug_file(quant_data_imp_qn_log) 1992 write_debug_file(quant_data_imp_qn_log)
879 1993
880 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) 1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
881 1995
882 any_nan <- function(x) {
883 !any(x == "NaN")
884 }
885 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) 1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
886 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] 1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
1998
1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
887 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) 2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
888 rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel]
889 2001
890 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) 2002 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
891 rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp)
892 2003
893 write_debug_file(quant_data_imp_qn_ls) 2004 write_debug_file(quant_data_imp_qn_ls)
894 write_debug_file(quant_data_imp_qn_ls2) 2005 write_debug_file(quant_data_imp_qn_ls2)
895 2006
896 #output quantile normalized data 2007 # Create data.frame used by ANOVA analysis
897 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) 2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
898 write.table(
899 data_table_imp_qn_lt,
900 file = imp_qn_lt_data_filenm,
901 sep = "\t",
902 col.names = TRUE,
903 row.names = FALSE,
904 quote = FALSE
905 )
906
907 ``` 2009 ```
908 2010
909 <!-- ACE insertion begin --> 2011 <!-- ACE insertion begin -->
910 ### Checking that normalized, imputed, log-transformed sample distributions are similar: 2012 ## Are normalized, imputed, log-transformed sample distributions similar?
911 2013
912 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} 2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
913 2015
914 # Save unimputed quant_data_log for plotting below 2016 # Save unimputed quant_data_log for plotting below
915 unimputed_quant_data_log <- quant_data_log 2017 unimputed_quant_data_log <- quant_data_log
916 2018
917 # log10 transform (after preparing for zero values, 2019 # log10 transform (after preparing for zero values,
918 # which should never happen...) 2020 # which should never happen...)
919 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 2021 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
920 quant_data_log <- log10(quant_data_imp_qn) 2022 quant_data_log <- log10(quant_data_imp_qn)
921
922 # Output with imputed, un-normalized data
923
924 data_table_imputed <- cbind(full_data[1:9], quant_data_imp)
925 write.table(
926 data_table_imputed
927 , file = imputed_data_filename
928 , sep = "\t"
929 , col.names = TRUE
930 , row.names = FALSE
931 , quote = FALSE
932 )
933 2023
934 how_many_peptides <- nrow(quant_data_log) 2024 how_many_peptides <- nrow(quant_data_log)
935 2025
936 if ((how_many_peptides) > 0) { 2026 if ((how_many_peptides) > 0) {
937 cat( 2027 cat(
968 ``` 2058 ```
969 2059
970 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} 2060 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
971 if (nrow(quant_data_log) > 1) { 2061 if (nrow(quant_data_log) > 1) {
972 quant_data_log_stack <- stack(quant_data_log) 2062 quant_data_log_stack <- stack(quant_data_log)
973 ggplot( 2063 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
974 quant_data_log_stack, 2064 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
975 aes(x = values) 2065 ggplot2::ylab("Probability density") +
976 ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + 2066 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
977 ylab("Probability density") +
978 geom_density(
979 aes(group = ind, colour = ind),
980 na.rm = TRUE
981 )
982 } else { 2067 } else {
983 cat("No density plot because there are fewer than two peptides to plot.\n\n") 2068 cat("No density plot because there are fewer than two peptides to plot.\n\n")
984 } 2069 }
985 ``` 2070 ```
986 ```{r echo = FALSE, results = 'asis'} 2071 ```{r echo = FALSE, results = 'asis'}
987 cat("\\leavevmode\\newpage\n") 2072 cat("\\leavevmode\\newpage\n")
988 ``` 2073 ```
989 2074
990 ## Perform ANOVA Filters 2075 # ANOVA Analysis
991 2076
992 ```{r, echo = FALSE} 2077 ```{r, echo = FALSE}
993 # Make new data frame containing only Phosphopeptides 2078 # Make new data frame containing only Phosphopeptides
994 # to connect preANOVA to ANOVA (connect_df) 2079 # to connect preANOVA to ANOVA (connect_df)
995 connect_df <- data.frame( 2080 connect_df <- data.frame(
998 ) 2083 )
999 colnames(connect_df) <- c("Phosphopeptide", "Intensity") 2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity")
1000 ``` 2085 ```
1001 2086
1002 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} 2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
1003 count_of_factor_levels <- length(levels(sample_factor_levels)) 2088 count_of_treatment_levels <- length(levels(sample_treatment_levels))
1004 if (count_of_factor_levels < 2) { 2089 if (count_of_treatment_levels < 2) {
1005 nuke_control_sequences <- 2090 nuke_control_sequences <-
1006 function(s) { 2091 function(s) {
1007 s <- gsub("[\\]", "xyzzy_plugh", s) 2092 s <- gsub("[\\]", "xyzzy_plugh", s)
1008 s <- gsub("[$]", "\\\\$", s) 2093 s <- gsub("[$]", "\\\\$", s)
1009 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) 2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s)
1052 ) 2137 )
1053 2138
1054 cat("\n\n\n") 2139 cat("\n\n\n")
1055 cat("Sample group assignments are:\n", 2140 cat("Sample group assignments are:\n",
1056 "\\begin{quote}\n", 2141 "\\begin{quote}\n",
1057 paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"), 2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
1058 "\n\\end{quote}\n\n") 2143 "\n\\end{quote}\n\n")
1059 2144
1060 } else { 2145 } else {
2146
1061 p_value_data_anova_ps <- 2147 p_value_data_anova_ps <-
1062 apply( 2148 apply(
1063 quant_data_imp_qn_log, 2149 quant_data_imp_qn_log,
1064 1, 2150 1,
1065 anova_func, 2151 anova_func,
1066 grouping_factor = sample_factor_levels 2152 grouping_factor = sample_treatment_levels,
2153 one_way_f = one_way_all_categories
1067 ) 2154 )
1068 2155
1069 p_value_data_anova_ps_fdr <- 2156 p_value_data_anova_ps_fdr <-
1070 p.adjust(p_value_data_anova_ps, method = "fdr") 2157 p.adjust(p_value_data_anova_ps, method = "fdr")
1071 p_value_data <- data.frame( 2158 p_value_data <- data.frame(
1079 # output ANOVA file to constructed filename, 2166 # output ANOVA file to constructed filename,
1080 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" 2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt"
1081 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" 2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
1082 2169
1083 # Re-output datasets to include p-values 2170 # Re-output datasets to include p-values
2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
1084 write.table( 2172 write.table(
1085 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp), 2173 cbind(metadata_plus_p, quant_data_imp),
1086 file = imputed_data_filename, 2174 file = imputed_data_filename,
1087 sep = "\t", 2175 sep = "\t",
1088 col.names = TRUE, 2176 col.names = TRUE,
1089 row.names = FALSE, 2177 row.names = FALSE,
1090 quote = FALSE 2178 quote = FALSE
1091 ) 2179 )
1092 2180
1093 write.table( 2181 write.table(
1094 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log), 2182 cbind(metadata_plus_p, quant_data_imp_qn_log),
1095 file = imp_qn_lt_data_filenm, 2183 file = imp_qn_lt_data_filenm,
1096 sep = "\t", 2184 sep = "\t",
1097 col.names = TRUE, 2185 col.names = TRUE,
1098 row.names = FALSE, 2186 row.names = FALSE,
1099 quote = FALSE 2187 quote = FALSE
1133 # <!-- ACE insertion start --> 2221 # <!-- ACE insertion start -->
1134 2222
1135 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { 2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
1136 if (first_page_suppress == 1) { 2224 if (first_page_suppress == 1) {
1137 first_page_suppress <- 0 2225 first_page_suppress <- 0
1138 } 2226 } else {
1139 else {
1140 cat("\\newpage\n") 2227 cat("\\newpage\n")
1141 } 2228 }
1142 cat(sprintf( 2229 if (nrow(filtered_data_filtered) > 1) {
1143 "Intensities for %d peptides whose adjusted p-value < %0.2f:\n", 2230 subsection_header(sprintf(
1144 nrow(filtered_data_filtered), 2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
1145 cutoff 2232 nrow(filtered_data_filtered),
1146 )) 2233 cutoff
2234 ))
2235 } else {
2236 subsection_header(sprintf(
2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
2238 rownames(filtered_data_filtered)[1],
2239 cutoff
2240 ))
2241 }
1147 cat("\n\n\n") 2242 cat("\n\n\n")
1148 cat("\n\n\n") 2243 cat("\n\n\n")
1149 2244
1150 old_oma <- par("oma") 2245 old_oma <- par("oma")
1151 old_par <- par( 2246 old_par <- par(
1156 fin = c(9, 7.25) 2251 fin = c(9, 7.25)
1157 ) 2252 )
1158 # ref: https://r-charts.com/distribution/add-points-boxplot/ 2253 # ref: https://r-charts.com/distribution/add-points-boxplot/
1159 # Vertical plot 2254 # Vertical plot
1160 colnames(filtered_data_filtered) <- sample_name_matches 2255 colnames(filtered_data_filtered) <- sample_name_matches
1161 boxplot( 2256 tryCatch(
1162 filtered_data_filtered, 2257 boxplot(
1163 main = "Imputed, normalized intensities", # no line plot 2258 filtered_data_filtered,
1164 las = 1, 2259 main = "Imputed, normalized intensities", # no line plot
1165 col = const_boxplot_fill, 2260 las = 1,
1166 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") 2261 col = const_boxplot_fill,
2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
2263 ),
2264 error = function(e) print(e)
1167 ) 2265 )
1168 par(old_par) 2266 par(old_par)
1169 } else { 2267 } else {
1170 cat(sprintf( 2268 cat(sprintf(
1171 "%s < %0.2f\n\n\n\n\n", 2269 "%s < %0.2f\n\n\n\n\n",
1173 cutoff 2271 cutoff
1174 )) 2272 ))
1175 } 2273 }
1176 2274
1177 if (nrow(filtered_data_filtered) > 0) { 2275 if (nrow(filtered_data_filtered) > 0) {
1178 #Add Phosphopeptide column to anova_filtered table 2276 # Add Phosphopeptide column to anova_filtered table
1179 anova_filtered_merge <- merge( 2277 # The assumption here is that the first intensity is unique;
2278 # this is a hokey assumption but almost definitely will
2279 # be true in the real world unless there is a computation
2280 # error upstream.
2281 anova_filtered_merge <- base::merge(
1180 x = connect_df, 2282 x = connect_df,
1181 y = filtered_data_filtered, 2283 y = filtered_data_filtered,
1182 by.x = "Intensity", 2284 by.x = "Intensity",
1183 by.y = 1 2285 by.y = 1
1184 ) 2286 )
1185 anova_filtered_merge_order <- rownames(filtered_p) 2287 anova_filtered_merge_order <- rownames(filtered_p)
2288
2289 anova_filtered <- data.frame(
2290 ppep = anova_filtered_merge$Phosphopeptide,
2291 intense = anova_filtered_merge$Intensity,
2292 data = anova_filtered_merge[, 2:number_of_samples + 1]
2293 )
2294 colnames(anova_filtered) <-
2295 c("Phosphopeptide", colnames(filtered_data_filtered))
2296
2297 # Merge qualitative columns into the ANOVA data
2298 output_table <- data.frame(anova_filtered$Phosphopeptide)
2299 output_table <- base::merge(
2300 x = output_table,
2301 y = data_table_imp_qn_lt,
2302 by.x = "anova_filtered.Phosphopeptide",
2303 by.y = "Phosphopeptide"
2304 )
2305
2306 # Produce heatmap to visualize significance and the effect of imputation
1186 2307
1187 anova_filtered_merge_format <- sapply( 2308 anova_filtered_merge_format <- sapply(
1188 X = filtered_p$fdr_adjusted_anova_p 2309 X = filtered_p$fdr_adjusted_anova_p
1189 , 2310 ,
1190 FUN = function(x) { 2311 FUN = function(x) {
1193 else 2314 else
1194 paste0("(%0.4e) %s") 2315 paste0("(%0.4e) %s")
1195 } 2316 }
1196 ) 2317 )
1197 2318
1198 anova_filtered <- data.table( 2319 cat_hm_heading <- function(m, cutoff) {
1199 anova_filtered_merge$Phosphopeptide 2320 cat("\\newpage\n")
1200 , 2321 if (nrow(m) > intensity_hm_rows) {
1201 anova_filtered_merge$Intensity 2322 subsection_header(
1202 , 2323 paste(
1203 anova_filtered_merge[, 2:number_of_samples + 1] 2324 sprintf("Heatmap for the %d most-significant peptides",
1204 ) 2325 intensity_hm_rows),
1205 colnames(anova_filtered) <- 2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff)
1206 c("Phosphopeptide", colnames(filtered_data_filtered)) 2327 )
1207 2328 )
1208 # Merge qualitative columns into the ANOVA data 2329 } else {
1209 output_table <- data.frame(anova_filtered$Phosphopeptide) 2330 if (nrow(m) == 1) {
1210 output_table <- merge( 2331 return(FALSE)
1211 x = output_table, 2332 } else {
1212 y = data_table_imp_qn_lt, 2333 subsection_header(
1213 by.x = "anova_filtered.Phosphopeptide", 2334 paste(
1214 by.y = "Phosphopeptide" 2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)),
1215 ) 2336 sprintf("adjusted p-value < %0.2f\n", cutoff)
1216 2337 )
1217 # Produce heatmap to visualize significance and the effect of imputation 2338 )
2339 }
2340 }
2341 cat("\n\n\n")
2342 cat("\n\n\n")
2343 return(TRUE)
2344 }
2345
2346 # construct matrix with appropriate rownames
1218 m <- 2347 m <-
1219 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) 2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
1220 m_nan_rows <- rowSums(
1221 matrix(
1222 as.integer(is.na(m)),
1223 nrow = nrow(m)
1224 )
1225 )
1226 m <- m[!m_nan_rows, , drop = FALSE]
1227 if (nrow(m) > 0) { 2349 if (nrow(m) > 0) {
1228 rownames_m <- rownames(m) 2350 rownames_m <- rownames(m)
1229 rownames(m) <- sapply( 2351 rownames(m) <- sapply(
1230 X = seq_len(nrow(m)) 2352 X = seq_len(nrow(m))
1231 , 2353 ,
1235 filtered_p$fdr_adjusted_anova_p[i], 2357 filtered_p$fdr_adjusted_anova_p[i],
1236 rownames_m[i] 2358 rownames_m[i]
1237 ) 2359 )
1238 } 2360 }
1239 ) 2361 )
1240 how_many_peptides <- min(50, nrow(m)) 2362 }
1241 number_of_peptides_found <- how_many_peptides 2363 # draw the heading and heatmap
1242 if (nrow(m) > 1) { 2364 if (nrow(m) > 0) {
1243 m_margin <- m[how_many_peptides:1, ] 2365 number_of_peptides_found <-
1244 margins <- 2366 draw_intensity_heatmap(
1245 c(max(nchar(colnames(m_margin))) * 10 / 16 # col 2367 m = m,
1246 , max(nchar(rownames(m_margin))) * 5 / 16 # row 2368 cutoff = cutoff,
1247 ) 2369 hm_heading_function = cat_hm_heading,
1248 } 2370 hm_main_title = "Unimputed, unnormalized log(intensities)",
1249 2371 suppress_row_dendrogram = FALSE
1250 cat("\\newpage\n")
1251 if (nrow(m) > 50) {
1252 cat("Heatmap for the 50 most-significant peptides",
1253 sprintf(
1254 "whose adjusted p-value < %0.2f\n",
1255 cutoff)
1256 )
1257 } else {
1258 if (nrow(m) == 1) {
1259 next
1260 } else {
1261 cat(
1262 sprintf("Heatmap for %d usable peptides whose", nrow(m)),
1263 sprintf("adjusted p-value < %0.2f\n", cutoff)
1264 ) 2372 )
1265 }
1266 }
1267 cat("\n\n\n")
1268 cat("\n\n\n")
1269 try(
1270 if (nrow(m) > 1) {
1271 old_oma <- par("oma")
1272 par(cex.main = 0.6)
1273 heatmap(
1274 m[how_many_peptides:1, ],
1275 Rowv = NA,
1276 Colv = NA,
1277 cexRow = 0.7,
1278 cexCol = 0.8,
1279 scale = "row",
1280 margins = margins,
1281 main =
1282 "Unimputed, unnormalized intensities",
1283 xlab = "",
1284 las = 1 #, fin = c(9, 5.5)
1285 )
1286 }
1287 )
1288 } 2373 }
1289 } 2374 }
1290 } 2375 }
1291 } 2376 }
1292 cat("\\leavevmode\n\n\n") 2377 cat("\\leavevmode\n\n\n")
1293 ``` 2378 ```
2379
2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2381
2382 if (count_of_treatment_levels > 1) {
2383 # Prepare two-way contrasts with adjusted p-values
2384 # Strategy:
2385 # - use imputed, log-transformed data:
2386 # - remember this when computing log2(fold-change)
2387 # - each contrast is between a combination of trt levels
2388 # - for each contrast, compute samples that are members
2389 # - compute one-way test:
2390 # - use `oneway.test` (Welch test) if numbers of samples
2391 # are not equivalent between trt levels
2392 # - otherwise, aov is fine but offers no advantage
2393 # - adjust p-value, assuming that
2394 # (# of pppeps)*(# of contrasts) tests were performed
2395
2396 # Each contrast is between a combination of trt levels
2397 m2 <- combn(
2398 x = seq_len(length(levels(sample_treatment_levels))),
2399 m = 2,
2400 simplify = TRUE
2401 )
2402 contrast_count <- ncol(m2)
2403
2404 # For each contrast, compute samples that are members
2405 # - local function to construct a data.frame for each contrast
2406 # - the contrast in the first "column"
2407 f_m2 <-
2408 function(cntrst, lvl1, lvl2) {
2409 return(
2410 data.frame(
2411 contrast = cntrst,
2412 level = sample_treatment_levels[
2413 sample_treatment_levels %in%
2414 levels(sample_treatment_levels)[c(lvl1, lvl2)]
2415 ],
2416 label = sample_name_matches[
2417 sample_treatment_levels %in%
2418 levels(sample_treatment_levels)[c(lvl1, lvl2)]
2419 ]
2420 )
2421 )
2422 }
2423 # - compute a df for each contrast
2424 sample_level_dfs <- lapply(
2425 X = 1:contrast_count,
2426 FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
2427 )
2428 # - compute a single df for all contrasts
2429 combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)
2430
2431 # - dispose objects to free resources
2432 rm(sample_level_dfs)
2433
2434 # - write the df to a DB for later join-per-contrast
2435 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
2436
2437 RSQLite::dbWriteTable(
2438 conn = db,
2439 name = "contrast",
2440 value = combined_contrast_df,
2441 overwrite = TRUE
2442 )
2443
2444 # Create UK for insert
2445 ddl_exec(db, "
2446 CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
2447 ON contrast(contrast, label);
2448 "
2449 )
2450 # Create indexes for join
2451 ddl_exec(db, "
2452 -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
2453 CREATE INDEX IF NOT EXISTS contrast__label__idx
2454 ON contrast(label);
2455 "
2456 )
2457 ddl_exec(db, "
2458 -- index for joining two contrast_lvl_ppep_avg_quant on contrast
2459 CREATE INDEX IF NOT EXISTS contrast__contrast__idx
2460 ON contrast(contrast);
2461 "
2462 )
2463 ddl_exec(db, "
2464 -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
2465 CREATE INDEX IF NOT EXISTS contrast__level__idx
2466 ON contrast(level);
2467 "
2468 )
2469 # - dispose objects to free resources
2470 rm(combined_contrast_df)
2471
2472 # Use imputed, log-transformed data
2473 # - remember that this was donoe when computing log2(fold-change)
2474 # - melt data matrix for use in later join-per-contrast
2475 casted <- cbind(
2476 data.frame(vrbl = rownames(quant_data_imp_qn_log)),
2477 quant_data_imp_qn_log
2478 )
2479 quant_data_imp_qn_log_melted <- reshape2::melt(
2480 casted,
2481 id.vars = "vrbl"
2482 )
2483 colnames(quant_data_imp_qn_log_melted) <-
2484 c("phosphopep", "sample", "quant")
2485 # - dispose objects to free resources
2486 rm(casted)
2487
2488 # - write the df to a DB for use in later join-per-contrast
2489 RSQLite::dbWriteTable(
2490 conn = db,
2491 name = "ppep_smpl_qnlt",
2492 value = quant_data_imp_qn_log_melted,
2493 overwrite = TRUE
2494 )
2495 # Create UK for insert
2496 ddl_exec(db, "
2497 CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
2498 ON ppep_smpl_qnlt(phosphopep, sample);
2499 "
2500 )
2501 # Create index for join
2502 ddl_exec(db, "
2503 -- index for join in contrast_ppep_smpl_qnlt
2504 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
2505 ON ppep_smpl_qnlt(sample);
2506 "
2507 )
2508 ddl_exec(db, "
2509 -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
2510 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
2511 ON ppep_smpl_qnlt(phosphopep);
2512 "
2513 )
2514 # - dispose objects to free resources
2515 rm(quant_data_imp_qn_log_melted)
2516
2517 # - drop views if exist
2518 ddl_exec(db, "
2519 -- drop view dependent on contrast_lvl_ppep_avg_quant
2520 DROP VIEW IF EXISTS v_contrast_log2_fc;
2521 "
2522 )
2523 ddl_exec(db, "
2524 -- drop table dependent on contrast_ppep_smpl_qnlt
2525 DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
2526 "
2527 )
2528 ddl_exec(db, "
2529 DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
2530 "
2531 )
2532 ddl_exec(db, "
2533 DROP VIEW IF EXISTS v_contrast_lvl_metadata;
2534 "
2535 )
2536 ddl_exec(db, "
2537 -- drop view dependent on contrast_ppep_smpl_qnlt
2538 DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
2539 "
2540 )
2541 ddl_exec(db, "
2542 DROP VIEW IF EXISTS v_contrast_lvl_lvl;
2543 "
2544 )
2545 ddl_exec(db, "
2546 -- drop view upon which other views depend
2547 DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
2548 "
2549 )
2550 # - create view
2551 dml_no_rows_exec(db, "
2552 -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
2553 -- compute p-value for test of trt effect for two trt levels
2554 CREATE VIEW contrast_ppep_smpl_qnlt
2555 AS
2556 SELECT contrast,
2557 level,
2558 phosphopep,
2559 sample,
2560 quant
2561 FROM contrast AS c,
2562 ppep_smpl_qnlt AS q
2563 WHERE q.sample = c.label
2564 ORDER BY contrast, level, phosphopep
2565 ;
2566 "
2567 )
2568 # - create simplification views
2569 dml_no_rows_exec(db, "
2570 CREATE VIEW v_contrast_lvl_metadata
2571 AS
2572 SELECT contrast,
2573 level,
2574 group_concat(label, ';') AS samples
2575 FROM contrast
2576 GROUP BY contrast, level
2577 /* view v_contrast_lvl_metadata is used
2578 to simplify creation of table contrast_lvl_lvl_metadata */
2579 ;
2580 "
2581 )
2582 dml_no_rows_exec(db, "
2583 CREATE VIEW v_contrast_lvl_ppep_avg_quant
2584 AS
2585 SELECT contrast,
2586 level,
2587 phosphopep,
2588 avg(quant) AS avg_quant
2589 FROM contrast_ppep_smpl_qnlt
2590 GROUP BY contrast, level, phosphopep
2591 /* view v_contrast_lvl_ppep_avg_quant is used
2592 to simplify view v_contrast_log2_fc */
2593 ;
2594 "
2595 )
2596
2597 # - create contrast-metadata table
2598 dml_no_rows_exec(db, "
2599 CREATE TABLE contrast_lvl_lvl_metadata
2600 AS
2601 SELECT DISTINCT
2602 a.contrast AS ab_contrast,
2603 a.level AS a_level,
2604 b.level AS b_level,
2605 a.samples AS a_samples,
2606 b.samples AS b_samples,
2607 'log2(level_'||a.level||'/level_'||b.level||')'
2608 AS fc_description
2609 FROM v_contrast_lvl_metadata AS a,
2610 v_contrast_lvl_metadata AS b
2611 WHERE a.contrast = b.contrast
2612 AND a.level > b.level
2613 /* view v_contrast_lvl_lvl is used
2614 to simplify view v_contrast_log2_fc */
2615 ;
2616 "
2617 )
2618 # - create pseudo-materialized view table
2619 dml_no_rows_exec(db, "
2620 CREATE VIEW v_contrast_lvl_lvl
2621 AS
2622 SELECT DISTINCT
2623 a.contrast AS ab_contrast,
2624 a.level AS a_level,
2625 b.level AS b_level
2626 FROM contrast AS a,
2627 contrast AS b
2628 WHERE a.contrast = b.contrast
2629 AND a.level > b.level
2630 /* view v_contrast_lvl_lvl is used
2631 to simplify view v_contrast_log2_fc */
2632 ;
2633 "
2634 )
2635
2636 # - create view to compute log2(fold-change)
2637 dml_no_rows_exec(db, "
2638 CREATE VIEW v_contrast_log2_fc
2639 AS
2640 SELECT ab.ab_contrast AS contrast,
2641 m.a_level AS a_level,
2642 c.avg_quant AS a_quant,
2643 m.a_samples AS a_samples,
2644 ab.b_level AS b_level,
2645 d.avg_quant AS b_quant,
2646 m.b_samples AS b_samples,
2647 m.fc_description AS fc_description,
2648 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
2649 d.phosphopep AS phosphopep
2650 FROM contrast_lvl_lvl_metadata AS m,
2651 v_contrast_lvl_ppep_avg_quant AS d,
2652 v_contrast_lvl_lvl AS ab
2653 INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
2654 ON c.contrast = ab.ab_contrast
2655 AND c.level = ab.a_level
2656 WHERE d.contrast = ab.ab_contrast
2657 AND m.ab_contrast = ab.ab_contrast
2658 AND d.level = ab.b_level
2659 AND d.phosphopep = c.phosphopep
2660 /* view to compute log2(fold-change) */
2661 ;
2662 "
2663 )
2664
2665 # For each contrast, compute samples that are members
2666 # compute one-way test:
2667 # - use `oneway.test` (Welch test) if numbers of samples
2668 # are not equivalent between trt levels
2669 # - otherwise, aov is fine but offers no advantage
2670 for (contrast in contrast_count:2) {
2671 invisible(contrast)
2672 }
2673 for (contrast in 1:contrast_count) {
2674 contrast_df <- sqldf::sqldf(
2675 x = paste0("
2676 SELECT level, phosphopep, sample, quant
2677 FROM contrast_ppep_smpl_qnlt
2678 WHERE contrast = ", contrast, "
2679 ORDER BY phosphopep, level, sample
2680 "),
2681 connection = db
2682 )
2683 contrast_cast <- reshape2::dcast(
2684 data = contrast_df,
2685 formula = phosphopep ~ sample,
2686 value.var = "quant"
2687 )
2688 contrast_cast_ncol <- ncol(contrast_cast)
2689 contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
2690 contrast_cast_samples <- colnames(contrast_cast_data)
2691
2692 # - order grouping_factor by order of sample columns of contrast_cast_data
2693 grouping_factor <- sqldf::sqldf(
2694 x = paste0("
2695 SELECT sample, level
2696 FROM contrast_ppep_smpl_qnlt
2697 WHERE contrast = ", contrast, "
2698 ORDER BY phosphopep, level, sample
2699 LIMIT ", contrast_cast_ncol - 1
2700 ),
2701 connection = db
2702 )
2703 rownames(grouping_factor) <- grouping_factor$sample
2704 grouping_factor <- grouping_factor[, "level", drop = FALSE]
2705
2706 # - run the two-level (one-way) test
2707 p_value_data_contrast_ps <-
2708 apply(
2709 X = contrast_cast_data,
2710 MARGIN = 1, # apply to rows
2711 FUN = anova_func,
2712 grouping_factor =
2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
2714 one_way_f = one_way_two_categories, # anova_func arg3
2715 simplify = TRUE # TRUE is the default for simplify
2716 )
2717 contrast_data_adj_p_values <- p.adjust(
2718 p = p_value_data_contrast_ps,
2719 method = "fdr",
2720 n = length(p_value_data_contrast_ps) # this is the default, length(p)
2721 )
2722 # - compute the fold-change
2723 contrast_p_df <-
2724 data.frame(
2725 contrast = contrast,
2726 phosphopep = contrast_cast$phosphopep,
2727 p_value_raw = p_value_data_contrast_ps,
2728 p_value_adj = contrast_data_adj_p_values
2729 )
2730 db_write_table_overwrite <- (contrast < 2)
2731 db_write_table_append <- !db_write_table_overwrite
2732 RSQLite::dbWriteTable(
2733 conn = db,
2734 name = "contrast_ppep_p_val",
2735 value = contrast_p_df,
2736 append = db_write_table_append
2737 )
2738 # Create UK for insert
2739 ddl_exec(db, "
2740 CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
2741 ON contrast_ppep_p_val(phosphopep, contrast);
2742 "
2743 )
2744 }
2745 # Perhaps this could be done more elegantly using unique keys
2746 # or creating the tables before saving data to them, but this
2747 # is fast and, if the database exists on disk rather than in
2748 # memory, it doesn't stress memory.
2749 dml_no_rows_exec(db, "
2750 CREATE TEMP table contrast_log2_fc
2751 AS
2752 SELECT *
2753 FROM v_contrast_log2_fc
2754 ORDER BY contrast, phosphopep
2755 ;
2756 "
2757 )
2758 dml_no_rows_exec(db, "
2759 CREATE TEMP table ppep_p_val
2760 AS
2761 SELECT p_value_raw,
2762 p_value_adj,
2763 contrast AS p_val_contrast,
2764 phosphopep AS p_val_ppep
2765 FROM contrast_ppep_p_val
2766 ORDER BY contrast, phosphopep
2767 ;
2768 "
2769 )
2770 dml_no_rows_exec(db, "
2771 DROP TABLE IF EXISTS contrast_log2_fc_p_val
2772 ;
2773 "
2774 )
2775 dml_no_rows_exec(db, "
2776 CREATE TABLE contrast_log2_fc_p_val
2777 AS
2778 SELECT a.*,
2779 b.p_value_raw,
2780 b.p_value_adj,
2781 b.p_val_contrast,
2782 b.p_val_ppep
2783 FROM contrast_log2_fc a, ppep_p_val b
2784 WHERE a.rowid = b.rowid
2785 AND a.phosphopep = b.p_val_ppep
2786 ;
2787 "
2788 )
2789 # Create UK
2790 ddl_exec(db, "
2791 CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
2792 ON contrast_log2_fc_p_val(phosphopep, contrast);
2793 "
2794 )
2795 # Create indices for future queries
2796 ddl_exec(db, "
2797 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
2798 ON contrast_log2_fc_p_val(contrast);
2799 "
2800 )
2801 ddl_exec(db, "
2802 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
2803 ON contrast_log2_fc_p_val(phosphopep);
2804 "
2805 )
2806 ddl_exec(db, "
2807 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
2808 ON contrast_log2_fc_p_val(p_value_raw);
2809 "
2810 )
2811 ddl_exec(db, "
2812 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
2813 ON contrast_log2_fc_p_val(p_value_adj);
2814 "
2815 )
2816 dml_no_rows_exec(db, "
2817 DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
2818 ;
2819 "
2820 )
2821 dml_no_rows_exec(db, "
2822 CREATE VIEW v_contrast_log2_fc_p_val
2823 AS
2824 SELECT contrast,
2825 a_level,
2826 a_samples,
2827 b_level,
2828 b_samples,
2829 a_quant,
2830 b_quant,
2831 fc_description,
2832 log2_fc,
2833 p_value_raw,
2834 p_value_adj,
2835 phosphopep
2836 FROM contrast_log2_fc_p_val
2837 ORDER BY contrast, phosphopep
2838 ;
2839 "
2840 )
2841 ddl_exec(db, "
2842 DROP TABLE IF EXISTS kseaapp_metadata
2843 ;
2844 "
2845 )
2846 dml_no_rows_exec(db, "
2847 CREATE TABLE kseaapp_metadata
2848 AS
2849 WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
2850 SELECT DISTINCT
2851 deppep.seq,
2852 ppep.seq,
2853 GeneName||';',
2854 UniProtID||';',
2855 PhosphoResidue||';'
2856 FROM
2857 ppep, deppep, mrgfltr_metadata
2858 WHERE
2859 mrgfltr_metadata.ppep_id = ppep.id
2860 AND
2861 ppep.deppep_id = deppep.id
2862 )
2863 SELECT
2864 ppep AS `ppep`,
2865 SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
2866 SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`,
2867 deppep AS `Peptide`,
2868 REPLACE(
2869 REPLACE(
2870 SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
2871 'p',
2872 ''
2873 ),
2874 ', ',
2875 ';'
2876 ) AS `Residue.Both`
2877 FROM extended
2878 ;
2879 "
2880 )
2881 # Create indexes for join
2882 ddl_exec(db, "
2883 CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
2884 ON kseaapp_metadata(ppep);
2885 "
2886 )
2887 ddl_exec(db, "
2888 DROP VIEW IF EXISTS v_kseaapp_contrast
2889 ;
2890 "
2891 )
2892 dml_no_rows_exec(db, "
2893 CREATE VIEW v_kseaapp_contrast
2894 AS
2895 SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
2896 FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b
2897 WHERE b.ppep = a.phosphopep
2898 ;
2899 "
2900 )
2901 ddl_exec(db, "
2902 DROP VIEW IF EXISTS v_kseaapp_input
2903 ;
2904 "
2905 )
2906 dml_no_rows_exec(db, "
2907 CREATE VIEW v_kseaapp_input
2908 AS
2909 SELECT v.contrast,
2910 v.phosphopep,
2911 m.`Protein`,
2912 m.`Gene`,
2913 m.`Peptide`,
2914 m.`Residue.Both`,
2915 v.p_value_raw AS `p`,
2916 v.log2_fc AS `FC`
2917 FROM kseaapp_metadata AS m,
2918 v_contrast_log2_fc_p_val AS v
2919 WHERE m.ppep = v.phosphopep
2920 AND NOT m.`Gene` = 'No_Gene_Name'
2921 AND NOT v.log2_fc = 0
2922 ;
2923 "
2924 )
2925 }
2926 ```
2927
2928 ```{r echo = FALSE, results = 'asis'}
2929 cat("\\newpage\n")
2930 ```
2931
2932 # KSEA Analysis
2933
2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
2935
2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
2939
2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
2941
2942 $$
2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
2944 $$
2945
2946 and fold-enrichment is computed as:
2947
2948 $$
2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
2950 $$
2951
2952 where:
2953
2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
2959
2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
2961
2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
2963
2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
2965
2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
2969
2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
2971
2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
2973
2974 # -- eliminate the table that's about to be defined
2975 ddl_exec(db, "
2976 DROP TABLE IF EXISTS site_metadata;
2977 ")
2978
2979 # -- define the site_metadata table
2980 ddl_exec(db, "
2981 CREATE TABLE site_metadata(
2982 id INTEGER PRIMARY KEY
2983 , site_type_id INTEGER REFERENCES site_type(id)
2984 , full TEXT UNIQUE ON CONFLICT IGNORE
2985 , abbrev TEXT
2986 , pattern TEXT
2987 , motif TEXT
2988 );
2989 ")
2990
2991 # -- populate the table with initial values
2992 ddl_exec(db, "
2993 INSERT INTO site_metadata(full, abbrev, site_type_id)
2994 SELECT DISTINCT kinase_map, kinase_map, site_type_id
2995 FROM ppep_gene_site
2996 ORDER BY kinase_map;
2997 ")
2998
2999 # -- drop bogus KSData view if exists
3000 ddl_exec(db, "
3001 DROP VIEW IF EXISTS ks_data_v;
3002 ")
3003
3004 # -- create view to serve as an impostor for KSEAapp::KSData
3005 ddl_exec(db, "
3006 CREATE VIEW IF NOT EXISTS ks_data_v
3007 AS
3008 SELECT
3009 'NA' AS KINASE,
3010 'NA' AS KIN_ACC_ID,
3011 kinase_map AS GENE,
3012 'NA' AS KIN_ORGANISM,
3013 'NA' AS SUBSTRATE,
3014 0 AS SUB_GENE_ID,
3015 'NA' AS SUB_ACC_ID,
3016 gene_names AS SUB_GENE,
3017 'NA' AS SUB_ORGANISM,
3018 phospho_peptide AS SUB_MOD_RSD,
3019 0 AS SITE_GROUP_ID,
3020 'NA' AS 'SITE_7AA',
3021 2 AS networkin_score,
3022 type_name AS Source
3023 FROM ppep_gene_site_view;
3024 ")
3025
3026 contrast_metadata_df <-
3027 sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
3028 rslt <- new_env()
3029 rslt$score_list <- list()
3030 rslt$name_list <- list()
3031 rslt$longname_list <- list()
3032
3033 ddl_exec(db, "
3034 DROP TABLE IF EXISTS contrast_ksea_scores;
3035 "
3036 )
3037
3038 next_index <- 0
3039 err_na_subscr_df_const <-
3040 "missing values are not allowed in subscripted assignments of data frames"
3041
3042 for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
3043 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
3047 contrast_longlabel <- (
3048 sprintf(
3049 "Trt %s {%s} -> Trt %s {%s}",
3050 contrast_metadata_df[i_cntrst, "b_level"],
3051 gsub(
3052 pattern = ";",
3053 replacement = ", ",
3054 x = contrast_metadata_df[i_cntrst, "b_samples"],
3055 fixed = TRUE
3056 ),
3057 contrast_metadata_df[i_cntrst, "a_level"],
3058 gsub(
3059 pattern = ";",
3060 replacement = ", ",
3061 x = contrast_metadata_df[i_cntrst, "a_samples"],
3062 fixed = TRUE
3063 )
3064 )
3065 )
3066 kseaapp_input <-
3067 sqldf::sqldf(
3068 x = sprintf("
3069 SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC`
3070 FROM v_kseaapp_input
3071 WHERE contrast = %d
3072 ",
3073 i_cntrst
3074 ),
3075 connection = db
3076 )
3077
3078 pseudo_ksdata <- dbReadTable(db, "ks_data_v")
3079
3080 # This hack is because SQL table has the log2-transformed values
3081 kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
3082 main_title <- (
3083 sprintf(
3084 "Change from treatment %s to treatment %s",
3085 contrast_metadata_df[i_cntrst, "b_level"],
3086 contrast_metadata_df[i_cntrst, "a_level"]
3087 )
3088 )
3089 sub_title <- contrast_longlabel
3090 tryCatch(
3091 expr = {
3092 ksea_scores_rslt <-
3093 ksea_scores(
3094 ksdata = pseudo_ksdata, # KSEAapp::KSData,
3095 px = kseaapp_input,
3096 networkin = TRUE,
3097 networkin_cutoff = 2
3098 )
3099
3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3101 next_index <- 1 + next_index
3102 rslt$score_list[[next_index]] <- ksea_scores_rslt
3103 rslt$name_list[[next_index]] <- contrast_label
3104 rslt$longname_list[[next_index]] <- contrast_longlabel
3105 low_fdr_print(
3106 rslt = rslt,
3107 i_cntrst = i_cntrst,
3108 i = next_index,
3109 a_level = cntrst_a_level,
3110 b_level = cntrst_b_level,
3111 fold_change = cntrst_fold_change,
3112 caption = contrast_longlabel
3113 )
3114 }
3115 },
3116 error = function(e) str(e)
3117 )
3118 }
3119
3120 plotted_kinases <- NULL
3121 if (length(rslt$score_list) > 1) {
3122 for (i in seq_len(length(ksea_heatmap_titles))) {
3123 hdr <- ksea_heatmap_titles[[i]]
3124 which_kinases <- i
3125
3126 cat("\\clearpage\n\\begin{center}\n")
3127 if (i == const_ksea_astrsk_kinases) {
3128 subsection_header(hdr)
3129 } else {
3130 subsection_header(hdr)
3131 }
3132 cat("\\end{center}\n")
3133
3134 plotted_kinases <- ksea_heatmap(
3135 # the data frame outputs from the KSEA.Scores() function, in list format
3136 score_list = rslt$score_list,
3137 # a character vector of all the sample names for heatmap annotation:
3138 # - the names must be in the same order as the data in score_list
3139 # - please avoid long names, as they may get cropped in the final image
3140 sample_labels = rslt$name_list,
3141 # character string of either "p.value" or "FDR" indicating the data column
3142 # to use for marking statistically significant scores
3143 stats = c("p.value", "FDR")[2],
3144 # a numeric value between 0 and infinity indicating the min. number of
3145 # substrates a kinase must have to be included in the heatmap
3146 m_cutoff = 1,
3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
3148 # for indicating significant kinases in the heatmap
3149 p_cutoff = 0.05,
3150 # a binary input of TRUE or FALSE, indicating whether or not to perform
3151 # hierarchical clustering of the sample columns
3152 sample_cluster = TRUE,
3153 # a binary input of TRUE or FALSE, indicating whether or not to export
3154 # the heatmap as a .png image into the working directory
3155 export = FALSE,
3156 # additional arguments to gplots::heatmap.2, such as:
3157 # - main: main title of plot
3158 # - xlab: x-axis label
3159 # - ylab: y-axis label
3160 xlab = "Contrast",
3161 ylab = "Kinase",
3162 # print which kinases:
3163 # - 1 : all kinases
3164 # - 2 : significant kinases
3165 # - 3 : non-significant kinases
3166 which_kinases = which_kinases
3167 )
3168 cat("\\begin{center}\n")
3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n")
3170 cat("\\end{center}\n")
3171 } # end for (i in ...
3172 } # end if (length ...
3173
3174 for (i_cntrst in seq_len(length(rslt$score_list))) {
3175 next_index <- i_cntrst
3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
3180 contrast_longlabel <- (
3181 sprintf(
3182 "Trt %s {%s} -> Trt %s {%s}",
3183 contrast_metadata_df[i_cntrst, "b_level"],
3184 gsub(
3185 pattern = ";",
3186 replacement = ", ",
3187 x = contrast_metadata_df[i_cntrst, "b_samples"],
3188 fixed = TRUE
3189 ),
3190 contrast_metadata_df[i_cntrst, "a_level"],
3191 gsub(
3192 pattern = ";",
3193 replacement = ", ",
3194 x = contrast_metadata_df[i_cntrst, "a_samples"],
3195 fixed = TRUE
3196 )
3197 )
3198 )
3199 main_title <- (
3200 sprintf(
3201 "Change from treatment %s to treatment %s",
3202 contrast_metadata_df[i_cntrst, "b_level"],
3203 contrast_metadata_df[i_cntrst, "a_level"]
3204 )
3205 )
3206 sub_title <- contrast_longlabel
3207 tryCatch(
3208 expr = {
3209 ksea_scores_rslt <- rslt$score_list[[next_index]]
3210
3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
3212 low_fdr_barplot(
3213 rslt = rslt,
3214 i_cntrst = i_cntrst,
3215 i = next_index,
3216 a_level = cntrst_a_level,
3217 b_level = cntrst_b_level,
3218 fold_change = cntrst_fold_change,
3219 caption = contrast_longlabel
3220 )
3221 }
3222 },
3223 error = function(e) str(e)
3224 )
3225 }
3226 ```
3227
3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3229
3230 # Use enriched kinases to find enriched kinase-substrate pairs
3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
3232 all_enriched_substrates <- sqldf("
3233 SELECT
3234 gene AS kinase,
3235 ppep,
3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
3237 FROM (
3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
3239 FROM pseudo_ksdata
3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3241 )
3242 GROUP BY ppep
3243 ")
3244
3245 # helper used to label per-kinase substrate enrichment figure
3246 cat_enriched_heading <- function(m, cut_args) {
3247 cutoff <- cut_args$cutoff
3248 kinase <- cut_args$kinase
3249 statistic <- cut_args$statistic
3250 threshold <- cut_args$threshold
3251 cat("\\newpage\n")
3252 if (nrow(m) > intensity_hm_rows) {
3253 subsection_header(
3254 paste(
3255 sprintf(
3256 "Lowest p-valued %d (of %d) enriched %s-substrates,",
3257 intensity_hm_rows,
3258 nrow(m),
3259 kinase
3260 ),
3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
3262 )
3263 )
3264 } else {
3265 if (nrow(m) == 1) {
3266 return(FALSE)
3267 } else {
3268 subsection_header(
3269 paste(
3270 sprintf(
3271 "%d enriched %s-substrates,",
3272 nrow(m),
3273 kinase
3274 ),
3275 sprintf(
3276 " KSEA %s < %0.2f\n",
3277 statistic,
3278 threshold
3279 )
3280 )
3281 )
3282 }
3283 }
3284 cat("\n\n\n")
3285 cat("\n\n\n")
3286 return(TRUE)
3287 }
3288
3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether
3290 if (FALSE)
3291 for (kinase_name in sort(enriched_kinases$kinase)) {
3292 enriched_substrates <-
3293 all_enriched_substrates[
3294 all_enriched_substrates$kinase == kinase_name,
3295 ,
3296 drop = FALSE
3297 ]
3298 # Get the intensity values for the heatmap
3299 enriched_intensities <-
3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
3301 # Remove rows having too many NA values to be relevant
3302 na_counter <- is.na(enriched_intensities)
3303 na_counts <- apply(na_counter, 1, sum)
3304 enriched_intensities <-
3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
3306 # Rename the rows with the display-name for the heatmap
3307 rownames(enriched_intensities) <-
3308 sapply(
3309 X = rownames(enriched_intensities),
3310 FUN = function(rn) {
3311 enriched_substrates[enriched_substrates$ppep == rn, "label"]
3312 }
3313 )
3314 # Format as matrix for heatmap
3315 m <- as.matrix(enriched_intensities)
3316 # Draw the heading and heatmap
3317 if (nrow(m) > 0) {
3318 cut_args <- new_env()
3319 cut_args$cutoff <- cutoff
3320 cut_args$kinase <- kinase_name
3321 cut_args$statistic <- ksea_cutoff_statistic
3322 cut_args$threshold <- ksea_cutoff_threshold
3323 number_of_peptides_found <-
3324 draw_intensity_heatmap(
3325 m = m,
3326 cutoff = cut_args,
3327 hm_heading_function = cat_enriched_heading,
3328 hm_main_title
3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
3330 suppress_row_dendrogram = FALSE
3331 )
3332 }
3333 }
3334
3335 # Write output tabular files
3336
3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases
3338
3339 kinase_ppep_label <- sqldf("
3340 WITH
3341 t(ppep, label) AS
3342 (
3343 SELECT DISTINCT
3344 SUB_MOD_RSD AS ppep,
3345 group_concat(gene, '; ') AS label
3346 FROM pseudo_ksdata
3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3348 GROUP BY ppep
3349 ),
3350 k(kinase, ppep_join) AS
3351 (
3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
3353 FROM pseudo_ksdata
3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3355 )
3356 SELECT k.kinase, t.ppep, t.label
3357 FROM t, k
3358 WHERE t.ppep = k.ppep_join
3359 ORDER BY k.kinase, t.ppep
3360 ")
3361
3362 # extract what we need from full_data
3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp)
3364 colnames(impish)[1] <- "Phosphopeptide"
3365 data_table_imputed_sql <- "
3366 SELECT
3367 f.*,
3368 k.label AS KSEA_enrichments,
3369 q.*
3370 FROM
3371 metadata_plus_p f
3372 LEFT JOIN kinase_ppep_label k
3373 ON f.Phosphopeptide = k.ppep,
3374 impish q
3375 WHERE
3376 f.Phosphopeptide = q.Phosphopeptide
3377 "
3378 data_table_imputed <- sqldf(data_table_imputed_sql)
3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3380 data_table_imputed <-
3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3382
3383 # Output with imputed, un-normalized data
3384
3385 write.table(
3386 data_table_imputed
3387 , file = imputed_data_filename
3388 , sep = "\t"
3389 , col.names = TRUE
3390 , row.names = FALSE
3391 , quote = FALSE
3392 )
3393
3394
3395 #output quantile normalized data
3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
3397 colnames(impish)[1] <- "Phosphopeptide"
3398 data_table_imputed <- sqldf(data_table_imputed_sql)
3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep'
3400 data_table_imputed <-
3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
3402 write.table(
3403 data_table_imputed,
3404 file = imp_qn_lt_data_filenm,
3405 sep = "\t",
3406 col.names = TRUE,
3407 row.names = FALSE,
3408 quote = FALSE
3409 )
3410
3411 ppep_kinase <- sqldf("
3412 SELECT DISTINCT k.ppep, k.kinase
3413 FROM (
3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
3415 FROM pseudo_ksdata
3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases)
3417 ) k
3418 ORDER BY k.ppep, k.kinase
3419 ")
3420
3421 RSQLite::dbWriteTable(
3422 conn = db,
3423 name = "ksea_enriched_ks",
3424 value = ppep_kinase,
3425 append = FALSE
3426 )
3427
3428 RSQLite::dbWriteTable(
3429 conn = db,
3430 name = "anova_signif",
3431 value = p_value_data,
3432 append = FALSE
3433 )
3434
3435 ddl_exec(db, "
3436 DROP VIEW IF EXISTS stats_metadata_v;
3437 "
3438 )
3439 dml_no_rows_exec(db, "
3440 CREATE VIEW stats_metadata_v
3441 AS
3442 SELECT DISTINCT m.*,
3443 p.raw_anova_p,
3444 p.fdr_adjusted_anova_p,
3445 kek.kinase AS ksea_enrichments
3446 FROM
3447 mrgfltr_metadata_view m
3448 LEFT JOIN anova_signif p
3449 ON m.phospho_peptide = p.phosphopeptide
3450 LEFT JOIN ksea_enriched_ks kek
3451 ON m.phospho_peptide = kek.ppep
3452 ;
3453 "
3454 )
3455
3456 write.table(
3457 dbReadTable(db, "stats_metadata_v"),
3458 file = anova_ksea_mtdt_file,
3459 sep = "\t",
3460 col.names = TRUE,
3461 row.names = FALSE,
3462 quote = FALSE
3463 )
3464
3465
3466 ```
3467
3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
3469 cat("\\leavevmode\n\n\n")
3470
3471 # write parameters to report
3472
3473 param_unlist <- unlist(as.list(params))
3474 param_df <- data.frame(
3475 parameter = paste0("\\verb@", names(param_unlist), "@"),
3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
3477 )
3478
3479 data_frame_latex(
3480 x = param_df,
3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
3482 centered = TRUE,
3483 caption = "Input parameters",
3484 anchor = const_table_anchor_bp,
3485 underscore_whack = FALSE
3486 )
3487
3488 # write parameters to SQLite output
3489
3490 mqppep_anova_script_param_df <- data.frame(
3491 script = "mqppep_anova_script.Rmd",
3492 parameter = names(param_unlist),
3493 value = param_unlist
3494 )
3495 ddl_exec(db, "
3496 DROP TABLE IF EXISTS script_parameter;
3497 "
3498 )
3499 ddl_exec(db, "
3500 CREATE TABLE IF NOT EXISTS script_parameter(
3501 script TEXT,
3502 parameter TEXT,
3503 value ANY,
3504 UNIQUE (script, parameter) ON CONFLICT REPLACE
3505 )
3506 ;
3507 "
3508 )
3509 RSQLite::dbWriteTable(
3510 conn = db,
3511 name = "script_parameter",
3512 value = mqppep_anova_script_param_df,
3513 append = TRUE
3514 )
3515
3516 # We are done with output
3517 RSQLite::dbDisconnect(db)
3518 ```
3519 <!--
3520 There's gotta be a better way...
3521
3522 loaded_packages_df <- sessioninfo::package_info("loaded")
3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library)
3524 loaded_packages_df <- data.frame(
3525 package = loaded_packages_df$package,
3526 version = loaded_packages_df$loadedversion,
3527 date = loaded_packages_df$date
3528 )
3529 data_frame_latex(
3530 x = loaded_packages_df,
3531 justification = "l | l l",
3532 centered = FALSE,
3533 caption = "Loaded R packages",
3534 anchor = const_table_anchor_bp
3535 )
3536 -->