Mercurial > repos > galaxyp > mqppep_anova
comparison mqppep_anova_script.Rmd @ 0:d9b68bedbc91 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 3a7b3609d6e514c9e8f980ecb684960c6b2252fe
| author | galaxyp |
|---|---|
| date | Mon, 11 Jul 2022 19:20:41 +0000 |
| parents | |
| children | 2276e88d5a1f |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d9b68bedbc91 |
|---|---|
| 1 --- | |
| 2 title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA" | |
| 3 author: | |
| 4 - "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]" | |
| 5 - "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]" | |
| 6 - "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]" | |
| 7 date: | |
| 8 - "May 28, 2018" | |
| 9 - "; revised June 23, 2022" | |
| 10 output: | |
| 11 pdf_document: | |
| 12 toc: true | |
| 13 toc_depth: 3 | |
| 14 keep_tex: true | |
| 15 header-includes: | |
| 16 - \usepackage{longtable} | |
| 17 - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut | |
| 18 - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut | |
| 19 params: | |
| 20 alphaFile: "test-data/alpha_levels.tabular" | |
| 21 inputFile: "test-data/test_input_for_anova.tabular" | |
| 22 preprocDb: "test-data/test_input_for_anova.sqlite" | |
| 23 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 24 show_toc: true | |
| 25 firstDataColumn: "^Intensity[^_]" | |
| 26 imputationMethod: !r c("group-median", "median", "mean", "random")[1] | |
| 27 meanPercentile: 1 | |
| 28 sdPercentile: 1.0 | |
| 29 regexSampleNames: "\\.\\d+[A-Z]$" | |
| 30 regexSampleGrouping: "\\d+" | |
| 31 imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" | |
| 32 imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" | |
| 33 anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt" | |
| 34 oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1] | |
| 35 oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3] | |
| 36 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | |
| 37 kseaCutoffThreshold: !r c( 0.1, 0.05)[2] | |
| 38 kseaMinKinaseCount: 1 | |
| 39 intensityHeatmapRows: 75 | |
| 40 --- | |
| 41 <!-- | |
| 42 kseaCutoffStatistic: !r c("p.value", "FDR")[2] | |
| 43 kseaCutoffThreshold: !r c(0.05, 0.1)[1] | |
| 44 | |
| 45 alphaFile: "test-data/alpha_levels.tabular" | |
| 46 inputFile: "test-data/test_input_for_anova.tabular" | |
| 47 preprocDb: "test-data/test_input_for_anova.sqlite" | |
| 48 kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2] | |
| 49 | |
| 50 alphaFile: "test-data/alpha_levels.tabular" | |
| 51 inputFile: "test-data/UT_phospho_ST_sites.preproc.tabular" | |
| 52 preprocDb: "test-data/UT_phospho_ST_sites.preproc.sqlite" | |
| 53 kseaAppPrepDb: !r c(":memory:", "test-data/UT_phospho_ST_sites.ksea.sqlite")[2] | |
| 54 | |
| 55 alphaFile: "test-data/alpha_levels.tabular" | |
| 56 inputFile: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.tabular" | |
| 57 preprocDb: "test-data/pY_Sites_NancyDu.txt.ppep_intensities.ppep_map.preproc.sqlite" | |
| 58 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
| 59 | |
| 60 alphaFile: "test-data/alpha_levels.tabular" | |
| 61 inputFile: "test-data/pST_Sites_NancyDu.txt.preproc.tabular" | |
| 62 preprocDb: "test-data/pST_Sites_NancyDu.txt.preproc.sqlite" | |
| 63 kseaAppPrepDb: !r c(":memory:", "test-data/pST_Sites_NancyDu.ksea.sqlite")[2] | |
| 64 | |
| 65 inputFile: "test-data/density_failure.preproc_tab.tabular" | |
| 66 kseaAppPrepDb: !r c(":memory:", "mqppep.sqlite")[2] | |
| 67 latex_document: default | |
| 68 --> | |
| 69 ```{r setup, include = FALSE} | |
| 70 #ref for debugging: https://yihui.org/tinytex/r/#debugging | |
| 71 options(tinytex.verbose = TRUE) | |
| 72 | |
| 73 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | |
| 74 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
| 75 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) | |
| 76 | |
| 77 # freeze the random number generator so the same results will be produced | |
| 78 # from run to run | |
| 79 set.seed(28571) | |
| 80 | |
| 81 ### LIBRARIES | |
| 82 library(gplots) | |
| 83 library(DBI) | |
| 84 library(RSQLite) | |
| 85 # Suppress "Warning: no DISPLAY variable so Tk is not available" | |
| 86 suppressWarnings(suppressMessages(library(sqldf))) | |
| 87 | |
| 88 # required but not added to search list: | |
| 89 # - DBI | |
| 90 # - RSQLite | |
| 91 # - ggplot2 | |
| 92 # - knitr | |
| 93 # - latex2exp | |
| 94 # - preprocessCore | |
| 95 # - reshape2 | |
| 96 # - vioplot | |
| 97 | |
| 98 ### CONSTANTS | |
| 99 | |
| 100 const_parfin <- par("fin") | |
| 101 const_boxplot_fill <- "grey94" | |
| 102 const_stripchart_cex <- 0.5 | |
| 103 const_stripsmall_cex <- | |
| 104 sqrt(const_stripchart_cex * const_stripchart_cex / 2) | |
| 105 const_stripchart_jitter <- 0.3 | |
| 106 const_write_debug_files <- FALSE | |
| 107 const_table_anchor_bp <- "bp" | |
| 108 const_table_anchor_ht <- "ht" | |
| 109 const_table_anchor_p <- "p" | |
| 110 const_table_anchor_tbp <- "tbp" | |
| 111 | |
| 112 | |
| 113 const_ksea_astrsk_kinases <- 1 | |
| 114 const_ksea_nonastrsk_kinases <- 2 | |
| 115 const_ksea_all_kinases <- 3 | |
| 116 | |
| 117 const_log10_e <- log10(exp(1)) | |
| 118 | |
| 119 ### FUNCTIONS | |
| 120 | |
| 121 # from `demo(error.catching)` | |
| 122 ##' Catch *and* save both errors and warnings, and in the case of | |
| 123 ##' a warning, also keep the computed result. | |
| 124 ##' | |
| 125 ##' @title tryCatch both warnings (with value) and errors | |
| 126 ##' @param expr an \R expression to evaluate | |
| 127 ##' @return a list with 'value' and 'warning', where | |
| 128 ##' 'value' may be an error caught. | |
| 129 ##' @author Martin Maechler; | |
| 130 ##' Copyright (C) 2010-2012 The R Core Team | |
| 131 try_catch_w_e <- function(expr) { | |
| 132 wrn <- NULL | |
| 133 # warning handler | |
| 134 w_handler <- function(w) { | |
| 135 wrn <<- w | |
| 136 invokeRestart("muffleWarning") | |
| 137 } | |
| 138 list( | |
| 139 value = withCallingHandlers( | |
| 140 tryCatch( | |
| 141 expr, | |
| 142 error = function(e) e | |
| 143 ), | |
| 144 warning = w_handler | |
| 145 ), | |
| 146 warning = wrn | |
| 147 ) | |
| 148 } | |
| 149 | |
| 150 | |
| 151 write_debug_file <- function(s) { | |
| 152 if (const_write_debug_files) { | |
| 153 s_path <- sprintf("test-data/%s.txt", deparse(substitute(s))) | |
| 154 print(sprintf("DEBUG writing file %s", spath)) | |
| 155 write.table( | |
| 156 s, | |
| 157 file = s_path, | |
| 158 sep = "\t", | |
| 159 col.names = TRUE, | |
| 160 row.names = TRUE, | |
| 161 quote = FALSE | |
| 162 ) | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 # ref: http://adv-r.had.co.nz/Environments.html | |
| 167 # "When creating your own environment, note that you should set its parent | |
| 168 # environment to be the empty environment. This ensures you don't | |
| 169 # accidentally inherit objects from somewhere else." | |
| 170 # Caution: this prevents `with(my_env, expr)` from working when `expr` | |
| 171 # contains anything from the global environment, even operators! | |
| 172 # Hence, `x <- 1; get("x", new_env())` fails by design. | |
| 173 new_env <- function() { | |
| 174 new.env(parent = emptyenv()) | |
| 175 } | |
| 176 | |
| 177 ### numerical/statistical helper functions | |
| 178 | |
| 179 any_nan <- function(x) { | |
| 180 !any(x == "NaN") | |
| 181 } | |
| 182 | |
| 183 # determine standard deviation of quantile to impute | |
| 184 sd_finite <- function(x) { | |
| 185 ok <- is.finite(x) | |
| 186 sd(x[ok]) | |
| 187 } | |
| 188 | |
| 189 anova_func <- function(x, grouping_factor, one_way_f) { | |
| 190 subject <- data.frame( | |
| 191 intensity = x | |
| 192 ) | |
| 193 x_aov <- | |
| 194 one_way_f( | |
| 195 formula = intensity ~ grouping_factor, | |
| 196 data = subject | |
| 197 ) | |
| 198 pvalue <- | |
| 199 if (identical(one_way_f, aov)) | |
| 200 summary(x_aov)[[1]][["Pr(>F)"]][1] | |
| 201 else | |
| 202 pvalue <- x_aov$p.value | |
| 203 pvalue | |
| 204 } | |
| 205 | |
| 206 | |
| 207 ### LaTeX functions | |
| 208 | |
| 209 latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 210 v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v | |
| 211 cat( | |
| 212 paste0( | |
| 213 v_sub, | |
| 214 collapse = collapse_string | |
| 215 ) | |
| 216 ) | |
| 217 } | |
| 218 | |
| 219 latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 220 cat("\\begin{itemize}\n\\item ") | |
| 221 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 222 cat("\n\\end{itemize}\n") | |
| 223 } | |
| 224 | |
| 225 latex_itemized_list <- function(v, underscore_whack = TRUE) { | |
| 226 latex_itemized_collapsed("\n\\item ", v, underscore_whack) | |
| 227 } | |
| 228 | |
| 229 latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) { | |
| 230 cat("\\begin{enumerate}\n\\item ") | |
| 231 latex_collapsed_vector(collapse_string, v, underscore_whack) | |
| 232 cat("\n\\end{enumerate}\n") | |
| 233 } | |
| 234 | |
| 235 latex_enumerated_list <- function(v) { | |
| 236 latex_enumerated_collapsed("\n\\item ", v) | |
| 237 } | |
| 238 | |
| 239 latex_table_row <- function(v, extra = "", underscore_whack = TRUE) { | |
| 240 latex_collapsed_vector(" & ", v, underscore_whack) | |
| 241 cat(extra) | |
| 242 cat(" \\\\\n") | |
| 243 } | |
| 244 | |
| 245 # Use this like print.data.frame, from which it is adapted: | |
| 246 data_frame_latex <- | |
| 247 function( | |
| 248 x, | |
| 249 ..., | |
| 250 # digits to pass to format.data.frame | |
| 251 digits = NULL, | |
| 252 # TRUE -> right-justify columns; FALSE -> left-justify | |
| 253 right = TRUE, | |
| 254 # maximumn number of rows to print | |
| 255 max = NULL, | |
| 256 # string with justification of each column | |
| 257 justification = NULL, | |
| 258 # TRUE to center on page | |
| 259 centered = TRUE, | |
| 260 # optional caption | |
| 261 caption = NULL, | |
| 262 # h(inline); b(bottom); t (top) or p (separate page) | |
| 263 anchor = "h", | |
| 264 # set underscore_whack to TRUE to escape underscores | |
| 265 underscore_whack = TRUE | |
| 266 ) { | |
| 267 if (is.null(justification)) | |
| 268 justification <- | |
| 269 Reduce( | |
| 270 f = paste, | |
| 271 x = rep_len(if (right) "r" else "l", length(colnames(x))) | |
| 272 ) | |
| 273 n <- length(rownames(x)) | |
| 274 if (length(x) == 0L) { | |
| 275 cat( | |
| 276 sprintf( | |
| 277 # if n is one, use singular 'row', else use plural 'rows' | |
| 278 ngettext( | |
| 279 n, | |
| 280 "data frame with 0 columns and %d row", | |
| 281 "data frame with 0 columns and %d rows" | |
| 282 ), | |
| 283 n | |
| 284 ), | |
| 285 "\n", | |
| 286 sep = "" | |
| 287 ) | |
| 288 } else if (n == 0L) { | |
| 289 cat("0 rows for:\n") | |
| 290 latex_itemized_list( | |
| 291 v = names(x), | |
| 292 underscore_whack = underscore_whack | |
| 293 ) | |
| 294 } else { | |
| 295 if (is.null(max)) | |
| 296 max <- getOption("max.print", 99999L) | |
| 297 if (!is.finite(max)) | |
| 298 stop("invalid 'max' / getOption(\"max.print\"): ", | |
| 299 max) | |
| 300 omit <- (n0 <- max %/% length(x)) < n | |
| 301 m <- as.matrix( | |
| 302 format.data.frame( | |
| 303 if (omit) x[seq_len(n0), , drop = FALSE] else x, | |
| 304 digits = digits, | |
| 305 na.encode = FALSE | |
| 306 ) | |
| 307 ) | |
| 308 cat( | |
| 309 # h(inline); b(bottom); t (top) or p (separate page) | |
| 310 paste0("\\begin{table}[", anchor, "]\n") | |
| 311 ) | |
| 312 if (!is.null(caption)) | |
| 313 cat(paste0(" \\caption{", caption, "}")) | |
| 314 if (centered) cat("\\centering\n") | |
| 315 cat( | |
| 316 paste( | |
| 317 " \\begin{tabular}{", | |
| 318 justification, | |
| 319 "}\n", | |
| 320 sep = "" | |
| 321 ) | |
| 322 ) | |
| 323 # ref: https://tex.stackexchange.com/a/50353 | |
| 324 # Describes use of \rule{0pt}{3ex} | |
| 325 if (!is.null(caption)) | |
| 326 cat("\\B \\\\ \\hline\\hline\n") | |
| 327 # ref for top and bottom struts: https://tex.stackexchange.com/a/50355 | |
| 328 latex_table_row( | |
| 329 v = colnames(m), | |
| 330 extra = "\\T\\B", | |
| 331 underscore_whack = underscore_whack | |
| 332 ) | |
| 333 cat("\\hline\n") | |
| 334 for (i in seq_len(length(m[, 1]))) { | |
| 335 latex_table_row( | |
| 336 v = m[i, ], | |
| 337 underscore_whack = underscore_whack | |
| 338 ) | |
| 339 } | |
| 340 cat( | |
| 341 paste( | |
| 342 " \\end{tabular}", | |
| 343 "\\end{table}", | |
| 344 sep = "\n" | |
| 345 ) | |
| 346 ) | |
| 347 if (omit) | |
| 348 cat(" [ reached 'max' / getOption(\"max.print\") -- omitted", | |
| 349 n - n0, "rows ]\n") | |
| 350 } | |
| 351 invisible(x) | |
| 352 } | |
| 353 | |
| 354 hypersub <- | |
| 355 function(s) { | |
| 356 hyper <- tolower(s) | |
| 357 hyper <- gsub("[^a-z0-9]+", "-", hyper) | |
| 358 hyper <- gsub("[-]+", "-", hyper) | |
| 359 hyper <- sub("^[-]", "", hyper) | |
| 360 hyper <- sub("[-]$", "", hyper) | |
| 361 return(hyper) | |
| 362 } | |
| 363 | |
| 364 subsection_header <- | |
| 365 function(s) { | |
| 366 hyper <- hypersub(s) | |
| 367 cat( | |
| 368 sprintf( | |
| 369 "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n", | |
| 370 hyper, s, hyper | |
| 371 ) | |
| 372 ) | |
| 373 } | |
| 374 | |
| 375 subsubsection_header <- | |
| 376 function(s) { | |
| 377 hyper <- hypersub(s) | |
| 378 cat( | |
| 379 sprintf( | |
| 380 "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n", | |
| 381 hyper, s, hyper | |
| 382 ) | |
| 383 ) | |
| 384 } | |
| 385 | |
| 386 ### SQLite functions | |
| 387 | |
| 388 ddl_exec <- function(db, sql) { | |
| 389 discard <- DBI::dbExecute(conn = db, statement = sql) | |
| 390 if (FALSE && discard != 0) { | |
| 391 need_newpage <- TRUE | |
| 392 if (need_newpage) { | |
| 393 need_newpage <<- FALSE | |
| 394 cat("\\newpage\n") | |
| 395 } | |
| 396 o_file <- stdout() | |
| 397 cat("\n\\begin{verbatim}\n") | |
| 398 cat(sql, file = o_file) | |
| 399 cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file) | |
| 400 cat("\n\\end{verbatim}\n") | |
| 401 } | |
| 402 } | |
| 403 | |
| 404 dml_no_rows_exec <- function(db, sql) { | |
| 405 discard <- DBI::dbExecute(conn = db, statement = sql) | |
| 406 if (discard != 0) { | |
| 407 need_newpage <- TRUE | |
| 408 if (need_newpage) { | |
| 409 need_newpage <<- FALSE | |
| 410 cat("\\newpage\n") | |
| 411 } | |
| 412 cat("\n\\begin{verbatim}\n") | |
| 413 o_file <- stdout() | |
| 414 cat(sql, file = o_file) | |
| 415 cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file) | |
| 416 cat("\n\\end{verbatim}\n") | |
| 417 } | |
| 418 } | |
| 419 | |
| 420 ### KSEA functions and helpers | |
| 421 | |
| 422 # Adapted from KSEAapp::KSEA.Scores to allow retrieval of: | |
| 423 # - maximum log2(FC) | |
| 424 ksea_scores <- function( | |
| 425 | |
| 426 # For human data, typically, ksdata = KSEAapp::ksdata | |
| 427 ksdata, | |
| 428 | |
| 429 # Input data file having columns: | |
| 430 # - Protein : abbreviated protein name | |
| 431 # - Gene : HUGO gene name | |
| 432 # - Peptide : peptide sequence without indications of phosphorylation | |
| 433 # - Reside.Both : position(s) of phosphorylation within Gene sequence | |
| 434 # - First letter designates AA that is modified | |
| 435 # - Numbers indicate position within Gene | |
| 436 # - Multiple values are separated by semicolons | |
| 437 # - p : p-value | |
| 438 # - FC : fold-change | |
| 439 px, | |
| 440 | |
| 441 # A binary input of TRUE or FALSE, indicating whether or not to include | |
| 442 # NetworKIN predictions | |
| 443 networkin, | |
| 444 | |
| 445 # A numeric value between 1 and infinity setting the minimum NetworKIN | |
| 446 # score (can be left out if networkin = FALSE) | |
| 447 networkin_cutoff | |
| 448 | |
| 449 ) { | |
| 450 if (length(grep(";", px$Residue.Both)) == 0) { | |
| 451 # There are no Residue.Both entries having semicolons, so new is | |
| 452 # simply px except two columns are renamed and a column is added | |
| 453 # for log2(abs(fold-change)) | |
| 454 new <- px | |
| 455 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
| 456 new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) | |
| 457 new <- new[complete.cases(new$log2_fc), ] | |
| 458 } else { | |
| 459 # Split each row having semicolons in Residue.Both into rows that are | |
| 460 # duplicated in all respects except that each row has a single | |
| 461 # member of the set "split-on-semicolon-Residue.Both" | |
| 462 px_double <- px[grep(";", px$Residue.Both), ] | |
| 463 residues <- as.character(px_double$Residue.Both) | |
| 464 residues <- as.matrix(residues, ncol = 1) | |
| 465 split <- strsplit(residues, split = ";") | |
| 466 # x gets count of residues in each row, | |
| 467 # i.e., 1 + count of semicolons | |
| 468 x <- sapply(split, length) | |
| 469 # Here is the set of split rows | |
| 470 px_single <- data.frame( | |
| 471 Protein = rep(px_double$Protein, x), | |
| 472 Gene = rep(px_double$Gene, x), | |
| 473 Peptide = rep(px_double$Peptide, x), | |
| 474 Residue.Both = unlist(split), | |
| 475 p = rep(px_double$p, x), | |
| 476 FC = rep(px_double$FC, x) | |
| 477 ) | |
| 478 # new first gets the split rows | |
| 479 new <- px[-grep(";", px$Residue.Both), ] | |
| 480 # to new, append the rows that didn't need splitting in the first place | |
| 481 new <- rbind(new, px_single) | |
| 482 # map Gene to SUB_GENE | |
| 483 # map Residue.Both to SUB_MOD_RSD | |
| 484 colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") | |
| 485 # Eliminate any non-positive values to prevent introduction of | |
| 486 # infinite or NaN values | |
| 487 new[(0 <= new$log2_fc), "log2_fc"] <- NA | |
| 488 # Because of preceding step, there is no need for abs in the next line | |
| 489 new$log2_fc <- log2(as.numeric(as.character(new$FC))) | |
| 490 # Convert any illegal values from NaN to NA | |
| 491 new[is.nan(new$log2_fc), "log2_fc"] <- NA | |
| 492 # Eliminate rows having missing values (e.g., non-imputed data) | |
| 493 new <- new[complete.cases(new$log2_fc), ] | |
| 494 } | |
| 495 if (networkin == TRUE) { | |
| 496 # When NetworKIN is true, filter on NetworKIN.cutoff which includes | |
| 497 # PhosphoSitePlus data *because its networkin_score is set to Inf* | |
| 498 ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ] | |
| 499 ksdata_filtered <- ksdata_filtered[ | |
| 500 (ksdata_filtered$networkin_score >= networkin_cutoff), ] | |
| 501 } else { | |
| 502 # Otherwise, simply use PhosphSitePlus rows | |
| 503 ksdata_filtered <- ksdata[ | |
| 504 grep("PhosphoSitePlus", ksdata$Source), ] | |
| 505 } | |
| 506 # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD | |
| 507 # colnames of ksdata_filtered: | |
| 508 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID" | |
| 509 # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID" | |
| 510 # "SITE_...7_AA" "networkin_score" "Source" | |
| 511 # colnames of new: | |
| 512 # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc" | |
| 513 # Equivalent to: | |
| 514 # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc | |
| 515 # FROM ksdata_filtered a | |
| 516 # INNER JOIN new b | |
| 517 # ON a.SUB_GENE = b.SUB_GENE | |
| 518 # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD | |
| 519 ksdata_dataset <- base::merge(ksdata_filtered, new) | |
| 520 # colnames of ksdata_dataset: | |
| 521 # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" | |
| 522 # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" | |
| 523 # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein" | |
| 524 # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform) | |
| 525 # Re-order dataset; prior to accounting for isoforms | |
| 526 ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] | |
| 527 # Extract non-isoform accession in UniProtKB | |
| 528 ksdata_dataset$uniprot_no_isoform <- sapply( | |
| 529 ksdata_dataset$KIN_ACC_ID, | |
| 530 function(x) unlist(strsplit(as.character(x), split = "-"))[1] | |
| 531 ) | |
| 532 # Discard previous results while selecting interesting columns ... | |
| 533 ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] | |
| 534 # Column names are now: | |
| 535 # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p" | |
| 536 # "FC" "log2_fc" "Source" | |
| 537 # Make column names human-readable | |
| 538 colnames(ksdata_dataset_abbrev) <- c( | |
| 539 "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", | |
| 540 "FC", "log2FC", "Source" | |
| 541 ) | |
| 542 # SELECT * FROM ksdata_dataset_abbrev | |
| 543 # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p | |
| 544 ksdata_dataset_abbrev <- | |
| 545 ksdata_dataset_abbrev[ | |
| 546 order( | |
| 547 ksdata_dataset_abbrev$Kinase.Gene, | |
| 548 ksdata_dataset_abbrev$Substrate.Gene, | |
| 549 ksdata_dataset_abbrev$Substrate.Mod, | |
| 550 ksdata_dataset_abbrev$p), | |
| 551 ] | |
| 552 # First aggregation step to account for multiply phosphorylated peptides | |
| 553 # and differing peptide sequences; the goal here is to combine results | |
| 554 # for all measurements of the same substrate. | |
| 555 # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
| 556 # `Source`, avg(log2FC) AS log2FC | |
| 557 # FROM ksdata_dataset_abbrev | |
| 558 # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`, | |
| 559 # `Source` | |
| 560 # ORDER BY `Kinase.Gene`; | |
| 561 # in two steps: | |
| 562 # (1) compute average log_2(fold-change) | |
| 563 ksdata_dataset_abbrev <- aggregate( | |
| 564 log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, | |
| 565 data = ksdata_dataset_abbrev, | |
| 566 FUN = mean | |
| 567 ) | |
| 568 # (2) order by Kinase.Gene | |
| 569 ksdata_dataset_abbrev <- | |
| 570 ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] | |
| 571 # SELECT `Kinase.Gene`, count(*) | |
| 572 # FROM ksdata_dataset_abbrev | |
| 573 # GROUP BY `Kinase.Gene`; | |
| 574 # in two steps: | |
| 575 # (1) Extract the list of Kinase.Gene names | |
| 576 kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) | |
| 577 # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev, | |
| 578 # named by Kinase.Gene | |
| 579 kinase_list <- as.matrix(table(kinase_list)) | |
| 580 # Second aggregation step to account for all substrates per kinase | |
| 581 # CREATE TABLE mean_fc | |
| 582 # AS | |
| 583 # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC | |
| 584 # FROM ksdata_dataset_abbrev | |
| 585 # GROUP BY `Kinase.Gene` | |
| 586 mean_fc <- aggregate( | |
| 587 log2FC ~ Kinase.Gene, | |
| 588 data = ksdata_dataset_abbrev, | |
| 589 FUN = mean | |
| 590 ) | |
| 591 # mean_fc columns: "Kinase.Gene", "log2FC" | |
| 592 if (FALSE) { | |
| 593 # I need to re-think this; I was trying to find the most-represented | |
| 594 # peptide, but that horse has already left the barn | |
| 595 # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC | |
| 596 # FROM ksdata_dataset_abbrev | |
| 597 # GROUP BY `Kinase.Gene` | |
| 598 max_fc <- aggregate( | |
| 599 log2FC ~ Kinase.Gene, | |
| 600 data = ksdata_dataset_abbrev, | |
| 601 FUN = function(r) max(abs(r)) | |
| 602 ) | |
| 603 } | |
| 604 | |
| 605 # Create column 3: mS | |
| 606 mean_fc$m_s <- mean_fc[, 2] | |
| 607 # Create column 4: Enrichment | |
| 608 mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE)) | |
| 609 # Create column 5: m, count of substrates | |
| 610 mean_fc$m <- kinase_list | |
| 611 # Create column 6: z-score | |
| 612 mean_fc$z_score <- ( | |
| 613 (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) * | |
| 614 sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE) | |
| 615 # Create column 7: p-value, deduced from z-score | |
| 616 mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) | |
| 617 # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value | |
| 618 mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") | |
| 619 | |
| 620 # Remove log2FC column, which is duplicated as mS | |
| 621 mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2] | |
| 622 # Correct the column names which we had to hack because of the linter... | |
| 623 colnames(mean_fc) <- c( | |
| 624 "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR" | |
| 625 ) | |
| 626 return(mean_fc) | |
| 627 } | |
| 628 | |
| 629 low_fdr_barplot <- function( | |
| 630 rslt, | |
| 631 i_cntrst, | |
| 632 i, | |
| 633 a_level, | |
| 634 b_level, | |
| 635 fold_change, | |
| 636 caption | |
| 637 ) { | |
| 638 rslt_score_list_i <- rslt$score_list[[i]] | |
| 639 if (!is.null(rslt_score_list_i)) { | |
| 640 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
| 641 k <- data.frame( | |
| 642 contrast = as.integer(i_cntrst), | |
| 643 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
| 644 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
| 645 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
| 646 mean_log2_fc = rslt_score_list_i$mS, | |
| 647 enrichment = rslt_score_list_i$Enrichment, | |
| 648 substrate_count = rslt_score_list_i$m, | |
| 649 z_score = rslt_score_list_i$z.score, | |
| 650 p_value = rslt_score_list_i$p.value, | |
| 651 fdr = rslt_score_list_i$FDR | |
| 652 ) | |
| 653 selector <- switch( | |
| 654 ksea_cutoff_statistic, | |
| 655 "FDR" = { | |
| 656 k$fdr | |
| 657 }, | |
| 658 "p.value" = { | |
| 659 k$p_value | |
| 660 }, | |
| 661 stop( | |
| 662 sprintf( | |
| 663 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 664 ksea_cutoff_statistic | |
| 665 ) | |
| 666 ) | |
| 667 ) | |
| 668 | |
| 669 k <- k[selector < ksea_cutoff_threshold, ] | |
| 670 | |
| 671 if (nrow(k) > 1) { | |
| 672 op <- par(mai = c(1, 1.5, 0.4, 0.4)) | |
| 673 numeric_z_score <- as.numeric(k$z_score) | |
| 674 z_score_order <- order(numeric_z_score) | |
| 675 kinase_name <- k$kinase_gene | |
| 676 long_caption <- | |
| 677 sprintf( | |
| 678 "Kinase z-score, %s < %s, %s", | |
| 679 ksea_cutoff_statistic, | |
| 680 ksea_cutoff_threshold, | |
| 681 caption | |
| 682 ) | |
| 683 my_cex_caption <- 65.0 / max(65.0, nchar(long_caption)) | |
| 684 cat("\n\\clearpage\n") | |
| 685 barplot( | |
| 686 height = numeric_z_score[z_score_order], | |
| 687 border = NA, | |
| 688 xpd = FALSE, | |
| 689 cex.names = 1.0, | |
| 690 cex.axis = 1.0, | |
| 691 main = long_caption, | |
| 692 cex.main = my_cex_caption, | |
| 693 names.arg = kinase_name[z_score_order], | |
| 694 horiz = TRUE, | |
| 695 srt = 45, | |
| 696 las = 1) | |
| 697 par(op) | |
| 698 } | |
| 699 } | |
| 700 } | |
| 701 | |
| 702 # note that this adds elements to the global variable `ksea_asterisk_hash` | |
| 703 | |
| 704 low_fdr_print <- function( | |
| 705 rslt, | |
| 706 i_cntrst, | |
| 707 i, | |
| 708 a_level, | |
| 709 b_level, | |
| 710 fold_change, | |
| 711 caption | |
| 712 ) { | |
| 713 rslt_score_list_i <- rslt$score_list[[i]] | |
| 714 if (!is.null(rslt_score_list_i)) { | |
| 715 rslt_score_list_i_nrow <- nrow(rslt_score_list_i) | |
| 716 k <- contrast_ksea_scores <- data.frame( | |
| 717 contrast = as.integer(i_cntrst), | |
| 718 a_level = rep.int(a_level, rslt_score_list_i_nrow), | |
| 719 b_level = rep.int(b_level, rslt_score_list_i_nrow), | |
| 720 kinase_gene = rslt_score_list_i$Kinase.Gene, | |
| 721 mean_log2_fc = rslt_score_list_i$mS, | |
| 722 enrichment = rslt_score_list_i$Enrichment, | |
| 723 substrate_count = rslt_score_list_i$m, | |
| 724 z_score = rslt_score_list_i$z.score, | |
| 725 p_value = rslt_score_list_i$p.value, | |
| 726 fdr = rslt_score_list_i$FDR | |
| 727 ) | |
| 728 | |
| 729 selector <- switch( | |
| 730 ksea_cutoff_statistic, | |
| 731 "FDR" = { | |
| 732 k$fdr | |
| 733 }, | |
| 734 "p.value" = { | |
| 735 k$p_value | |
| 736 }, | |
| 737 stop( | |
| 738 sprintf( | |
| 739 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 740 ksea_cutoff_statistic | |
| 741 ) | |
| 742 ) | |
| 743 ) | |
| 744 | |
| 745 k <- k[selector < ksea_cutoff_threshold, ] | |
| 746 # save kinase names to ksea_asterisk_hash | |
| 747 for (kinase_name in k$kinase_gene) { | |
| 748 ksea_asterisk_hash[[kinase_name]] <- 1 | |
| 749 } | |
| 750 | |
| 751 db_write_table_overwrite <- (i_cntrst < 2) | |
| 752 db_write_table_append <- !db_write_table_overwrite | |
| 753 RSQLite::dbWriteTable( | |
| 754 conn = db, | |
| 755 name = "contrast_ksea_scores", | |
| 756 value = contrast_ksea_scores, | |
| 757 append = db_write_table_append | |
| 758 ) | |
| 759 selector <- switch( | |
| 760 ksea_cutoff_statistic, | |
| 761 "FDR" = { | |
| 762 contrast_ksea_scores$fdr | |
| 763 }, | |
| 764 "p.value" = { | |
| 765 contrast_ksea_scores$p_value | |
| 766 }, | |
| 767 stop( | |
| 768 sprintf( | |
| 769 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 770 ksea_cutoff_statistic | |
| 771 ) | |
| 772 ) | |
| 773 ) | |
| 774 output_df <- contrast_ksea_scores[ | |
| 775 selector < ksea_cutoff_threshold, | |
| 776 c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count", | |
| 777 "z_score", "p_value", "fdr") | |
| 778 ] | |
| 779 output_order <- with(output_df, order(mean_log2_fc, kinase_gene)) | |
| 780 output_df <- output_df[output_order, ] | |
| 781 colnames(output_df) <- | |
| 782 c( | |
| 783 colnames(output_df)[1], | |
| 784 colnames(output_df)[2], | |
| 785 "enrichment", | |
| 786 "m_s", | |
| 787 "z_score", | |
| 788 "p_value", | |
| 789 "fdr" | |
| 790 ) | |
| 791 output_df$fdr <- sprintf("%0.4f", output_df$fdr) | |
| 792 output_df$p_value <- sprintf("%0.2e", output_df$p_value) | |
| 793 output_df$z_score <- sprintf("%0.2f", output_df$z_score) | |
| 794 output_df$m_s <- sprintf("%d", output_df$m_s) | |
| 795 output_df$enrichment <- sprintf("%0.2f", output_df$enrichment) | |
| 796 output_ncol <- ncol(output_df) | |
| 797 colnames(output_df) <- | |
| 798 c( | |
| 799 "Kinase", | |
| 800 "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)", | |
| 801 "Enrichment", | |
| 802 "Substrates", | |
| 803 "z-score", | |
| 804 "p-value", | |
| 805 "FDR" | |
| 806 ) | |
| 807 selector <- switch( | |
| 808 ksea_cutoff_statistic, | |
| 809 "FDR" = { | |
| 810 rslt$score_list[[i]]$FDR | |
| 811 }, | |
| 812 "p.value" = { | |
| 813 rslt$score_list[[i]]$p.value | |
| 814 }, | |
| 815 stop( | |
| 816 sprintf( | |
| 817 "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'", | |
| 818 ksea_cutoff_statistic | |
| 819 ) | |
| 820 ) | |
| 821 ) | |
| 822 if (sum(selector < ksea_cutoff_threshold) > 0) { | |
| 823 math_caption <- gsub("{", "\\{", caption, fixed = TRUE) | |
| 824 math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE) | |
| 825 data_frame_latex( | |
| 826 x = output_df, | |
| 827 justification = "l c c c c c c", | |
| 828 centered = TRUE, | |
| 829 caption = sprintf( | |
| 830 "\\text{%s}, %s < %s", | |
| 831 math_caption, | |
| 832 ksea_cutoff_statistic, | |
| 833 ksea_cutoff_threshold | |
| 834 ), | |
| 835 anchor = const_table_anchor_p | |
| 836 ) | |
| 837 } else { | |
| 838 cat( | |
| 839 sprintf( | |
| 840 "\\break | |
| 841 No kinases had | |
| 842 \\(\\text{%s}_\\text{enrichment} < %s\\) | |
| 843 for contrast %s\\hfill\\break\n", | |
| 844 ksea_cutoff_statistic, | |
| 845 ksea_cutoff_threshold, | |
| 846 caption | |
| 847 ) | |
| 848 ) | |
| 849 } | |
| 850 } | |
| 851 } | |
| 852 | |
| 853 # create_breaks is a helper for ksea_heatmap | |
| 854 create_breaks <- function(merged_scores) { | |
| 855 if (min(merged_scores, na.rm = TRUE) < -1.6) { | |
| 856 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
| 857 breaks_neg <- | |
| 858 append( | |
| 859 seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10), | |
| 860 breaks_neg | |
| 861 ) | |
| 862 breaks_neg <- sort(unique(breaks_neg)) | |
| 863 } else { | |
| 864 breaks_neg <- seq(-1.6, 0, length.out = 30) | |
| 865 } | |
| 866 if (max(merged_scores, na.rm = TRUE) > 1.6) { | |
| 867 breaks_pos <- seq(0, 1.6, length.out = 30) | |
| 868 breaks_pos <- | |
| 869 append( | |
| 870 breaks_pos, | |
| 871 seq(1.6, max(merged_scores, na.rm = TRUE), | |
| 872 length.out = 10) | |
| 873 ) | |
| 874 breaks_pos <- sort(unique(breaks_pos)) | |
| 875 } else { | |
| 876 breaks_pos <- seq(0, 1.6, length.out = 30) | |
| 877 } | |
| 878 breaks_all <- unique(append(breaks_neg, breaks_pos)) | |
| 879 mycol_neg <- | |
| 880 gplots::colorpanel(n = length(breaks_neg), | |
| 881 low = "blue", | |
| 882 high = "white") | |
| 883 mycol_pos <- | |
| 884 gplots::colorpanel(n = length(breaks_pos) - 1, | |
| 885 low = "white", | |
| 886 high = "red") | |
| 887 mycol <- unique(append(mycol_neg, mycol_pos)) | |
| 888 color_breaks <- list(breaks_all, mycol) | |
| 889 return(color_breaks) | |
| 890 } | |
| 891 | |
| 892 # draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap | |
| 893 draw_kseaapp_summary_heatmap <- function( | |
| 894 x, | |
| 895 sample_cluster, | |
| 896 merged_asterisk, | |
| 897 my_cex_row, | |
| 898 color_breaks, | |
| 899 margins, | |
| 900 ... | |
| 901 ) { | |
| 902 merged_scores <- x | |
| 903 if (!is.matrix(x)) { | |
| 904 cat( | |
| 905 paste0( | |
| 906 "No plot because \\texttt{typeof(x)} is '", | |
| 907 typeof(x), | |
| 908 "' rather than 'matrix'.\n\n" | |
| 909 ) | |
| 910 ) | |
| 911 } else if (nrow(x) < 2) { | |
| 912 cat("No plot because matrix x has ", nrow(x), " rows.\n\n") | |
| 913 cat("\\begin{verbatim}\n") | |
| 914 str(x) | |
| 915 cat("\\end{verbatim}\n") | |
| 916 } else if (ncol(x) < 2) { | |
| 917 cat("No plot because matrix x has ", ncol(x), " columns.\n\n") | |
| 918 cat("\\begin{verbatim}\n") | |
| 919 str(x) | |
| 920 cat("\\end{verbatim}\n") | |
| 921 } else { | |
| 922 gplots::heatmap.2( | |
| 923 x = merged_scores, | |
| 924 Colv = sample_cluster, | |
| 925 scale = "none", | |
| 926 cellnote = merged_asterisk, | |
| 927 notecol = "white", | |
| 928 cexCol = 0.9, | |
| 929 # Heuristically assign size of row labels | |
| 930 cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25), | |
| 931 srtCol = 45, | |
| 932 srtRow = 45, | |
| 933 notecex = 3 * my_cex_row, | |
| 934 col = color_breaks[[2]], | |
| 935 density.info = "none", | |
| 936 trace = "none", | |
| 937 breaks = color_breaks[[1]], | |
| 938 lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), | |
| 939 lhei = c(0.4, 8.0, 1.1), | |
| 940 lwid = c(0.5, 3), | |
| 941 key = FALSE, | |
| 942 margins = margins, | |
| 943 ... | |
| 944 ) | |
| 945 } | |
| 946 } | |
| 947 | |
| 948 # Adapted from KSEAapp::KSEA.Heatmap | |
| 949 ksea_heatmap <- function( | |
| 950 # the data frame outputs from the KSEA.Scores() function, in list format | |
| 951 score_list, | |
| 952 # a character vector of all the sample names for heatmap annotation: | |
| 953 # - the names must be in the same order as the data in score_list | |
| 954 # - please avoid long names, as they may get cropped in the final image | |
| 955 sample_labels, | |
| 956 # character string of either "p.value" or "FDR" indicating the data column | |
| 957 # to use for marking statistically significant scores | |
| 958 stats, | |
| 959 # a numeric value between 0 and infinity indicating the min. number of | |
| 960 # substrates a kinase must have to be included in the heatmap | |
| 961 m_cutoff, | |
| 962 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
| 963 # for indicating significant kinases in the heatmap | |
| 964 p_cutoff = | |
| 965 stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"), | |
| 966 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
| 967 # hierarchical clustering of the sample columns | |
| 968 sample_cluster, | |
| 969 # a binary input of TRUE or FALSE, indicating whether or not to export | |
| 970 # the heatmap as a .png image into the working directory | |
| 971 export = FALSE, | |
| 972 # bottom and right margins; adjust as needed if contrast names are too long | |
| 973 margins = c(6, 20), | |
| 974 # print which kinases? | |
| 975 # - Mandatory argument, must be one of const_ksea_.*_kinases | |
| 976 which_kinases, | |
| 977 # additional arguments to gplots::heatmap.2, such as: | |
| 978 # - main: main title of plot | |
| 979 # - xlab: x-axis label | |
| 980 # - ylab: y-axis label | |
| 981 ... | |
| 982 ) { | |
| 983 filter_m <- function(dataset, m_cutoff) { | |
| 984 filtered <- dataset[(dataset$m >= m_cutoff), ] | |
| 985 return(filtered) | |
| 986 } | |
| 987 score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) | |
| 988 for (i in seq_len(length(score_list_m))) { | |
| 989 names <- colnames(score_list_m[[i]])[c(2:7)] | |
| 990 colnames(score_list_m[[i]])[c(2:7)] <- | |
| 991 paste(names, i, sep = ".") | |
| 992 } | |
| 993 master <- | |
| 994 Reduce( | |
| 995 f = function(...) { | |
| 996 base::merge(..., by = "Kinase.Gene", all = FALSE) | |
| 997 }, | |
| 998 x = score_list_m | |
| 999 ) | |
| 1000 | |
| 1001 row.names(master) <- master$Kinase.Gene | |
| 1002 columns <- as.character(colnames(master)) | |
| 1003 merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE]) | |
| 1004 colnames(merged_scores) <- sample_labels | |
| 1005 merged_stats <- as.matrix(master[, grep(stats, columns)]) | |
| 1006 asterisk <- function(mtrx, p_cutoff) { | |
| 1007 new <- data.frame() | |
| 1008 for (i in seq_len(nrow(mtrx))) { | |
| 1009 for (j in seq_len(ncol(mtrx))) { | |
| 1010 my_value <- mtrx[i, j] | |
| 1011 if (!is.na(my_value) && my_value < p_cutoff) { | |
| 1012 new[i, j] <- "*" | |
| 1013 } else { | |
| 1014 new[i, j] <- "" | |
| 1015 } | |
| 1016 } | |
| 1017 } | |
| 1018 return(new) | |
| 1019 } | |
| 1020 merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff)) | |
| 1021 | |
| 1022 # begin hack to print only significant rows | |
| 1023 asterisk_rows <- rowSums(merged_asterisk == "*") > 0 | |
| 1024 all_rows <- rownames(merged_stats) | |
| 1025 names(asterisk_rows) <- all_rows | |
| 1026 non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE]) | |
| 1027 asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE]) | |
| 1028 merged_scores_asterisk <- merged_scores[names(asterisk_rows), ] | |
| 1029 merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ] | |
| 1030 # end hack to print only significant rows | |
| 1031 | |
| 1032 row_list <- list() | |
| 1033 row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows | |
| 1034 row_list[[const_ksea_all_kinases]] <- all_rows | |
| 1035 row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows | |
| 1036 | |
| 1037 i <- which_kinases | |
| 1038 my_row_names <- row_list[[i]] | |
| 1039 scrs <- merged_scores[my_row_names, ] | |
| 1040 stts <- merged_stats[my_row_names, ] | |
| 1041 merged_asterisk <- as.matrix(asterisk(stts, p_cutoff)) | |
| 1042 | |
| 1043 color_breaks <- create_breaks(scrs) | |
| 1044 plot_height <- nrow(scrs) ^ 0.55 | |
| 1045 plot_width <- ncol(scrs) ^ 0.7 | |
| 1046 my_cex_row <- 0.25 * 16 / plot_height | |
| 1047 if (export == "TRUE") { | |
| 1048 png( | |
| 1049 "KSEA.Merged.Heatmap.png", | |
| 1050 width = plot_width * 300, | |
| 1051 height = 2 * plot_height * 300, | |
| 1052 res = 300, | |
| 1053 pointsize = 14 | |
| 1054 ) | |
| 1055 } | |
| 1056 draw_kseaapp_summary_heatmap( | |
| 1057 x = scrs, | |
| 1058 sample_cluster = sample_cluster, | |
| 1059 merged_asterisk = merged_asterisk, | |
| 1060 my_cex_row = my_cex_row, | |
| 1061 color_breaks = color_breaks, | |
| 1062 margins = margins | |
| 1063 ) | |
| 1064 if (export == "TRUE") { | |
| 1065 dev.off() | |
| 1066 } | |
| 1067 return(my_row_names) | |
| 1068 } | |
| 1069 | |
| 1070 # helper for heatmaps of phosphopeptide intensities | |
| 1071 | |
| 1072 draw_intensity_heatmap <- | |
| 1073 function( | |
| 1074 m, # matrix with rownames already formatted | |
| 1075 cutoff, # cutoff used by hm_heading_function | |
| 1076 hm_heading_function, # construct and cat heading from m and cutoff | |
| 1077 hm_main_title, # main title for plot (drawn below heading) | |
| 1078 suppress_row_dendrogram = TRUE, # set to false to show dendrogram | |
| 1079 max_peptide_count # experimental: | |
| 1080 = intensity_hm_rows, # values of 50 and 75 worked well | |
| 1081 ... # passthru parameters for heatmap | |
| 1082 ) { | |
| 1083 peptide_count <- 0 | |
| 1084 # emit the heading for the heatmap | |
| 1085 if (hm_heading_function(m, cutoff)) { | |
| 1086 peptide_count <- min(max_peptide_count, nrow(m)) | |
| 1087 if (nrow(m) > 1) { | |
| 1088 m_margin <- m[peptide_count:1, ] | |
| 1089 # Margin setting was heuristically derived | |
| 1090 margins <- | |
| 1091 c(0.5, # col | |
| 1092 max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row | |
| 1093 ) | |
| 1094 } | |
| 1095 if (nrow(m) > 1) { | |
| 1096 tryCatch( | |
| 1097 { | |
| 1098 old_oma <- par("oma") | |
| 1099 par(cex.main = 0.6) | |
| 1100 # Heuristically determined character size adjustment formula | |
| 1101 char_contractor <- | |
| 1102 250000 / ( | |
| 1103 max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows | |
| 1104 ) | |
| 1105 heatmap( | |
| 1106 m[peptide_count:1, ], | |
| 1107 Rowv = if (suppress_row_dendrogram) NA else NULL, | |
| 1108 Colv = NA, | |
| 1109 cexRow = char_contractor, | |
| 1110 cexCol = char_contractor * 50 / max_peptide_count, | |
| 1111 scale = "row", | |
| 1112 margins = margins, | |
| 1113 main = | |
| 1114 "Unimputed, unnormalized log(intensities)", | |
| 1115 xlab = "", | |
| 1116 las = 1, | |
| 1117 ... | |
| 1118 ) | |
| 1119 }, | |
| 1120 error = function(e) { | |
| 1121 cat( | |
| 1122 sprintf( | |
| 1123 "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n", | |
| 1124 e$message | |
| 1125 ) | |
| 1126 ) | |
| 1127 }, | |
| 1128 finally = par(old_oma) | |
| 1129 ) | |
| 1130 } | |
| 1131 } | |
| 1132 return(peptide_count) | |
| 1133 } | |
| 1134 ``` | |
| 1135 | |
| 1136 ```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 1137 cat("\\listoftables\n") | |
| 1138 ``` | |
| 1139 # Purpose | |
| 1140 | |
| 1141 To perform for phosphopeptides: | |
| 1142 | |
| 1143 - imputation of missing values, | |
| 1144 - quantile normalization, | |
| 1145 - ANOVA (using the R stats::`r params$oneWayManyCategories` function), and | |
| 1146 - KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases: | |
| 1147 - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org) | |
| 1148 - The Human Proteome Database [http://hprd.org](http://hprd.org) | |
| 1149 - NetworKIN [http://networkin.science/](http://networkin.science/) | |
| 1150 - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx) | |
| 1151 | |
| 1152 ```{r include = FALSE} | |
| 1153 | |
| 1154 ### GLOBAL VARIABLES | |
| 1155 | |
| 1156 # parameters for KSEA | |
| 1157 | |
| 1158 ksea_cutoff_statistic <- params$kseaCutoffStatistic | |
| 1159 ksea_cutoff_threshold <- params$kseaCutoffThreshold | |
| 1160 ksea_min_kinase_count <- params$kseaMinKinaseCount | |
| 1161 | |
| 1162 ksea_heatmap_titles <- list() | |
| 1163 ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <- | |
| 1164 sprintf( | |
| 1165 "Summary for all kinases enriched in one or more contrasts at %s < %s", | |
| 1166 ksea_cutoff_statistic, | |
| 1167 ksea_cutoff_threshold | |
| 1168 ) | |
| 1169 ksea_heatmap_titles[[const_ksea_all_kinases]] <- | |
| 1170 "Summary figure for all contrasts and all kinases" | |
| 1171 ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <- | |
| 1172 sprintf( | |
| 1173 "Summary for all kinases not enriched at %s < %s in any contrast", | |
| 1174 ksea_cutoff_statistic, | |
| 1175 ksea_cutoff_threshold | |
| 1176 ) | |
| 1177 # hash to hold names of significantly enriched kinases | |
| 1178 ksea_asterisk_hash <- new_env() | |
| 1179 | |
| 1180 # READ PARAMETERS (mostly) | |
| 1181 | |
| 1182 intensity_hm_rows <- params$intensityHeatmapRows | |
| 1183 # Input Filename | |
| 1184 input_file <- params$inputFile | |
| 1185 | |
| 1186 # First data column - ideally, this could be detected via regexSampleNames, | |
| 1187 # but for now leave it as is. | |
| 1188 first_data_column <- params$firstDataColumn | |
| 1189 fdc_is_integer <- is.integer(first_data_column) | |
| 1190 if (fdc_is_integer) { | |
| 1191 first_data_column <- as.integer(params$firstDataColumn) | |
| 1192 } | |
| 1193 | |
| 1194 # False discovery rate adjustment for ANOVA | |
| 1195 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 | |
| 1196 val_fdr <- | |
| 1197 read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "") | |
| 1198 | |
| 1199 if ( | |
| 1200 ncol(val_fdr) != 1 || | |
| 1201 sum(!is.numeric(val_fdr[, 1])) || | |
| 1202 sum(val_fdr[, 1] < 0) || | |
| 1203 sum(val_fdr[, 1] > 1) | |
| 1204 ) { | |
| 1205 stop("alphaFile should be one column of numbers within the range [0.0,1.0]") | |
| 1206 } | |
| 1207 val_fdr <- val_fdr[, 1] | |
| 1208 | |
| 1209 #Imputed Data filename | |
| 1210 imputed_data_filename <- params$imputedDataFilename | |
| 1211 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile | |
| 1212 anova_ksea_mtdt_file <- params$anovaKseaMetadata | |
| 1213 | |
| 1214 ``` | |
| 1215 | |
| 1216 ```{r echo = FALSE} | |
| 1217 # Imputation method, should be one of | |
| 1218 # "random", "group-median", "median", or "mean" | |
| 1219 imputation_method <- params$imputationMethod | |
| 1220 | |
| 1221 # Selection of percentile of logvalue data to set the mean for random number | |
| 1222 # generation when using random imputation | |
| 1223 mean_percentile <- params$meanPercentile / 100.0 | |
| 1224 | |
| 1225 # deviation adjustment-factor for random values; real number. | |
| 1226 sd_percentile <- params$sdPercentile | |
| 1227 | |
| 1228 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
| 1229 regex_sample_names <- params$regexSampleNames | |
| 1230 | |
| 1231 # Regular expression to extract Sample Grouping from Sample Name; | |
| 1232 # if error occurs, compare sample_treatment_levels vs. sample_name_matches | |
| 1233 # to see if groupings/pairs line up | |
| 1234 # e.g., "(\\d+)" | |
| 1235 regex_sample_grouping <- params$regexSampleGrouping | |
| 1236 | |
| 1237 one_way_all_categories_fname <- params$oneWayManyCategories | |
| 1238 one_way_all_categories <- try_catch_w_e( | |
| 1239 match.fun(one_way_all_categories_fname)) | |
| 1240 if (!is.function(one_way_all_categories$value)) { | |
| 1241 write("fatal error for parameter oneWayManyCategories:", stderr()) | |
| 1242 write(one_way_all_categories$value$message, stderr()) | |
| 1243 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
| 1244 stop("Cannot continue. Goodbye.") | |
| 1245 } | |
| 1246 one_way_all_categories <- one_way_all_categories$value | |
| 1247 | |
| 1248 one_way_two_categories_fname <- params$oneWayManyCategories | |
| 1249 one_way_two_categories <- try_catch_w_e( | |
| 1250 match.fun(one_way_two_categories_fname)) | |
| 1251 if (!is.function(one_way_two_categories$value)) { | |
| 1252 cat("fatal error for parameter oneWayTwoCategories: \n") | |
| 1253 cat(one_way_two_categories$value$message, fill = TRUE) | |
| 1254 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
| 1255 stop("Cannot continue. Goodbye.") | |
| 1256 } | |
| 1257 one_way_two_categories <- one_way_two_categories$value | |
| 1258 | |
| 1259 preproc_db <- params$preprocDb | |
| 1260 ksea_app_prep_db <- params$kseaAppPrepDb | |
| 1261 result <- file.copy( | |
| 1262 from = preproc_db, | |
| 1263 to = ksea_app_prep_db, | |
| 1264 overwrite = TRUE | |
| 1265 ) | |
| 1266 if (!result) { | |
| 1267 write( | |
| 1268 sprintf( | |
| 1269 "fatal error copying initial database '%s' to output '%s'", | |
| 1270 preproc_db, | |
| 1271 ksea_app_prep_db, | |
| 1272 ), | |
| 1273 stderr() | |
| 1274 ) | |
| 1275 if (sys.nframe() > 0) quit(save = "no", status = 1) | |
| 1276 stop("Cannot continue. Goodbye.") | |
| 1277 } | |
| 1278 ``` | |
| 1279 | |
| 1280 ```{r echo = FALSE} | |
| 1281 ### READ DATA | |
| 1282 | |
| 1283 # read.table reads a file in table format and creates a data frame from it. | |
| 1284 # - note that `quote = ""` means that quotation marks are treated literally. | |
| 1285 full_data <- read.table( | |
| 1286 file = input_file, | |
| 1287 sep = "\t", | |
| 1288 header = TRUE, | |
| 1289 quote = "", | |
| 1290 check.names = FALSE | |
| 1291 ) | |
| 1292 ``` | |
| 1293 | |
| 1294 # Extract Sample Names and Treatment Levels | |
| 1295 | |
| 1296 Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2. | |
| 1297 | |
| 1298 ```{r echo = FALSE, results = 'asis'} | |
| 1299 | |
| 1300 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | |
| 1301 | |
| 1302 if (!fdc_is_integer) { | |
| 1303 if (length(data_column_indices) > 0) { | |
| 1304 first_data_column <- data_column_indices[1] | |
| 1305 } else { | |
| 1306 stop(paste("failed to convert firstDataColumn:", first_data_column)) | |
| 1307 } | |
| 1308 } | |
| 1309 | |
| 1310 cat( | |
| 1311 sprintf( | |
| 1312 paste( | |
| 1313 "\n\nThe input data file has peptide-intensity data for each sample", | |
| 1314 "in one of columns %d through %d.\n\n" | |
| 1315 ), | |
| 1316 min(data_column_indices), | |
| 1317 max(data_column_indices) | |
| 1318 ) | |
| 1319 ) | |
| 1320 | |
| 1321 # Write column names as a LaTeX enumerated list. | |
| 1322 column_name_df <- data.frame( | |
| 1323 column = seq_len(length(colnames(full_data))), | |
| 1324 name = paste0("\\verb@", colnames(full_data), "@") | |
| 1325 ) | |
| 1326 data_frame_latex( | |
| 1327 x = column_name_df, | |
| 1328 justification = "l l", | |
| 1329 centered = TRUE, | |
| 1330 caption = "Input data column names", | |
| 1331 anchor = const_table_anchor_bp, | |
| 1332 underscore_whack = FALSE | |
| 1333 ) | |
| 1334 | |
| 1335 ``` | |
| 1336 | |
| 1337 ```{r echo = FALSE, results = 'asis'} | |
| 1338 quant_data <- full_data[first_data_column:length(full_data)] | |
| 1339 quant_data[quant_data == 0] <- NA | |
| 1340 rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide | |
| 1341 # Extract factors and trt-replicates using regular expressions. | |
| 1342 # Typically: | |
| 1343 # regex_sample_names is "\\.\\d+[A-Z]$" | |
| 1344 # regex_sample_grouping is "\\d+" | |
| 1345 # This would distinguish trt-replicates by terminal letter [A-Z] | |
| 1346 # in sample names and group them into trts by the preceding digits. | |
| 1347 # e.g.: | |
| 1348 # group .1A .1B .1C into group 1; | |
| 1349 # group .2A .2B .2C, into group 2; | |
| 1350 # etc. | |
| 1351 m <- regexpr(regex_sample_names, colnames(quant_data), perl = TRUE) | |
| 1352 sample_name_matches <- regmatches(names(quant_data), m) | |
| 1353 colnames(quant_data) <- sample_name_matches | |
| 1354 | |
| 1355 write_debug_file(quant_data) | |
| 1356 | |
| 1357 rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE) | |
| 1358 sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match)) | |
| 1359 number_of_samples <- length(sample_name_matches) | |
| 1360 sample_treatment_df <- data.frame( | |
| 1361 level = sample_treatment_levels, | |
| 1362 sample = sample_name_matches | |
| 1363 ) | |
| 1364 data_frame_latex( | |
| 1365 x = sample_treatment_df, | |
| 1366 justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}", | |
| 1367 centered = TRUE, | |
| 1368 caption = "Treatment levels", | |
| 1369 anchor = const_table_anchor_tbp, | |
| 1370 underscore_whack = FALSE | |
| 1371 ) | |
| 1372 ``` | |
| 1373 | |
| 1374 ```{r echo = FALSE, results = 'asis'} | |
| 1375 cat("\\newpage\n") | |
| 1376 ``` | |
| 1377 | |
| 1378 ## Are the log-transformed sample distributions similar? | |
| 1379 | |
| 1380 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
| 1381 | |
| 1382 quant_data[quant_data == 0] <- NA #replace 0 with NA | |
| 1383 quant_data_log <- log10(quant_data) | |
| 1384 | |
| 1385 rownames(quant_data_log) <- rownames(quant_data) | |
| 1386 colnames(quant_data_log) <- sample_name_matches | |
| 1387 | |
| 1388 write_debug_file(quant_data_log) | |
| 1389 | |
| 1390 # data visualization | |
| 1391 old_par <- par( | |
| 1392 mai = par("mai") + c(0.5, 0, 0, 0) | |
| 1393 ) | |
| 1394 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
| 1395 # Vertical plot | |
| 1396 boxplot( | |
| 1397 quant_data_log | |
| 1398 , las = 1 | |
| 1399 , col = const_boxplot_fill | |
| 1400 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 1401 , xlab = "Sample" | |
| 1402 ) | |
| 1403 par(old_par) | |
| 1404 | |
| 1405 | |
| 1406 | |
| 1407 cat("\n\n\n") | |
| 1408 cat("\n\n\n") | |
| 1409 | |
| 1410 ``` | |
| 1411 | |
| 1412 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | |
| 1413 if (nrow(quant_data_log) > 1) { | |
| 1414 quant_data_log_stack <- stack(quant_data_log) | |
| 1415 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + | |
| 1416 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | |
| 1417 ggplot2::ylab("Probability density") + | |
| 1418 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) | |
| 1419 } else { | |
| 1420 cat("No density plot because there are too few peptides.\n\n") | |
| 1421 } | |
| 1422 ``` | |
| 1423 | |
| 1424 ## Globally, are peptide intensities are approximately unimodal? | |
| 1425 | |
| 1426 <!-- | |
| 1427 # bquote could be used as an alternative to latex2exp::TeX below particularly | |
| 1428 # and when plotting math expressions generally, at the expense of mastering | |
| 1429 # another syntax, which hardly seems worthwhile when I need to use TeX | |
| 1430 # elsewhere; here's an introduction to bquote: | |
| 1431 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ | |
| 1432 --> | |
| 1433 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | |
| 1434 | |
| 1435 # identify the location of missing values | |
| 1436 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) | |
| 1437 | |
| 1438 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] | |
| 1439 logvalues_density <- density(logvalues) | |
| 1440 plot( | |
| 1441 x = logvalues_density, | |
| 1442 main = latex2exp::TeX( | |
| 1443 "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" | |
| 1444 ), | |
| 1445 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
| 1446 ylab = "Probability density" | |
| 1447 ) | |
| 1448 hist( | |
| 1449 x = as.numeric(as.matrix(quant_data_log)), | |
| 1450 xlim = c(min(logvalues_density$x), max(logvalues_density$x)), | |
| 1451 breaks = 100, | |
| 1452 main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), | |
| 1453 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 1454 ) | |
| 1455 ``` | |
| 1456 | |
| 1457 ## Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values | |
| 1458 | |
| 1459 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} | |
| 1460 # determine quantile | |
| 1461 q1 <- quantile(logvalues, probs = mean_percentile)[1] | |
| 1462 | |
| 1463 # 1 = row of matrix (ie, phosphopeptide) | |
| 1464 sds <- apply(quant_data_log, 1, sd_finite) | |
| 1465 if (sum(!is.na(sds)) > 2) { | |
| 1466 plot( | |
| 1467 density(sds, na.rm = TRUE) | |
| 1468 , main = "Smoothed estimated probability density vs. std. deviation" | |
| 1469 , sub = "(probability estimation made with Gaussian smoothing)" | |
| 1470 , ylab = "Probability density" | |
| 1471 ) | |
| 1472 } else { | |
| 1473 cat( | |
| 1474 "At least two non-missing values are required to plot", | |
| 1475 "probability density.\n\n" | |
| 1476 ) | |
| 1477 } | |
| 1478 | |
| 1479 ``` | |
| 1480 | |
| 1481 ```{r echo = FALSE} | |
| 1482 # Determine number of cells to impute | |
| 1483 temp <- quant_data[is.na(quant_data)] | |
| 1484 | |
| 1485 # Determine number of values to impute | |
| 1486 number_to_impute <- length(temp) | |
| 1487 | |
| 1488 # Determine percent of missing values | |
| 1489 pct_missing_values <- | |
| 1490 round(length(temp) / (length(logvalues) + length(temp)) * 100) | |
| 1491 ``` | |
| 1492 | |
| 1493 ```{r echo = FALSE} | |
| 1494 | |
| 1495 # prep for trt-median based imputation | |
| 1496 | |
| 1497 ``` | |
| 1498 # Impute Missing Values | |
| 1499 | |
| 1500 ```{r echo = FALSE} | |
| 1501 | |
| 1502 imp_smry_pot_peptides_before <- nrow(quant_data_log) | |
| 1503 imp_smry_missing_values_before <- number_to_impute | |
| 1504 imp_smry_pct_missing <- pct_missing_values | |
| 1505 | |
| 1506 ``` | |
| 1507 | |
| 1508 ```{r echo = FALSE} | |
| 1509 #Determine number of cells to impute | |
| 1510 | |
| 1511 ``` | |
| 1512 ```{r echo = FALSE} | |
| 1513 | |
| 1514 # Identify which values are missing and need to be imputed | |
| 1515 ind <- which(is.na(quant_data), arr.ind = TRUE) | |
| 1516 | |
| 1517 ``` | |
| 1518 ```{r echo = FALSE, results = 'asis'} | |
| 1519 | |
| 1520 # Apply imputation | |
| 1521 switch( | |
| 1522 imputation_method | |
| 1523 , "group-median" = { | |
| 1524 quant_data_imp <- quant_data | |
| 1525 imputation_method_description <- | |
| 1526 paste("Substitute missing value with", | |
| 1527 "median peptide-intensity for sample group.\n" | |
| 1528 ) | |
| 1529 sample_level_integers <- as.integer(sample_treatment_levels) | |
| 1530 # Take the accurate ln(x+1) because the data are log-normally distributed | |
| 1531 # and because median can involve an average of two measurements. | |
| 1532 quant_data_imp <- log1p(quant_data_imp) | |
| 1533 for (i in seq_len(length(levels(sample_treatment_levels)))) { | |
| 1534 # Determine the columns for this factor-level | |
| 1535 level_cols <- i == sample_level_integers | |
| 1536 # Extract those columns | |
| 1537 lvlsbst <- quant_data_imp[, level_cols, drop = FALSE] | |
| 1538 # assign to ind the row-column pairs corresponding to each NA | |
| 1539 ind <- which(is.na(lvlsbst), arr.ind = TRUE) | |
| 1540 # No group-median exists if there is only one sample | |
| 1541 # a given ppep has no measurement; otherwise, proceed. | |
| 1542 if (ncol(lvlsbst) > 1) { | |
| 1543 the_centers <- | |
| 1544 apply(lvlsbst, 1, median, na.rm = TRUE) | |
| 1545 for (j in seq_len(nrow(lvlsbst))) { | |
| 1546 for (k in seq_len(ncol(lvlsbst))) { | |
| 1547 if (is.na(lvlsbst[j, k])) { | |
| 1548 lvlsbst[j, k] <- the_centers[j] | |
| 1549 } | |
| 1550 } | |
| 1551 } | |
| 1552 quant_data_imp[, level_cols] <- lvlsbst | |
| 1553 } | |
| 1554 } | |
| 1555 # Take the accurate e^x - 1 to match scaling of original input. | |
| 1556 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
| 1557 good_rows <- !is.na(rowMeans(quant_data_imp)) | |
| 1558 } | |
| 1559 , "median" = { | |
| 1560 quant_data_imp <- quant_data | |
| 1561 imputation_method_description <- | |
| 1562 paste("Substitute missing value with", | |
| 1563 "median peptide-intensity across all sample classes.\n" | |
| 1564 ) | |
| 1565 # Take the accurate ln(x+1) because the data are log-normally distributed | |
| 1566 # and because median can involve an average of two measurements. | |
| 1567 quant_data_imp <- log1p(quant_data_imp) | |
| 1568 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = TRUE)[ind[, 1]] | |
| 1569 # Take the accurate e^x - 1 to match scaling of original input. | |
| 1570 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
| 1571 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
| 1572 } | |
| 1573 , "mean" = { | |
| 1574 quant_data_imp <- quant_data | |
| 1575 imputation_method_description <- | |
| 1576 paste("Substitute missing value with", | |
| 1577 "geometric-mean peptide intensity across all sample classes.\n" | |
| 1578 ) | |
| 1579 # Take the accurate ln(x+1) because the data are log-normally distributed, | |
| 1580 # so arguments to mean should be previously transformed. | |
| 1581 # this will have to be | |
| 1582 quant_data_imp <- log1p(quant_data_imp) | |
| 1583 # Assign to NA cells the mean for the row | |
| 1584 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = TRUE)[ind[, 1]] | |
| 1585 # Take the accurate e^x - 1 to match scaling of original input. | |
| 1586 quant_data_imp <- round(expm1(quant_data_imp_ln <- quant_data_imp)) | |
| 1587 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
| 1588 } | |
| 1589 , "random" = { | |
| 1590 quant_data_imp <- quant_data | |
| 1591 m1 <- median(sds, na.rm = TRUE) * sd_percentile #sd to be used is the median sd | |
| 1592 # If you want results to be reproducible, you will want to call | |
| 1593 # base::set.seed before calling stats::rnorm | |
| 1594 imputation_method_description <- | |
| 1595 paste("Substitute each missing value with random intensity", | |
| 1596 sprintf( | |
| 1597 "random intensity $N \\sim (%0.2f, %0.2f)$.\n", | |
| 1598 q1, m1 | |
| 1599 ) | |
| 1600 ) | |
| 1601 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n", | |
| 1602 100 * mean_percentile)) | |
| 1603 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", | |
| 1604 sd_percentile)) | |
| 1605 quant_data_imp[ind] <- | |
| 1606 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) | |
| 1607 quant_data_imp_ln <- log1p(quant_data_imp) | |
| 1608 good_rows <- !is.nan(rowMeans(quant_data_imp)) | |
| 1609 } | |
| 1610 ) | |
| 1611 quant_data_imp_log10 <- quant_data_imp_ln * const_log10_e | |
| 1612 | |
| 1613 if (length(good_rows) < 1) { | |
| 1614 print("ERROR: Cannot impute data; there are no good rows!") | |
| 1615 return(-1) | |
| 1616 } | |
| 1617 ``` | |
| 1618 | |
| 1619 ```{r echo = FALSE, results = 'asis'} | |
| 1620 cat("\\quad\n\nImputation method:\n\n\n", imputation_method_description) | |
| 1621 ``` | |
| 1622 | |
| 1623 ```{r echo = FALSE} | |
| 1624 | |
| 1625 imp_smry_pot_peptides_after <- sum(good_rows) | |
| 1626 imp_smry_rejected_after <- sum(!good_rows) | |
| 1627 imp_smry_missing_values_after <- sum(is.na(quant_data_imp[good_rows, ])) | |
| 1628 ``` | |
| 1629 ```{r echo = FALSE, results = 'asis'} | |
| 1630 # ref: http://www1.maths.leeds.ac.uk/latex/TableHelp1.pdf | |
| 1631 tabular_lines_fmt <- paste( | |
| 1632 "\\begin{table}[hb]", # h(inline); b(bottom); t (top) or p (separate page) | |
| 1633 " \\caption{Imputation Results}", | |
| 1634 " \\centering", # \centering centers the table on the page | |
| 1635 " \\begin{tabular}{l c c c}", | |
| 1636 " \\hline\\hline", | |
| 1637 " \\ & potential peptides & missing values & rejected", | |
| 1638 " peptides \\\\ [0.5ex]", | |
| 1639 " \\hline", | |
| 1640 " before imputation & %d & %d (%d\\%s) & \\\\", | |
| 1641 " after imputation & %d & %d & %d \\\\ [1ex]", | |
| 1642 " \\hline", | |
| 1643 " \\end{tabular}", | |
| 1644 #" \\label{table:nonlin}", # may be used to refer this table in the text | |
| 1645 "\\end{table}", | |
| 1646 sep = "\n" | |
| 1647 ) | |
| 1648 tabular_lines <- | |
| 1649 sprintf( | |
| 1650 tabular_lines_fmt, | |
| 1651 imp_smry_pot_peptides_before, | |
| 1652 imp_smry_missing_values_before, | |
| 1653 imp_smry_pct_missing, | |
| 1654 "%", | |
| 1655 imp_smry_pot_peptides_after, | |
| 1656 imp_smry_missing_values_after, | |
| 1657 imp_smry_rejected_after | |
| 1658 ) | |
| 1659 cat(tabular_lines) | |
| 1660 ``` | |
| 1661 ```{r echo = FALSE} | |
| 1662 | |
| 1663 | |
| 1664 # Zap rows where imputation was ineffective | |
| 1665 full_data <- full_data [good_rows, ] | |
| 1666 quant_data <- quant_data [good_rows, ] | |
| 1667 | |
| 1668 quant_data_imp <- quant_data_imp[good_rows, ] | |
| 1669 write_debug_file(quant_data_imp) | |
| 1670 quant_data_imp_good_rows <- quant_data_imp | |
| 1671 | |
| 1672 write_debug_file(quant_data_imp_good_rows) | |
| 1673 ``` | |
| 1674 | |
| 1675 ```{r echo = FALSE, results = 'asis'} | |
| 1676 | |
| 1677 can_plot_before_after_imp <- TRUE | |
| 1678 d_combined <- | |
| 1679 as.numeric( | |
| 1680 as.matrix( | |
| 1681 log10(quant_data_imp) | |
| 1682 ) | |
| 1683 ) | |
| 1684 d_original <- | |
| 1685 as.numeric( | |
| 1686 as.matrix( | |
| 1687 log10(quant_data_imp[!is.na(quant_data)]) | |
| 1688 ) | |
| 1689 ) | |
| 1690 | |
| 1691 if (sum(!is.na(d_original)) > 2) { | |
| 1692 d_original <- density(d_original) | |
| 1693 } else { | |
| 1694 can_plot_before_after_imp <- FALSE | |
| 1695 } | |
| 1696 if (can_plot_before_after_imp && sum(is.na(d_combined)) < 1) { | |
| 1697 d_combined <- density(d_combined) | |
| 1698 } else { | |
| 1699 can_plot_before_after_imp <- FALSE | |
| 1700 } | |
| 1701 | |
| 1702 if (sum(is.na(quant_data)) > 0) { | |
| 1703 # There ARE missing values | |
| 1704 d_imputed <- | |
| 1705 as.numeric( | |
| 1706 as.matrix( | |
| 1707 log10(quant_data_imp[is.na(quant_data)]) | |
| 1708 ) | |
| 1709 ) | |
| 1710 if (can_plot_before_after_imp && sum(is.na(d_imputed)) < 1) { | |
| 1711 d_imputed <- (density(d_imputed)) | |
| 1712 } else { | |
| 1713 can_plot_before_after_imp <- FALSE | |
| 1714 } | |
| 1715 } else { | |
| 1716 # There are NO missing values | |
| 1717 d_imputed <- d_combined | |
| 1718 } | |
| 1719 | |
| 1720 ``` | |
| 1721 | |
| 1722 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
| 1723 zero_sd_rownames <- | |
| 1724 rownames(quant_data_imp)[ | |
| 1725 is.na((apply(quant_data_imp, 1, sd, na.rm = TRUE)) == 0) | |
| 1726 ] | |
| 1727 | |
| 1728 if (length(zero_sd_rownames) >= nrow(quant_data_imp)) { | |
| 1729 stop("All peptides have zero standard deviation. Cannot continue.") | |
| 1730 } | |
| 1731 if (length(zero_sd_rownames) > 0) { | |
| 1732 cat( | |
| 1733 sprintf("%d peptides with zero variance were removed from statistical consideration", | |
| 1734 length(zero_sd_rownames) | |
| 1735 ) | |
| 1736 ) | |
| 1737 zap_named_rows <- function(df, nms) { | |
| 1738 return(df[!(row.names(df) %in% nms), ]) | |
| 1739 } | |
| 1740 quant_data_imp <- zap_named_rows(quant_data_imp, zero_sd_rownames) | |
| 1741 quant_data <- zap_named_rows(quant_data, zero_sd_rownames) | |
| 1742 full_data <- zap_named_rows(full_data, zero_sd_rownames) | |
| 1743 } | |
| 1744 | |
| 1745 if (sum(is.na(quant_data)) > 0) { | |
| 1746 cat("\\leavevmode\\newpage\n") | |
| 1747 # data visualization | |
| 1748 old_par <- par( | |
| 1749 mai = par("mai") + c(0.5, 0, 0, 0) | |
| 1750 ) | |
| 1751 # Copy quant data to x | |
| 1752 x <- quant_data | |
| 1753 # x gets to have values of: | |
| 1754 # - NA for observed values | |
| 1755 # - 1 for missing values | |
| 1756 x[is.na(x)] <- 0 | |
| 1757 x[x > 1] <- NA | |
| 1758 x[x == 0] <- 1 | |
| 1759 # Log-transform imputed data | |
| 1760 # update variable because rows may have been eliminated from quant_data_imp | |
| 1761 quant_data_imp_log10 <- log10(quant_data_imp) | |
| 1762 | |
| 1763 write_debug_file(quant_data_imp_log10) | |
| 1764 | |
| 1765 # Set blue_dots to log of quant data or NA for NA quant data | |
| 1766 blue_dots <- log10(quant_data) | |
| 1767 # Set red_dots to log of imputed data or NA for observed quant data | |
| 1768 red_dots <- quant_data_imp_log10 * x | |
| 1769 | |
| 1770 count_red <- sum(!is.na(red_dots)) | |
| 1771 count_blue <- sum(!is.na(blue_dots)) | |
| 1772 ylim_save <- ylim <- c( | |
| 1773 min(red_dots, blue_dots, na.rm = TRUE), | |
| 1774 max(red_dots, blue_dots, na.rm = TRUE) | |
| 1775 ) | |
| 1776 show_stripchart <- | |
| 1777 50 > (count_red + count_blue) / length(sample_name_matches) | |
| 1778 if (show_stripchart) { | |
| 1779 boxplot_sub <- "Light blue = data before imputation; Red = imputed data" | |
| 1780 } else { | |
| 1781 boxplot_sub <- "" | |
| 1782 } | |
| 1783 | |
| 1784 # Vertical plot | |
| 1785 colnames(blue_dots) <- sample_name_matches | |
| 1786 boxplot( | |
| 1787 blue_dots | |
| 1788 , las = 1 # "always horizontal" | |
| 1789 , col = const_boxplot_fill | |
| 1790 , ylim = ylim | |
| 1791 , main = "Peptide intensities after eliminating unusable peptides" | |
| 1792 , sub = boxplot_sub | |
| 1793 , xlab = "Sample" | |
| 1794 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 1795 ) | |
| 1796 | |
| 1797 if (show_stripchart) { | |
| 1798 # Points | |
| 1799 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
| 1800 # NA values are not plotted | |
| 1801 stripchart( | |
| 1802 blue_dots, # Data | |
| 1803 method = "jitter", # Random noise | |
| 1804 jitter = const_stripchart_jitter, | |
| 1805 pch = 19, # Pch symbols | |
| 1806 cex = const_stripsmall_cex, # Size of symbols reduced | |
| 1807 col = "lightblue", # Color of the symbol | |
| 1808 vertical = TRUE, # Vertical mode | |
| 1809 add = TRUE # Add it over | |
| 1810 ) | |
| 1811 stripchart( | |
| 1812 red_dots, # Data | |
| 1813 method = "jitter", # Random noise | |
| 1814 jitter = const_stripchart_jitter, | |
| 1815 pch = 19, # Pch symbols | |
| 1816 cex = const_stripsmall_cex, # Size of symbols reduced | |
| 1817 col = "red", # Color of the symbol | |
| 1818 vertical = TRUE, # Vertical mode | |
| 1819 add = TRUE # Add it over | |
| 1820 ) | |
| 1821 | |
| 1822 } | |
| 1823 if (TRUE) { | |
| 1824 # show measured values in blue on left half-violin plot | |
| 1825 cat("\\leavevmode\n\\quad\n\n\\quad\n\n") | |
| 1826 vioplot::vioplot( | |
| 1827 x = lapply(blue_dots, function(x) x[!is.na(x)]), | |
| 1828 col = "lightblue1", | |
| 1829 side = "left", | |
| 1830 plotCentre = "line", | |
| 1831 ylim = ylim_save, | |
| 1832 main = "Distributions of observed and imputed data", | |
| 1833 sub = "Light blue = observed data; Pink = imputed data", | |
| 1834 xlab = "Sample", | |
| 1835 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 1836 ) | |
| 1837 red_violins <- lapply(red_dots, function(x) x[!is.na(x)]) | |
| 1838 cols_to_delete <- c() | |
| 1839 for (ix in seq_len(length(red_violins))) { | |
| 1840 if (length(red_violins[[ix]]) < 1) { | |
| 1841 cols_to_delete <- c(cols_to_delete, ix) | |
| 1842 } | |
| 1843 } | |
| 1844 # destroy any unimputable columns | |
| 1845 if (!is.null(cols_to_delete)) { | |
| 1846 red_violins <- red_violins[-cols_to_delete] | |
| 1847 } | |
| 1848 # plot imputed values in red on right half-violin plot | |
| 1849 vioplot::vioplot( | |
| 1850 x = red_violins, | |
| 1851 col = "lightpink1", | |
| 1852 side = "right", | |
| 1853 plotCentre = "line", | |
| 1854 add = TRUE | |
| 1855 ) | |
| 1856 } | |
| 1857 | |
| 1858 par(old_par) | |
| 1859 | |
| 1860 # density plot | |
| 1861 cat("\\leavevmode\n\n\n\n\n\n\n") | |
| 1862 if (can_plot_before_after_imp) { | |
| 1863 ylim <- c( | |
| 1864 0, | |
| 1865 max( | |
| 1866 if (is.list(d_combined)) d_combined$y else d_combined, | |
| 1867 if (is.list(d_original)) d_original$y else d_original, | |
| 1868 if (is.list(d_imputed)) d_imputed$y else d_imputed, | |
| 1869 na.rm = TRUE | |
| 1870 ) | |
| 1871 ) | |
| 1872 plot( | |
| 1873 d_combined, | |
| 1874 ylim = ylim, | |
| 1875 sub = | |
| 1876 paste( | |
| 1877 "Blue = data before imputation; Red = imputed data;", | |
| 1878 "Black = combined" | |
| 1879 ), | |
| 1880 main = "Density of peptide intensity before and after imputation", | |
| 1881 xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), | |
| 1882 ylab = "Probability density" | |
| 1883 ) | |
| 1884 lines(d_original, col = "blue") | |
| 1885 lines(d_imputed, col = "red") | |
| 1886 } else { | |
| 1887 cat( | |
| 1888 "There are too few points to plot the density of peptide intensity", | |
| 1889 "before and after imputation." | |
| 1890 ) | |
| 1891 } | |
| 1892 cat("\\leavevmode\\newpage\n") | |
| 1893 } | |
| 1894 ``` | |
| 1895 | |
| 1896 # Perform Quantile Normalization | |
| 1897 | |
| 1898 The excellent `normalize.quantiles` function from | |
| 1899 *[the `preprocessCore` Bioconductor package](http://bioconductor.org/packages/release/bioc/html/preprocessCore.html)* | |
| 1900 performs "quantile normalization" as described Bolstad *et al.* (2003), | |
| 1901 DOI *[10.1093/bioinformatics/19.2.185](https://doi.org/10.1093%2Fbioinformatics%2F19.2.185)* | |
| 1902 and *its supplementary material [http://bmbolstad.com/misc/normalize/normalize.html](http://bmbolstad.com/misc/normalize/normalize.html)*, | |
| 1903 i.e., it assumes that the goal is to detect | |
| 1904 subtle differences among grossly similar samples (having similar distributions) | |
| 1905 by equailzing intra-quantile quantitations. | |
| 1906 Unfortunately, one software library upon which it depends | |
| 1907 *[suffers from a concurrency defect](https://support.bioconductor.org/p/122925/#9135989)* | |
| 1908 that requires that a specific, non-concurrent version of the library be | |
| 1909 installed. The installation command equivalent to what was used to install the library to produce the results presented here is: | |
| 1910 ``` | |
| 1911 conda install bioconductor-preprocesscore openblas=0.3.3 | |
| 1912 ``` | |
| 1913 | |
| 1914 | |
| 1915 <!-- | |
| 1916 # Apply quantile normalization using preprocessCore::normalize.quantiles | |
| 1917 # --- | |
| 1918 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | |
| 1919 # except this: https://support.bioconductor.org/p/122925/#9135989 | |
| 1920 # says to install it like this: | |
| 1921 # ``` | |
| 1922 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) | |
| 1923 # ``` | |
| 1924 # conda installation (necessary because of a bug in recent openblas): | |
| 1925 # conda install bioconductor-preprocesscore openblas=0.3.3 | |
| 1926 # ... | |
| 1927 # --- | |
| 1928 # normalize.quantiles {preprocessCore} -- Quantile Normalization | |
| 1929 # | |
| 1930 # Description: | |
| 1931 # Using a normalization based upon quantiles, this function normalizes a | |
| 1932 # matrix of probe level intensities. | |
| 1933 # | |
| 1934 # THIS FUNCTIONS WILL HANDLE MISSING DATA (ie NA values), based on the | |
| 1935 # assumption that the data is missing at random. | |
| 1936 # | |
| 1937 # Usage: | |
| 1938 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) | |
| 1939 # | |
| 1940 # Arguments: | |
| 1941 # | |
| 1942 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | |
| 1943 # | |
| 1944 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | |
| 1945 # but in certain situations not making a copy of the matrix, but instead normalizing | |
| 1946 # it in place will be more memory friendly. | |
| 1947 # | |
| 1948 # - keep.names: Boolean option to preserve matrix row and column names in output. | |
| 1949 # | |
| 1950 # Details: | |
| 1951 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | |
| 1952 # No special allowances are made for outliers. If you make use of quantile normalization | |
| 1953 # please cite Bolstad et al, Bioinformatics (2003). | |
| 1954 # | |
| 1955 # This functions will handle missing data (ie NA values), based on | |
| 1956 # the assumption that the data is missing at random. | |
| 1957 # | |
| 1958 # Note that the current implementation optimizes for better memory usage | |
| 1959 # at the cost of some additional run-time. | |
| 1960 # | |
| 1961 # Value: A normalized matrix. | |
| 1962 # | |
| 1963 # Author: Ben Bolstad, bmbolstad.com | |
| 1964 # | |
| 1965 # References | |
| 1966 # | |
| 1967 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | |
| 1968 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | |
| 1969 # | |
| 1970 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | |
| 1971 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | |
| 1972 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | |
| 1973 # http://bmbolstad.com/misc/normalize/normalize.html | |
| 1974 # ... | |
| 1975 --> | |
| 1976 ```{r echo = FALSE, results = 'asis'} | |
| 1977 | |
| 1978 if (nrow(quant_data_imp) > 0) { | |
| 1979 quant_data_imp_qn <- preprocessCore::normalize.quantiles( | |
| 1980 as.matrix(quant_data_imp), keep.names = TRUE | |
| 1981 ) | |
| 1982 } else { | |
| 1983 quant_data_imp_qn <- as.matrix(quant_data_imp) | |
| 1984 } | |
| 1985 | |
| 1986 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) | |
| 1987 | |
| 1988 write_debug_file(quant_data_imp_qn) | |
| 1989 | |
| 1990 quant_data_imp_qn_log <- log10(quant_data_imp_qn) | |
| 1991 | |
| 1992 write_debug_file(quant_data_imp_qn_log) | |
| 1993 | |
| 1994 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) | |
| 1995 | |
| 1996 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) | |
| 1997 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls | |
| 1998 | |
| 1999 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ] | |
| 2000 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) | |
| 2001 | |
| 2002 quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls) | |
| 2003 | |
| 2004 write_debug_file(quant_data_imp_qn_ls) | |
| 2005 write_debug_file(quant_data_imp_qn_ls2) | |
| 2006 | |
| 2007 # Create data.frame used by ANOVA analysis | |
| 2008 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) | |
| 2009 ``` | |
| 2010 | |
| 2011 <!-- ACE insertion begin --> | |
| 2012 ## Are normalized, imputed, log-transformed sample distributions similar? | |
| 2013 | |
| 2014 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} | |
| 2015 | |
| 2016 # Save unimputed quant_data_log for plotting below | |
| 2017 unimputed_quant_data_log <- quant_data_log | |
| 2018 | |
| 2019 # log10 transform (after preparing for zero values, | |
| 2020 # which should never happen...) | |
| 2021 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 | |
| 2022 quant_data_log <- log10(quant_data_imp_qn) | |
| 2023 | |
| 2024 how_many_peptides <- nrow(quant_data_log) | |
| 2025 | |
| 2026 if ((how_many_peptides) > 0) { | |
| 2027 cat( | |
| 2028 sprintf( | |
| 2029 "Intensities for %d peptides:\n\n\n", | |
| 2030 how_many_peptides | |
| 2031 ) | |
| 2032 ) | |
| 2033 cat("\n\n\n") | |
| 2034 | |
| 2035 | |
| 2036 # data visualization | |
| 2037 old_par <- par( | |
| 2038 mai = par("mai") + c(0.5, 0, 0, 0) | |
| 2039 , oma = par("oma") + c(0.5, 0, 0, 0) | |
| 2040 ) | |
| 2041 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
| 2042 # Vertical plot | |
| 2043 colnames(quant_data_log) <- sample_name_matches | |
| 2044 boxplot( | |
| 2045 quant_data_log | |
| 2046 , las = 1 | |
| 2047 , col = const_boxplot_fill | |
| 2048 , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 2049 , xlab = "Sample" | |
| 2050 ) | |
| 2051 par(old_par) | |
| 2052 } else { | |
| 2053 cat("There are no peptides to plot\n") | |
| 2054 } | |
| 2055 | |
| 2056 cat("\n\n\n") | |
| 2057 | |
| 2058 ``` | |
| 2059 | |
| 2060 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'} | |
| 2061 if (nrow(quant_data_log) > 1) { | |
| 2062 quant_data_log_stack <- stack(quant_data_log) | |
| 2063 ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) + | |
| 2064 ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + | |
| 2065 ggplot2::ylab("Probability density") + | |
| 2066 ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE) | |
| 2067 } else { | |
| 2068 cat("No density plot because there are fewer than two peptides to plot.\n\n") | |
| 2069 } | |
| 2070 ``` | |
| 2071 ```{r echo = FALSE, results = 'asis'} | |
| 2072 cat("\\leavevmode\\newpage\n") | |
| 2073 ``` | |
| 2074 | |
| 2075 # ANOVA Analysis | |
| 2076 | |
| 2077 ```{r, echo = FALSE} | |
| 2078 # Make new data frame containing only Phosphopeptides | |
| 2079 # to connect preANOVA to ANOVA (connect_df) | |
| 2080 connect_df <- data.frame( | |
| 2081 data_table_imp_qn_lt$Phosphopeptide | |
| 2082 , data_table_imp_qn_lt[, first_data_column] | |
| 2083 ) | |
| 2084 colnames(connect_df) <- c("Phosphopeptide", "Intensity") | |
| 2085 ``` | |
| 2086 | |
| 2087 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 2088 count_of_treatment_levels <- length(levels(sample_treatment_levels)) | |
| 2089 if (count_of_treatment_levels < 2) { | |
| 2090 nuke_control_sequences <- | |
| 2091 function(s) { | |
| 2092 s <- gsub("[\\]", "xyzzy_plugh", s) | |
| 2093 s <- gsub("[$]", "\\\\$", s) | |
| 2094 s <- gsub("xyzzy_plugh", "$\\\\backslash$", s) | |
| 2095 return(s) | |
| 2096 } | |
| 2097 cat( | |
| 2098 "ERROR!!!! Cannot perform ANOVA analysis", | |
| 2099 "(see next page)\\newpage\n" | |
| 2100 ) | |
| 2101 cat( | |
| 2102 "ERROR: ANOVA analysis", | |
| 2103 "requires two or more factor levels!\n\n\n" | |
| 2104 ) | |
| 2105 | |
| 2106 cat("\n\n\n\n\n") | |
| 2107 cat("Unparsed sample names are:\n\n\n", | |
| 2108 "\\begin{quote}\n", | |
| 2109 paste(names(quant_data_imp_qn_log), collapse = "\n\n\n"), | |
| 2110 "\n\\end{quote}\n\n") | |
| 2111 | |
| 2112 regex_sample_names <- nuke_control_sequences(regex_sample_names) | |
| 2113 | |
| 2114 cat("\\leavevmode\n\n\n") | |
| 2115 cat("Parsing rule for SampleNames is", | |
| 2116 "\n\n\n", | |
| 2117 "\\text{'", | |
| 2118 regex_sample_names, | |
| 2119 "'}\n\n\n", | |
| 2120 sep = "" | |
| 2121 ) | |
| 2122 | |
| 2123 cat("\nParsed sample names are:\n", | |
| 2124 "\\begin{quote}\n", | |
| 2125 paste(sample_name_matches, collapse = "\n\n\n"), | |
| 2126 "\n\\end{quote}\n\n") | |
| 2127 | |
| 2128 regex_sample_grouping <- nuke_control_sequences(regex_sample_grouping) | |
| 2129 | |
| 2130 cat("\\leavevmode\n\n\n") | |
| 2131 cat("Parsing rule for SampleGrouping is", | |
| 2132 "\n\n\n", | |
| 2133 "\\text{'", | |
| 2134 regex_sample_grouping, | |
| 2135 "'}\n\n\n", | |
| 2136 sep = "" | |
| 2137 ) | |
| 2138 | |
| 2139 cat("\n\n\n") | |
| 2140 cat("Sample group assignments are:\n", | |
| 2141 "\\begin{quote}\n", | |
| 2142 paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"), | |
| 2143 "\n\\end{quote}\n\n") | |
| 2144 | |
| 2145 } else { | |
| 2146 | |
| 2147 p_value_data_anova_ps <- | |
| 2148 apply( | |
| 2149 quant_data_imp_qn_log, | |
| 2150 1, | |
| 2151 anova_func, | |
| 2152 grouping_factor = sample_treatment_levels, | |
| 2153 one_way_f = one_way_all_categories | |
| 2154 ) | |
| 2155 | |
| 2156 p_value_data_anova_ps_fdr <- | |
| 2157 p.adjust(p_value_data_anova_ps, method = "fdr") | |
| 2158 p_value_data <- data.frame( | |
| 2159 phosphopeptide = full_data[, 1] | |
| 2160 , | |
| 2161 raw_anova_p = p_value_data_anova_ps | |
| 2162 , | |
| 2163 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | |
| 2164 ) | |
| 2165 | |
| 2166 # output ANOVA file to constructed filename, | |
| 2167 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | |
| 2168 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | |
| 2169 | |
| 2170 # Re-output datasets to include p-values | |
| 2171 metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3]) | |
| 2172 write.table( | |
| 2173 cbind(metadata_plus_p, quant_data_imp), | |
| 2174 file = imputed_data_filename, | |
| 2175 sep = "\t", | |
| 2176 col.names = TRUE, | |
| 2177 row.names = FALSE, | |
| 2178 quote = FALSE | |
| 2179 ) | |
| 2180 | |
| 2181 write.table( | |
| 2182 cbind(metadata_plus_p, quant_data_imp_qn_log), | |
| 2183 file = imp_qn_lt_data_filenm, | |
| 2184 sep = "\t", | |
| 2185 col.names = TRUE, | |
| 2186 row.names = FALSE, | |
| 2187 quote = FALSE | |
| 2188 ) | |
| 2189 | |
| 2190 | |
| 2191 p_value_data <- | |
| 2192 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] | |
| 2193 | |
| 2194 first_page_suppress <- 1 | |
| 2195 number_of_peptides_found <- 0 | |
| 2196 cutoff <- val_fdr[1] | |
| 2197 for (cutoff in val_fdr) { | |
| 2198 if (number_of_peptides_found > 49) { | |
| 2199 cat("\\leavevmode\n\n\n") | |
| 2200 break | |
| 2201 } | |
| 2202 | |
| 2203 #loop through FDR cutoffs | |
| 2204 | |
| 2205 filtered_p <- | |
| 2206 p_value_data[ | |
| 2207 which(p_value_data$fdr_adjusted_anova_p < cutoff), | |
| 2208 , drop = FALSE | |
| 2209 ] | |
| 2210 filtered_data_filtered <- | |
| 2211 quant_data_imp_qn_log[ | |
| 2212 rownames(filtered_p), | |
| 2213 , drop = FALSE | |
| 2214 ] | |
| 2215 filtered_data_filtered <- | |
| 2216 filtered_data_filtered[ | |
| 2217 order(filtered_p$fdr_adjusted_anova_p), | |
| 2218 , drop = FALSE | |
| 2219 ] | |
| 2220 | |
| 2221 # <!-- ACE insertion start --> | |
| 2222 | |
| 2223 if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) { | |
| 2224 if (first_page_suppress == 1) { | |
| 2225 first_page_suppress <- 0 | |
| 2226 } else { | |
| 2227 cat("\\newpage\n") | |
| 2228 } | |
| 2229 if (nrow(filtered_data_filtered) > 1) { | |
| 2230 subsection_header(sprintf( | |
| 2231 "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n", | |
| 2232 nrow(filtered_data_filtered), | |
| 2233 cutoff | |
| 2234 )) | |
| 2235 } else { | |
| 2236 subsection_header(sprintf( | |
| 2237 "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n", | |
| 2238 rownames(filtered_data_filtered)[1], | |
| 2239 cutoff | |
| 2240 )) | |
| 2241 } | |
| 2242 cat("\n\n\n") | |
| 2243 cat("\n\n\n") | |
| 2244 | |
| 2245 old_oma <- par("oma") | |
| 2246 old_par <- par( | |
| 2247 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), | |
| 2248 oma = old_oma * c(1, 1, 0.3, 1), | |
| 2249 cex.main = 0.9, | |
| 2250 cex.axis = 0.7, | |
| 2251 fin = c(9, 7.25) | |
| 2252 ) | |
| 2253 # ref: https://r-charts.com/distribution/add-points-boxplot/ | |
| 2254 # Vertical plot | |
| 2255 colnames(filtered_data_filtered) <- sample_name_matches | |
| 2256 tryCatch( | |
| 2257 boxplot( | |
| 2258 filtered_data_filtered, | |
| 2259 main = "Imputed, normalized intensities", # no line plot | |
| 2260 las = 1, | |
| 2261 col = const_boxplot_fill, | |
| 2262 ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") | |
| 2263 ), | |
| 2264 error = function(e) print(e) | |
| 2265 ) | |
| 2266 par(old_par) | |
| 2267 } else { | |
| 2268 cat(sprintf( | |
| 2269 "%s < %0.2f\n\n\n\n\n", | |
| 2270 "No peptides were found to have cutoff adjusted p-value", | |
| 2271 cutoff | |
| 2272 )) | |
| 2273 } | |
| 2274 | |
| 2275 if (nrow(filtered_data_filtered) > 0) { | |
| 2276 # Add Phosphopeptide column to anova_filtered table | |
| 2277 # The assumption here is that the first intensity is unique; | |
| 2278 # this is a hokey assumption but almost definitely will | |
| 2279 # be true in the real world unless there is a computation | |
| 2280 # error upstream. | |
| 2281 anova_filtered_merge <- base::merge( | |
| 2282 x = connect_df, | |
| 2283 y = filtered_data_filtered, | |
| 2284 by.x = "Intensity", | |
| 2285 by.y = 1 | |
| 2286 ) | |
| 2287 anova_filtered_merge_order <- rownames(filtered_p) | |
| 2288 | |
| 2289 anova_filtered <- data.frame( | |
| 2290 ppep = anova_filtered_merge$Phosphopeptide, | |
| 2291 intense = anova_filtered_merge$Intensity, | |
| 2292 data = anova_filtered_merge[, 2:number_of_samples + 1] | |
| 2293 ) | |
| 2294 colnames(anova_filtered) <- | |
| 2295 c("Phosphopeptide", colnames(filtered_data_filtered)) | |
| 2296 | |
| 2297 # Merge qualitative columns into the ANOVA data | |
| 2298 output_table <- data.frame(anova_filtered$Phosphopeptide) | |
| 2299 output_table <- base::merge( | |
| 2300 x = output_table, | |
| 2301 y = data_table_imp_qn_lt, | |
| 2302 by.x = "anova_filtered.Phosphopeptide", | |
| 2303 by.y = "Phosphopeptide" | |
| 2304 ) | |
| 2305 | |
| 2306 # Produce heatmap to visualize significance and the effect of imputation | |
| 2307 | |
| 2308 anova_filtered_merge_format <- sapply( | |
| 2309 X = filtered_p$fdr_adjusted_anova_p | |
| 2310 , | |
| 2311 FUN = function(x) { | |
| 2312 if (x > 0.0001) | |
| 2313 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") | |
| 2314 else | |
| 2315 paste0("(%0.4e) %s") | |
| 2316 } | |
| 2317 ) | |
| 2318 | |
| 2319 cat_hm_heading <- function(m, cutoff) { | |
| 2320 cat("\\newpage\n") | |
| 2321 if (nrow(m) > intensity_hm_rows) { | |
| 2322 subsection_header( | |
| 2323 paste( | |
| 2324 sprintf("Heatmap for the %d most-significant peptides", | |
| 2325 intensity_hm_rows), | |
| 2326 sprintf("whose adjusted p-value < %0.2f\n", cutoff) | |
| 2327 ) | |
| 2328 ) | |
| 2329 } else { | |
| 2330 if (nrow(m) == 1) { | |
| 2331 return(FALSE) | |
| 2332 } else { | |
| 2333 subsection_header( | |
| 2334 paste( | |
| 2335 sprintf("Heatmap for %d usable peptides whose", nrow(m)), | |
| 2336 sprintf("adjusted p-value < %0.2f\n", cutoff) | |
| 2337 ) | |
| 2338 ) | |
| 2339 } | |
| 2340 } | |
| 2341 cat("\n\n\n") | |
| 2342 cat("\n\n\n") | |
| 2343 return(TRUE) | |
| 2344 } | |
| 2345 | |
| 2346 # construct matrix with appropriate rownames | |
| 2347 m <- | |
| 2348 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) | |
| 2349 if (nrow(m) > 0) { | |
| 2350 rownames_m <- rownames(m) | |
| 2351 rownames(m) <- sapply( | |
| 2352 X = seq_len(nrow(m)) | |
| 2353 , | |
| 2354 FUN = function(i) { | |
| 2355 sprintf( | |
| 2356 anova_filtered_merge_format[i], | |
| 2357 filtered_p$fdr_adjusted_anova_p[i], | |
| 2358 rownames_m[i] | |
| 2359 ) | |
| 2360 } | |
| 2361 ) | |
| 2362 } | |
| 2363 # draw the heading and heatmap | |
| 2364 if (nrow(m) > 0) { | |
| 2365 number_of_peptides_found <- | |
| 2366 draw_intensity_heatmap( | |
| 2367 m = m, | |
| 2368 cutoff = cutoff, | |
| 2369 hm_heading_function = cat_hm_heading, | |
| 2370 hm_main_title = "Unimputed, unnormalized log(intensities)", | |
| 2371 suppress_row_dendrogram = FALSE | |
| 2372 ) | |
| 2373 } | |
| 2374 } | |
| 2375 } | |
| 2376 } | |
| 2377 cat("\\leavevmode\n\n\n") | |
| 2378 ``` | |
| 2379 | |
| 2380 ```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 2381 | |
| 2382 if (count_of_treatment_levels > 1) { | |
| 2383 # Prepare two-way contrasts with adjusted p-values | |
| 2384 # Strategy: | |
| 2385 # - use imputed, log-transformed data: | |
| 2386 # - remember this when computing log2(fold-change) | |
| 2387 # - each contrast is between a combination of trt levels | |
| 2388 # - for each contrast, compute samples that are members | |
| 2389 # - compute one-way test: | |
| 2390 # - use `oneway.test` (Welch test) if numbers of samples | |
| 2391 # are not equivalent between trt levels | |
| 2392 # - otherwise, aov is fine but offers no advantage | |
| 2393 # - adjust p-value, assuming that | |
| 2394 # (# of pppeps)*(# of contrasts) tests were performed | |
| 2395 | |
| 2396 # Each contrast is between a combination of trt levels | |
| 2397 m2 <- combn( | |
| 2398 x = seq_len(length(levels(sample_treatment_levels))), | |
| 2399 m = 2, | |
| 2400 simplify = TRUE | |
| 2401 ) | |
| 2402 contrast_count <- ncol(m2) | |
| 2403 | |
| 2404 # For each contrast, compute samples that are members | |
| 2405 # - local function to construct a data.frame for each contrast | |
| 2406 # - the contrast in the first "column" | |
| 2407 f_m2 <- | |
| 2408 function(cntrst, lvl1, lvl2) { | |
| 2409 return( | |
| 2410 data.frame( | |
| 2411 contrast = cntrst, | |
| 2412 level = sample_treatment_levels[ | |
| 2413 sample_treatment_levels %in% | |
| 2414 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
| 2415 ], | |
| 2416 label = sample_name_matches[ | |
| 2417 sample_treatment_levels %in% | |
| 2418 levels(sample_treatment_levels)[c(lvl1, lvl2)] | |
| 2419 ] | |
| 2420 ) | |
| 2421 ) | |
| 2422 } | |
| 2423 # - compute a df for each contrast | |
| 2424 sample_level_dfs <- lapply( | |
| 2425 X = 1:contrast_count, | |
| 2426 FUN = function(i) f_m2(i, m2[1, i], m2[2, i]) | |
| 2427 ) | |
| 2428 # - compute a single df for all contrasts | |
| 2429 combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs) | |
| 2430 | |
| 2431 # - dispose objects to free resources | |
| 2432 rm(sample_level_dfs) | |
| 2433 | |
| 2434 # - write the df to a DB for later join-per-contrast | |
| 2435 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
| 2436 | |
| 2437 RSQLite::dbWriteTable( | |
| 2438 conn = db, | |
| 2439 name = "contrast", | |
| 2440 value = combined_contrast_df, | |
| 2441 overwrite = TRUE | |
| 2442 ) | |
| 2443 | |
| 2444 # Create UK for insert | |
| 2445 ddl_exec(db, " | |
| 2446 CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx | |
| 2447 ON contrast(contrast, label); | |
| 2448 " | |
| 2449 ) | |
| 2450 # Create indexes for join | |
| 2451 ddl_exec(db, " | |
| 2452 -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label | |
| 2453 CREATE INDEX IF NOT EXISTS contrast__label__idx | |
| 2454 ON contrast(label); | |
| 2455 " | |
| 2456 ) | |
| 2457 ddl_exec(db, " | |
| 2458 -- index for joining two contrast_lvl_ppep_avg_quant on contrast | |
| 2459 CREATE INDEX IF NOT EXISTS contrast__contrast__idx | |
| 2460 ON contrast(contrast); | |
| 2461 " | |
| 2462 ) | |
| 2463 ddl_exec(db, " | |
| 2464 -- index for joining two contrast_lvl_ppep_avg_quant on phophospep | |
| 2465 CREATE INDEX IF NOT EXISTS contrast__level__idx | |
| 2466 ON contrast(level); | |
| 2467 " | |
| 2468 ) | |
| 2469 # - dispose objects to free resources | |
| 2470 rm(combined_contrast_df) | |
| 2471 | |
| 2472 # Use imputed, log-transformed data | |
| 2473 # - remember that this was donoe when computing log2(fold-change) | |
| 2474 # - melt data matrix for use in later join-per-contrast | |
| 2475 casted <- cbind( | |
| 2476 data.frame(vrbl = rownames(quant_data_imp_qn_log)), | |
| 2477 quant_data_imp_qn_log | |
| 2478 ) | |
| 2479 quant_data_imp_qn_log_melted <- reshape2::melt( | |
| 2480 casted, | |
| 2481 id.vars = "vrbl" | |
| 2482 ) | |
| 2483 colnames(quant_data_imp_qn_log_melted) <- | |
| 2484 c("phosphopep", "sample", "quant") | |
| 2485 # - dispose objects to free resources | |
| 2486 rm(casted) | |
| 2487 | |
| 2488 # - write the df to a DB for use in later join-per-contrast | |
| 2489 RSQLite::dbWriteTable( | |
| 2490 conn = db, | |
| 2491 name = "ppep_smpl_qnlt", | |
| 2492 value = quant_data_imp_qn_log_melted, | |
| 2493 overwrite = TRUE | |
| 2494 ) | |
| 2495 # Create UK for insert | |
| 2496 ddl_exec(db, " | |
| 2497 CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx | |
| 2498 ON ppep_smpl_qnlt(phosphopep, sample); | |
| 2499 " | |
| 2500 ) | |
| 2501 # Create index for join | |
| 2502 ddl_exec(db, " | |
| 2503 -- index for join in contrast_ppep_smpl_qnlt | |
| 2504 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx | |
| 2505 ON ppep_smpl_qnlt(sample); | |
| 2506 " | |
| 2507 ) | |
| 2508 ddl_exec(db, " | |
| 2509 -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep | |
| 2510 CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx | |
| 2511 ON ppep_smpl_qnlt(phosphopep); | |
| 2512 " | |
| 2513 ) | |
| 2514 # - dispose objects to free resources | |
| 2515 rm(quant_data_imp_qn_log_melted) | |
| 2516 | |
| 2517 # - drop views if exist | |
| 2518 ddl_exec(db, " | |
| 2519 -- drop view dependent on contrast_lvl_ppep_avg_quant | |
| 2520 DROP VIEW IF EXISTS v_contrast_log2_fc; | |
| 2521 " | |
| 2522 ) | |
| 2523 ddl_exec(db, " | |
| 2524 -- drop table dependent on contrast_ppep_smpl_qnlt | |
| 2525 DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant; | |
| 2526 " | |
| 2527 ) | |
| 2528 ddl_exec(db, " | |
| 2529 DROP TABLE IF EXISTS contrast_lvl_lvl_metadata; | |
| 2530 " | |
| 2531 ) | |
| 2532 ddl_exec(db, " | |
| 2533 DROP VIEW IF EXISTS v_contrast_lvl_metadata; | |
| 2534 " | |
| 2535 ) | |
| 2536 ddl_exec(db, " | |
| 2537 -- drop view dependent on contrast_ppep_smpl_qnlt | |
| 2538 DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant; | |
| 2539 " | |
| 2540 ) | |
| 2541 ddl_exec(db, " | |
| 2542 DROP VIEW IF EXISTS v_contrast_lvl_lvl; | |
| 2543 " | |
| 2544 ) | |
| 2545 ddl_exec(db, " | |
| 2546 -- drop view upon which other views depend | |
| 2547 DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt; | |
| 2548 " | |
| 2549 ) | |
| 2550 # - create view | |
| 2551 dml_no_rows_exec(db, " | |
| 2552 -- view contrast_ppep_smpl_qnlt is used for each phopshopep to | |
| 2553 -- compute p-value for test of trt effect for two trt levels | |
| 2554 CREATE VIEW contrast_ppep_smpl_qnlt | |
| 2555 AS | |
| 2556 SELECT contrast, | |
| 2557 level, | |
| 2558 phosphopep, | |
| 2559 sample, | |
| 2560 quant | |
| 2561 FROM contrast AS c, | |
| 2562 ppep_smpl_qnlt AS q | |
| 2563 WHERE q.sample = c.label | |
| 2564 ORDER BY contrast, level, phosphopep | |
| 2565 ; | |
| 2566 " | |
| 2567 ) | |
| 2568 # - create simplification views | |
| 2569 dml_no_rows_exec(db, " | |
| 2570 CREATE VIEW v_contrast_lvl_metadata | |
| 2571 AS | |
| 2572 SELECT contrast, | |
| 2573 level, | |
| 2574 group_concat(label, ';') AS samples | |
| 2575 FROM contrast | |
| 2576 GROUP BY contrast, level | |
| 2577 /* view v_contrast_lvl_metadata is used | |
| 2578 to simplify creation of table contrast_lvl_lvl_metadata */ | |
| 2579 ; | |
| 2580 " | |
| 2581 ) | |
| 2582 dml_no_rows_exec(db, " | |
| 2583 CREATE VIEW v_contrast_lvl_ppep_avg_quant | |
| 2584 AS | |
| 2585 SELECT contrast, | |
| 2586 level, | |
| 2587 phosphopep, | |
| 2588 avg(quant) AS avg_quant | |
| 2589 FROM contrast_ppep_smpl_qnlt | |
| 2590 GROUP BY contrast, level, phosphopep | |
| 2591 /* view v_contrast_lvl_ppep_avg_quant is used | |
| 2592 to simplify view v_contrast_log2_fc */ | |
| 2593 ; | |
| 2594 " | |
| 2595 ) | |
| 2596 | |
| 2597 # - create contrast-metadata table | |
| 2598 dml_no_rows_exec(db, " | |
| 2599 CREATE TABLE contrast_lvl_lvl_metadata | |
| 2600 AS | |
| 2601 SELECT DISTINCT | |
| 2602 a.contrast AS ab_contrast, | |
| 2603 a.level AS a_level, | |
| 2604 b.level AS b_level, | |
| 2605 a.samples AS a_samples, | |
| 2606 b.samples AS b_samples, | |
| 2607 'log2(level_'||a.level||'/level_'||b.level||')' | |
| 2608 AS fc_description | |
| 2609 FROM v_contrast_lvl_metadata AS a, | |
| 2610 v_contrast_lvl_metadata AS b | |
| 2611 WHERE a.contrast = b.contrast | |
| 2612 AND a.level > b.level | |
| 2613 /* view v_contrast_lvl_lvl is used | |
| 2614 to simplify view v_contrast_log2_fc */ | |
| 2615 ; | |
| 2616 " | |
| 2617 ) | |
| 2618 # - create pseudo-materialized view table | |
| 2619 dml_no_rows_exec(db, " | |
| 2620 CREATE VIEW v_contrast_lvl_lvl | |
| 2621 AS | |
| 2622 SELECT DISTINCT | |
| 2623 a.contrast AS ab_contrast, | |
| 2624 a.level AS a_level, | |
| 2625 b.level AS b_level | |
| 2626 FROM contrast AS a, | |
| 2627 contrast AS b | |
| 2628 WHERE a.contrast = b.contrast | |
| 2629 AND a.level > b.level | |
| 2630 /* view v_contrast_lvl_lvl is used | |
| 2631 to simplify view v_contrast_log2_fc */ | |
| 2632 ; | |
| 2633 " | |
| 2634 ) | |
| 2635 | |
| 2636 # - create view to compute log2(fold-change) | |
| 2637 dml_no_rows_exec(db, " | |
| 2638 CREATE VIEW v_contrast_log2_fc | |
| 2639 AS | |
| 2640 SELECT ab.ab_contrast AS contrast, | |
| 2641 m.a_level AS a_level, | |
| 2642 c.avg_quant AS a_quant, | |
| 2643 m.a_samples AS a_samples, | |
| 2644 ab.b_level AS b_level, | |
| 2645 d.avg_quant AS b_quant, | |
| 2646 m.b_samples AS b_samples, | |
| 2647 m.fc_description AS fc_description, | |
| 2648 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc, | |
| 2649 d.phosphopep AS phosphopep | |
| 2650 FROM contrast_lvl_lvl_metadata AS m, | |
| 2651 v_contrast_lvl_ppep_avg_quant AS d, | |
| 2652 v_contrast_lvl_lvl AS ab | |
| 2653 INNER JOIN v_contrast_lvl_ppep_avg_quant AS c | |
| 2654 ON c.contrast = ab.ab_contrast | |
| 2655 AND c.level = ab.a_level | |
| 2656 WHERE d.contrast = ab.ab_contrast | |
| 2657 AND m.ab_contrast = ab.ab_contrast | |
| 2658 AND d.level = ab.b_level | |
| 2659 AND d.phosphopep = c.phosphopep | |
| 2660 /* view to compute log2(fold-change) */ | |
| 2661 ; | |
| 2662 " | |
| 2663 ) | |
| 2664 | |
| 2665 # For each contrast, compute samples that are members | |
| 2666 # compute one-way test: | |
| 2667 # - use `oneway.test` (Welch test) if numbers of samples | |
| 2668 # are not equivalent between trt levels | |
| 2669 # - otherwise, aov is fine but offers no advantage | |
| 2670 for (contrast in contrast_count:2) { | |
| 2671 invisible(contrast) | |
| 2672 } | |
| 2673 for (contrast in 1:contrast_count) { | |
| 2674 contrast_df <- sqldf::sqldf( | |
| 2675 x = paste0(" | |
| 2676 SELECT level, phosphopep, sample, quant | |
| 2677 FROM contrast_ppep_smpl_qnlt | |
| 2678 WHERE contrast = ", contrast, " | |
| 2679 ORDER BY phosphopep, level, sample | |
| 2680 "), | |
| 2681 connection = db | |
| 2682 ) | |
| 2683 contrast_cast <- reshape2::dcast( | |
| 2684 data = contrast_df, | |
| 2685 formula = phosphopep ~ sample, | |
| 2686 value.var = "quant" | |
| 2687 ) | |
| 2688 contrast_cast_ncol <- ncol(contrast_cast) | |
| 2689 contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol] | |
| 2690 contrast_cast_samples <- colnames(contrast_cast_data) | |
| 2691 | |
| 2692 # - order grouping_factor by order of sample columns of contrast_cast_data | |
| 2693 grouping_factor <- sqldf::sqldf( | |
| 2694 x = paste0(" | |
| 2695 SELECT sample, level | |
| 2696 FROM contrast_ppep_smpl_qnlt | |
| 2697 WHERE contrast = ", contrast, " | |
| 2698 ORDER BY phosphopep, level, sample | |
| 2699 LIMIT ", contrast_cast_ncol - 1 | |
| 2700 ), | |
| 2701 connection = db | |
| 2702 ) | |
| 2703 rownames(grouping_factor) <- grouping_factor$sample | |
| 2704 grouping_factor <- grouping_factor[, "level", drop = FALSE] | |
| 2705 | |
| 2706 # - run the two-level (one-way) test | |
| 2707 p_value_data_contrast_ps <- | |
| 2708 apply( | |
| 2709 X = contrast_cast_data, | |
| 2710 MARGIN = 1, # apply to rows | |
| 2711 FUN = anova_func, | |
| 2712 grouping_factor = | |
| 2713 as.factor(as.numeric(grouping_factor$level)), # anova_func arg2 | |
| 2714 one_way_f = one_way_two_categories, # anova_func arg3 | |
| 2715 simplify = TRUE # TRUE is the default for simplify | |
| 2716 ) | |
| 2717 contrast_data_adj_p_values <- p.adjust( | |
| 2718 p = p_value_data_contrast_ps, | |
| 2719 method = "fdr", | |
| 2720 n = length(p_value_data_contrast_ps) # this is the default, length(p) | |
| 2721 ) | |
| 2722 # - compute the fold-change | |
| 2723 contrast_p_df <- | |
| 2724 data.frame( | |
| 2725 contrast = contrast, | |
| 2726 phosphopep = contrast_cast$phosphopep, | |
| 2727 p_value_raw = p_value_data_contrast_ps, | |
| 2728 p_value_adj = contrast_data_adj_p_values | |
| 2729 ) | |
| 2730 db_write_table_overwrite <- (contrast < 2) | |
| 2731 db_write_table_append <- !db_write_table_overwrite | |
| 2732 RSQLite::dbWriteTable( | |
| 2733 conn = db, | |
| 2734 name = "contrast_ppep_p_val", | |
| 2735 value = contrast_p_df, | |
| 2736 append = db_write_table_append | |
| 2737 ) | |
| 2738 # Create UK for insert | |
| 2739 ddl_exec(db, " | |
| 2740 CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx | |
| 2741 ON contrast_ppep_p_val(phosphopep, contrast); | |
| 2742 " | |
| 2743 ) | |
| 2744 } | |
| 2745 # Perhaps this could be done more elegantly using unique keys | |
| 2746 # or creating the tables before saving data to them, but this | |
| 2747 # is fast and, if the database exists on disk rather than in | |
| 2748 # memory, it doesn't stress memory. | |
| 2749 dml_no_rows_exec(db, " | |
| 2750 CREATE TEMP table contrast_log2_fc | |
| 2751 AS | |
| 2752 SELECT * | |
| 2753 FROM v_contrast_log2_fc | |
| 2754 ORDER BY contrast, phosphopep | |
| 2755 ; | |
| 2756 " | |
| 2757 ) | |
| 2758 dml_no_rows_exec(db, " | |
| 2759 CREATE TEMP table ppep_p_val | |
| 2760 AS | |
| 2761 SELECT p_value_raw, | |
| 2762 p_value_adj, | |
| 2763 contrast AS p_val_contrast, | |
| 2764 phosphopep AS p_val_ppep | |
| 2765 FROM contrast_ppep_p_val | |
| 2766 ORDER BY contrast, phosphopep | |
| 2767 ; | |
| 2768 " | |
| 2769 ) | |
| 2770 dml_no_rows_exec(db, " | |
| 2771 DROP TABLE IF EXISTS contrast_log2_fc_p_val | |
| 2772 ; | |
| 2773 " | |
| 2774 ) | |
| 2775 dml_no_rows_exec(db, " | |
| 2776 CREATE TABLE contrast_log2_fc_p_val | |
| 2777 AS | |
| 2778 SELECT a.*, | |
| 2779 b.p_value_raw, | |
| 2780 b.p_value_adj, | |
| 2781 b.p_val_contrast, | |
| 2782 b.p_val_ppep | |
| 2783 FROM contrast_log2_fc a, ppep_p_val b | |
| 2784 WHERE a.rowid = b.rowid | |
| 2785 AND a.phosphopep = b.p_val_ppep | |
| 2786 ; | |
| 2787 " | |
| 2788 ) | |
| 2789 # Create UK | |
| 2790 ddl_exec(db, " | |
| 2791 CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx | |
| 2792 ON contrast_log2_fc_p_val(phosphopep, contrast); | |
| 2793 " | |
| 2794 ) | |
| 2795 # Create indices for future queries | |
| 2796 ddl_exec(db, " | |
| 2797 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx | |
| 2798 ON contrast_log2_fc_p_val(contrast); | |
| 2799 " | |
| 2800 ) | |
| 2801 ddl_exec(db, " | |
| 2802 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx | |
| 2803 ON contrast_log2_fc_p_val(phosphopep); | |
| 2804 " | |
| 2805 ) | |
| 2806 ddl_exec(db, " | |
| 2807 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx | |
| 2808 ON contrast_log2_fc_p_val(p_value_raw); | |
| 2809 " | |
| 2810 ) | |
| 2811 ddl_exec(db, " | |
| 2812 CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx | |
| 2813 ON contrast_log2_fc_p_val(p_value_adj); | |
| 2814 " | |
| 2815 ) | |
| 2816 dml_no_rows_exec(db, " | |
| 2817 DROP VIEW IF EXISTS v_contrast_log2_fc_p_val | |
| 2818 ; | |
| 2819 " | |
| 2820 ) | |
| 2821 dml_no_rows_exec(db, " | |
| 2822 CREATE VIEW v_contrast_log2_fc_p_val | |
| 2823 AS | |
| 2824 SELECT contrast, | |
| 2825 a_level, | |
| 2826 a_samples, | |
| 2827 b_level, | |
| 2828 b_samples, | |
| 2829 a_quant, | |
| 2830 b_quant, | |
| 2831 fc_description, | |
| 2832 log2_fc, | |
| 2833 p_value_raw, | |
| 2834 p_value_adj, | |
| 2835 phosphopep | |
| 2836 FROM contrast_log2_fc_p_val | |
| 2837 ORDER BY contrast, phosphopep | |
| 2838 ; | |
| 2839 " | |
| 2840 ) | |
| 2841 ddl_exec(db, " | |
| 2842 DROP TABLE IF EXISTS kseaapp_metadata | |
| 2843 ; | |
| 2844 " | |
| 2845 ) | |
| 2846 dml_no_rows_exec(db, " | |
| 2847 CREATE TABLE kseaapp_metadata | |
| 2848 AS | |
| 2849 WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS ( | |
| 2850 SELECT DISTINCT | |
| 2851 deppep.seq, | |
| 2852 ppep.seq, | |
| 2853 GeneName||';', | |
| 2854 UniProtID||';', | |
| 2855 PhosphoResidue||';' | |
| 2856 FROM | |
| 2857 ppep, deppep, mrgfltr_metadata | |
| 2858 WHERE | |
| 2859 mrgfltr_metadata.ppep_id = ppep.id | |
| 2860 AND | |
| 2861 ppep.deppep_id = deppep.id | |
| 2862 ) | |
| 2863 SELECT | |
| 2864 ppep AS `ppep`, | |
| 2865 SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`, | |
| 2866 SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`, | |
| 2867 deppep AS `Peptide`, | |
| 2868 REPLACE( | |
| 2869 REPLACE( | |
| 2870 SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ), | |
| 2871 'p', | |
| 2872 '' | |
| 2873 ), | |
| 2874 ', ', | |
| 2875 ';' | |
| 2876 ) AS `Residue.Both` | |
| 2877 FROM extended | |
| 2878 ; | |
| 2879 " | |
| 2880 ) | |
| 2881 # Create indexes for join | |
| 2882 ddl_exec(db, " | |
| 2883 CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx | |
| 2884 ON kseaapp_metadata(ppep); | |
| 2885 " | |
| 2886 ) | |
| 2887 ddl_exec(db, " | |
| 2888 DROP VIEW IF EXISTS v_kseaapp_contrast | |
| 2889 ; | |
| 2890 " | |
| 2891 ) | |
| 2892 dml_no_rows_exec(db, " | |
| 2893 CREATE VIEW v_kseaapp_contrast | |
| 2894 AS | |
| 2895 SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both` | |
| 2896 FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b | |
| 2897 WHERE b.ppep = a.phosphopep | |
| 2898 ; | |
| 2899 " | |
| 2900 ) | |
| 2901 ddl_exec(db, " | |
| 2902 DROP VIEW IF EXISTS v_kseaapp_input | |
| 2903 ; | |
| 2904 " | |
| 2905 ) | |
| 2906 dml_no_rows_exec(db, " | |
| 2907 CREATE VIEW v_kseaapp_input | |
| 2908 AS | |
| 2909 SELECT v.contrast, | |
| 2910 v.phosphopep, | |
| 2911 m.`Protein`, | |
| 2912 m.`Gene`, | |
| 2913 m.`Peptide`, | |
| 2914 m.`Residue.Both`, | |
| 2915 v.p_value_raw AS `p`, | |
| 2916 v.log2_fc AS `FC` | |
| 2917 FROM kseaapp_metadata AS m, | |
| 2918 v_contrast_log2_fc_p_val AS v | |
| 2919 WHERE m.ppep = v.phosphopep | |
| 2920 AND NOT m.`Gene` = 'No_Gene_Name' | |
| 2921 AND NOT v.log2_fc = 0 | |
| 2922 ; | |
| 2923 " | |
| 2924 ) | |
| 2925 } | |
| 2926 ``` | |
| 2927 | |
| 2928 ```{r echo = FALSE, results = 'asis'} | |
| 2929 cat("\\newpage\n") | |
| 2930 ``` | |
| 2931 | |
| 2932 # KSEA Analysis | |
| 2933 | |
| 2934 Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package: | |
| 2935 | |
| 2936 - The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp | |
| 2937 - The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415). | |
| 2938 - An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/). | |
| 2939 | |
| 2940 For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as: | |
| 2941 | |
| 2942 $$ | |
| 2943 \text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j} | |
| 2944 $$ | |
| 2945 | |
| 2946 and fold-enrichment is computed as: | |
| 2947 | |
| 2948 $$ | |
| 2949 \text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j} | |
| 2950 $$ | |
| 2951 | |
| 2952 where: | |
| 2953 | |
| 2954 - $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$, | |
| 2955 - $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and | |
| 2956 - $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$, | |
| 2957 - $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset. | |
| 2958 - Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score. | |
| 2959 | |
| 2960 $\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x) | |
| 2961 | |
| 2962 Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive). | |
| 2963 | |
| 2964 Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`. | |
| 2965 | |
| 2966 - Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis. | |
| 2967 - Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)). | |
| 2968 - Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads). | |
| 2969 | |
| 2970 ```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 2971 | |
| 2972 db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db) | |
| 2973 | |
| 2974 # -- eliminate the table that's about to be defined | |
| 2975 ddl_exec(db, " | |
| 2976 DROP TABLE IF EXISTS site_metadata; | |
| 2977 ") | |
| 2978 | |
| 2979 # -- define the site_metadata table | |
| 2980 ddl_exec(db, " | |
| 2981 CREATE TABLE site_metadata( | |
| 2982 id INTEGER PRIMARY KEY | |
| 2983 , site_type_id INTEGER REFERENCES site_type(id) | |
| 2984 , full TEXT UNIQUE ON CONFLICT IGNORE | |
| 2985 , abbrev TEXT | |
| 2986 , pattern TEXT | |
| 2987 , motif TEXT | |
| 2988 ); | |
| 2989 ") | |
| 2990 | |
| 2991 # -- populate the table with initial values | |
| 2992 ddl_exec(db, " | |
| 2993 INSERT INTO site_metadata(full, abbrev, site_type_id) | |
| 2994 SELECT DISTINCT kinase_map, kinase_map, site_type_id | |
| 2995 FROM ppep_gene_site | |
| 2996 ORDER BY kinase_map; | |
| 2997 ") | |
| 2998 | |
| 2999 # -- drop bogus KSData view if exists | |
| 3000 ddl_exec(db, " | |
| 3001 DROP VIEW IF EXISTS ks_data_v; | |
| 3002 ") | |
| 3003 | |
| 3004 # -- create view to serve as an impostor for KSEAapp::KSData | |
| 3005 ddl_exec(db, " | |
| 3006 CREATE VIEW IF NOT EXISTS ks_data_v | |
| 3007 AS | |
| 3008 SELECT | |
| 3009 'NA' AS KINASE, | |
| 3010 'NA' AS KIN_ACC_ID, | |
| 3011 kinase_map AS GENE, | |
| 3012 'NA' AS KIN_ORGANISM, | |
| 3013 'NA' AS SUBSTRATE, | |
| 3014 0 AS SUB_GENE_ID, | |
| 3015 'NA' AS SUB_ACC_ID, | |
| 3016 gene_names AS SUB_GENE, | |
| 3017 'NA' AS SUB_ORGANISM, | |
| 3018 phospho_peptide AS SUB_MOD_RSD, | |
| 3019 0 AS SITE_GROUP_ID, | |
| 3020 'NA' AS 'SITE_7AA', | |
| 3021 2 AS networkin_score, | |
| 3022 type_name AS Source | |
| 3023 FROM ppep_gene_site_view; | |
| 3024 ") | |
| 3025 | |
| 3026 contrast_metadata_df <- | |
| 3027 sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db) | |
| 3028 rslt <- new_env() | |
| 3029 rslt$score_list <- list() | |
| 3030 rslt$name_list <- list() | |
| 3031 rslt$longname_list <- list() | |
| 3032 | |
| 3033 ddl_exec(db, " | |
| 3034 DROP TABLE IF EXISTS contrast_ksea_scores; | |
| 3035 " | |
| 3036 ) | |
| 3037 | |
| 3038 next_index <- 0 | |
| 3039 err_na_subscr_df_const <- | |
| 3040 "missing values are not allowed in subscripted assignments of data frames" | |
| 3041 | |
| 3042 for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { | |
| 3043 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
| 3044 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
| 3045 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
| 3046 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
| 3047 contrast_longlabel <- ( | |
| 3048 sprintf( | |
| 3049 "Trt %s {%s} -> Trt %s {%s}", | |
| 3050 contrast_metadata_df[i_cntrst, "b_level"], | |
| 3051 gsub( | |
| 3052 pattern = ";", | |
| 3053 replacement = ", ", | |
| 3054 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
| 3055 fixed = TRUE | |
| 3056 ), | |
| 3057 contrast_metadata_df[i_cntrst, "a_level"], | |
| 3058 gsub( | |
| 3059 pattern = ";", | |
| 3060 replacement = ", ", | |
| 3061 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
| 3062 fixed = TRUE | |
| 3063 ) | |
| 3064 ) | |
| 3065 ) | |
| 3066 kseaapp_input <- | |
| 3067 sqldf::sqldf( | |
| 3068 x = sprintf(" | |
| 3069 SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC` | |
| 3070 FROM v_kseaapp_input | |
| 3071 WHERE contrast = %d | |
| 3072 ", | |
| 3073 i_cntrst | |
| 3074 ), | |
| 3075 connection = db | |
| 3076 ) | |
| 3077 | |
| 3078 pseudo_ksdata <- dbReadTable(db, "ks_data_v") | |
| 3079 | |
| 3080 # This hack is because SQL table has the log2-transformed values | |
| 3081 kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE] | |
| 3082 main_title <- ( | |
| 3083 sprintf( | |
| 3084 "Change from treatment %s to treatment %s", | |
| 3085 contrast_metadata_df[i_cntrst, "b_level"], | |
| 3086 contrast_metadata_df[i_cntrst, "a_level"] | |
| 3087 ) | |
| 3088 ) | |
| 3089 sub_title <- contrast_longlabel | |
| 3090 tryCatch( | |
| 3091 expr = { | |
| 3092 ksea_scores_rslt <- | |
| 3093 ksea_scores( | |
| 3094 ksdata = pseudo_ksdata, # KSEAapp::KSData, | |
| 3095 px = kseaapp_input, | |
| 3096 networkin = TRUE, | |
| 3097 networkin_cutoff = 2 | |
| 3098 ) | |
| 3099 | |
| 3100 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
| 3101 next_index <- 1 + next_index | |
| 3102 rslt$score_list[[next_index]] <- ksea_scores_rslt | |
| 3103 rslt$name_list[[next_index]] <- contrast_label | |
| 3104 rslt$longname_list[[next_index]] <- contrast_longlabel | |
| 3105 low_fdr_print( | |
| 3106 rslt = rslt, | |
| 3107 i_cntrst = i_cntrst, | |
| 3108 i = next_index, | |
| 3109 a_level = cntrst_a_level, | |
| 3110 b_level = cntrst_b_level, | |
| 3111 fold_change = cntrst_fold_change, | |
| 3112 caption = contrast_longlabel | |
| 3113 ) | |
| 3114 } | |
| 3115 }, | |
| 3116 error = function(e) str(e) | |
| 3117 ) | |
| 3118 } | |
| 3119 | |
| 3120 plotted_kinases <- NULL | |
| 3121 if (length(rslt$score_list) > 1) { | |
| 3122 for (i in seq_len(length(ksea_heatmap_titles))) { | |
| 3123 hdr <- ksea_heatmap_titles[[i]] | |
| 3124 which_kinases <- i | |
| 3125 | |
| 3126 cat("\\clearpage\n\\begin{center}\n") | |
| 3127 if (i == const_ksea_astrsk_kinases) { | |
| 3128 subsection_header(hdr) | |
| 3129 } else { | |
| 3130 subsection_header(hdr) | |
| 3131 } | |
| 3132 cat("\\end{center}\n") | |
| 3133 | |
| 3134 plotted_kinases <- ksea_heatmap( | |
| 3135 # the data frame outputs from the KSEA.Scores() function, in list format | |
| 3136 score_list = rslt$score_list, | |
| 3137 # a character vector of all the sample names for heatmap annotation: | |
| 3138 # - the names must be in the same order as the data in score_list | |
| 3139 # - please avoid long names, as they may get cropped in the final image | |
| 3140 sample_labels = rslt$name_list, | |
| 3141 # character string of either "p.value" or "FDR" indicating the data column | |
| 3142 # to use for marking statistically significant scores | |
| 3143 stats = c("p.value", "FDR")[2], | |
| 3144 # a numeric value between 0 and infinity indicating the min. number of | |
| 3145 # substrates a kinase must have to be included in the heatmap | |
| 3146 m_cutoff = 1, | |
| 3147 # a numeric value between 0 and 1 indicating the p-value/FDR cutoff | |
| 3148 # for indicating significant kinases in the heatmap | |
| 3149 p_cutoff = 0.05, | |
| 3150 # a binary input of TRUE or FALSE, indicating whether or not to perform | |
| 3151 # hierarchical clustering of the sample columns | |
| 3152 sample_cluster = TRUE, | |
| 3153 # a binary input of TRUE or FALSE, indicating whether or not to export | |
| 3154 # the heatmap as a .png image into the working directory | |
| 3155 export = FALSE, | |
| 3156 # additional arguments to gplots::heatmap.2, such as: | |
| 3157 # - main: main title of plot | |
| 3158 # - xlab: x-axis label | |
| 3159 # - ylab: y-axis label | |
| 3160 xlab = "Contrast", | |
| 3161 ylab = "Kinase", | |
| 3162 # print which kinases: | |
| 3163 # - 1 : all kinases | |
| 3164 # - 2 : significant kinases | |
| 3165 # - 3 : non-significant kinases | |
| 3166 which_kinases = which_kinases | |
| 3167 ) | |
| 3168 cat("\\begin{center}\n") | |
| 3169 cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n") | |
| 3170 cat("\\end{center}\n") | |
| 3171 } # end for (i in ... | |
| 3172 } # end if (length ... | |
| 3173 | |
| 3174 for (i_cntrst in seq_len(length(rslt$score_list))) { | |
| 3175 next_index <- i_cntrst | |
| 3176 cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"] | |
| 3177 cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"] | |
| 3178 cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6] | |
| 3179 contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level) | |
| 3180 contrast_longlabel <- ( | |
| 3181 sprintf( | |
| 3182 "Trt %s {%s} -> Trt %s {%s}", | |
| 3183 contrast_metadata_df[i_cntrst, "b_level"], | |
| 3184 gsub( | |
| 3185 pattern = ";", | |
| 3186 replacement = ", ", | |
| 3187 x = contrast_metadata_df[i_cntrst, "b_samples"], | |
| 3188 fixed = TRUE | |
| 3189 ), | |
| 3190 contrast_metadata_df[i_cntrst, "a_level"], | |
| 3191 gsub( | |
| 3192 pattern = ";", | |
| 3193 replacement = ", ", | |
| 3194 x = contrast_metadata_df[i_cntrst, "a_samples"], | |
| 3195 fixed = TRUE | |
| 3196 ) | |
| 3197 ) | |
| 3198 ) | |
| 3199 main_title <- ( | |
| 3200 sprintf( | |
| 3201 "Change from treatment %s to treatment %s", | |
| 3202 contrast_metadata_df[i_cntrst, "b_level"], | |
| 3203 contrast_metadata_df[i_cntrst, "a_level"] | |
| 3204 ) | |
| 3205 ) | |
| 3206 sub_title <- contrast_longlabel | |
| 3207 tryCatch( | |
| 3208 expr = { | |
| 3209 ksea_scores_rslt <- rslt$score_list[[next_index]] | |
| 3210 | |
| 3211 if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) { | |
| 3212 low_fdr_barplot( | |
| 3213 rslt = rslt, | |
| 3214 i_cntrst = i_cntrst, | |
| 3215 i = next_index, | |
| 3216 a_level = cntrst_a_level, | |
| 3217 b_level = cntrst_b_level, | |
| 3218 fold_change = cntrst_fold_change, | |
| 3219 caption = contrast_longlabel | |
| 3220 ) | |
| 3221 } | |
| 3222 }, | |
| 3223 error = function(e) str(e) | |
| 3224 ) | |
| 3225 } | |
| 3226 ``` | |
| 3227 | |
| 3228 ```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 3229 | |
| 3230 # Use enriched kinases to find enriched kinase-substrate pairs | |
| 3231 enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash)) | |
| 3232 all_enriched_substrates <- sqldf(" | |
| 3233 SELECT | |
| 3234 gene AS kinase, | |
| 3235 ppep, | |
| 3236 '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label | |
| 3237 FROM ( | |
| 3238 SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep | |
| 3239 FROM pseudo_ksdata | |
| 3240 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
| 3241 ) | |
| 3242 GROUP BY ppep | |
| 3243 ") | |
| 3244 | |
| 3245 # helper used to label per-kinase substrate enrichment figure | |
| 3246 cat_enriched_heading <- function(m, cut_args) { | |
| 3247 cutoff <- cut_args$cutoff | |
| 3248 kinase <- cut_args$kinase | |
| 3249 statistic <- cut_args$statistic | |
| 3250 threshold <- cut_args$threshold | |
| 3251 cat("\\newpage\n") | |
| 3252 if (nrow(m) > intensity_hm_rows) { | |
| 3253 subsection_header( | |
| 3254 paste( | |
| 3255 sprintf( | |
| 3256 "Lowest p-valued %d (of %d) enriched %s-substrates,", | |
| 3257 intensity_hm_rows, | |
| 3258 nrow(m), | |
| 3259 kinase | |
| 3260 ), | |
| 3261 sprintf(" KSEA %s < %0.2f\n", statistic, threshold) | |
| 3262 ) | |
| 3263 ) | |
| 3264 } else { | |
| 3265 if (nrow(m) == 1) { | |
| 3266 return(FALSE) | |
| 3267 } else { | |
| 3268 subsection_header( | |
| 3269 paste( | |
| 3270 sprintf( | |
| 3271 "%d enriched %s-substrates,", | |
| 3272 nrow(m), | |
| 3273 kinase | |
| 3274 ), | |
| 3275 sprintf( | |
| 3276 " KSEA %s < %0.2f\n", | |
| 3277 statistic, | |
| 3278 threshold | |
| 3279 ) | |
| 3280 ) | |
| 3281 ) | |
| 3282 } | |
| 3283 } | |
| 3284 cat("\n\n\n") | |
| 3285 cat("\n\n\n") | |
| 3286 return(TRUE) | |
| 3287 } | |
| 3288 | |
| 3289 # Disabling heatmaps for substrates pending decision whether to eliminate them altogether | |
| 3290 if (FALSE) | |
| 3291 for (kinase_name in sort(enriched_kinases$kinase)) { | |
| 3292 enriched_substrates <- | |
| 3293 all_enriched_substrates[ | |
| 3294 all_enriched_substrates$kinase == kinase_name, | |
| 3295 , | |
| 3296 drop = FALSE | |
| 3297 ] | |
| 3298 # Get the intensity values for the heatmap | |
| 3299 enriched_intensities <- | |
| 3300 as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE]) | |
| 3301 # Remove rows having too many NA values to be relevant | |
| 3302 na_counter <- is.na(enriched_intensities) | |
| 3303 na_counts <- apply(na_counter, 1, sum) | |
| 3304 enriched_intensities <- | |
| 3305 enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE] | |
| 3306 # Rename the rows with the display-name for the heatmap | |
| 3307 rownames(enriched_intensities) <- | |
| 3308 sapply( | |
| 3309 X = rownames(enriched_intensities), | |
| 3310 FUN = function(rn) { | |
| 3311 enriched_substrates[enriched_substrates$ppep == rn, "label"] | |
| 3312 } | |
| 3313 ) | |
| 3314 # Format as matrix for heatmap | |
| 3315 m <- as.matrix(enriched_intensities) | |
| 3316 # Draw the heading and heatmap | |
| 3317 if (nrow(m) > 0) { | |
| 3318 cut_args <- new_env() | |
| 3319 cut_args$cutoff <- cutoff | |
| 3320 cut_args$kinase <- kinase_name | |
| 3321 cut_args$statistic <- ksea_cutoff_statistic | |
| 3322 cut_args$threshold <- ksea_cutoff_threshold | |
| 3323 number_of_peptides_found <- | |
| 3324 draw_intensity_heatmap( | |
| 3325 m = m, | |
| 3326 cutoff = cut_args, | |
| 3327 hm_heading_function = cat_enriched_heading, | |
| 3328 hm_main_title | |
| 3329 = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates", | |
| 3330 suppress_row_dendrogram = FALSE | |
| 3331 ) | |
| 3332 } | |
| 3333 } | |
| 3334 | |
| 3335 # Write output tabular files | |
| 3336 | |
| 3337 # get kinase, ppep, concat(kinase) tuples for enriched kinases | |
| 3338 | |
| 3339 kinase_ppep_label <- sqldf(" | |
| 3340 WITH | |
| 3341 t(ppep, label) AS | |
| 3342 ( | |
| 3343 SELECT DISTINCT | |
| 3344 SUB_MOD_RSD AS ppep, | |
| 3345 group_concat(gene, '; ') AS label | |
| 3346 FROM pseudo_ksdata | |
| 3347 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
| 3348 GROUP BY ppep | |
| 3349 ), | |
| 3350 k(kinase, ppep_join) AS | |
| 3351 ( | |
| 3352 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join | |
| 3353 FROM pseudo_ksdata | |
| 3354 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
| 3355 ) | |
| 3356 SELECT k.kinase, t.ppep, t.label | |
| 3357 FROM t, k | |
| 3358 WHERE t.ppep = k.ppep_join | |
| 3359 ORDER BY k.kinase, t.ppep | |
| 3360 ") | |
| 3361 | |
| 3362 # extract what we need from full_data | |
| 3363 impish <- cbind(rownames(quant_data_imp), quant_data_imp) | |
| 3364 colnames(impish)[1] <- "Phosphopeptide" | |
| 3365 data_table_imputed_sql <- " | |
| 3366 SELECT | |
| 3367 f.*, | |
| 3368 k.label AS KSEA_enrichments, | |
| 3369 q.* | |
| 3370 FROM | |
| 3371 metadata_plus_p f | |
| 3372 LEFT JOIN kinase_ppep_label k | |
| 3373 ON f.Phosphopeptide = k.ppep, | |
| 3374 impish q | |
| 3375 WHERE | |
| 3376 f.Phosphopeptide = q.Phosphopeptide | |
| 3377 " | |
| 3378 data_table_imputed <- sqldf(data_table_imputed_sql) | |
| 3379 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
| 3380 data_table_imputed <- | |
| 3381 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
| 3382 | |
| 3383 # Output with imputed, un-normalized data | |
| 3384 | |
| 3385 write.table( | |
| 3386 data_table_imputed | |
| 3387 , file = imputed_data_filename | |
| 3388 , sep = "\t" | |
| 3389 , col.names = TRUE | |
| 3390 , row.names = FALSE | |
| 3391 , quote = FALSE | |
| 3392 ) | |
| 3393 | |
| 3394 | |
| 3395 #output quantile normalized data | |
| 3396 impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log) | |
| 3397 colnames(impish)[1] <- "Phosphopeptide" | |
| 3398 data_table_imputed <- sqldf(data_table_imputed_sql) | |
| 3399 # Zap the duplicated 'Phosphopeptide' column named 'ppep' | |
| 3400 data_table_imputed <- | |
| 3401 data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))] | |
| 3402 write.table( | |
| 3403 data_table_imputed, | |
| 3404 file = imp_qn_lt_data_filenm, | |
| 3405 sep = "\t", | |
| 3406 col.names = TRUE, | |
| 3407 row.names = FALSE, | |
| 3408 quote = FALSE | |
| 3409 ) | |
| 3410 | |
| 3411 ppep_kinase <- sqldf(" | |
| 3412 SELECT DISTINCT k.ppep, k.kinase | |
| 3413 FROM ( | |
| 3414 SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep | |
| 3415 FROM pseudo_ksdata | |
| 3416 WHERE GENE IN (SELECT kinase FROM enriched_kinases) | |
| 3417 ) k | |
| 3418 ORDER BY k.ppep, k.kinase | |
| 3419 ") | |
| 3420 | |
| 3421 RSQLite::dbWriteTable( | |
| 3422 conn = db, | |
| 3423 name = "ksea_enriched_ks", | |
| 3424 value = ppep_kinase, | |
| 3425 append = FALSE | |
| 3426 ) | |
| 3427 | |
| 3428 RSQLite::dbWriteTable( | |
| 3429 conn = db, | |
| 3430 name = "anova_signif", | |
| 3431 value = p_value_data, | |
| 3432 append = FALSE | |
| 3433 ) | |
| 3434 | |
| 3435 ddl_exec(db, " | |
| 3436 DROP VIEW IF EXISTS stats_metadata_v; | |
| 3437 " | |
| 3438 ) | |
| 3439 dml_no_rows_exec(db, " | |
| 3440 CREATE VIEW stats_metadata_v | |
| 3441 AS | |
| 3442 SELECT DISTINCT m.*, | |
| 3443 p.raw_anova_p, | |
| 3444 p.fdr_adjusted_anova_p, | |
| 3445 kek.kinase AS ksea_enrichments | |
| 3446 FROM | |
| 3447 mrgfltr_metadata_view m | |
| 3448 LEFT JOIN anova_signif p | |
| 3449 ON m.phospho_peptide = p.phosphopeptide | |
| 3450 LEFT JOIN ksea_enriched_ks kek | |
| 3451 ON m.phospho_peptide = kek.ppep | |
| 3452 ; | |
| 3453 " | |
| 3454 ) | |
| 3455 | |
| 3456 write.table( | |
| 3457 dbReadTable(db, "stats_metadata_v"), | |
| 3458 file = anova_ksea_mtdt_file, | |
| 3459 sep = "\t", | |
| 3460 col.names = TRUE, | |
| 3461 row.names = FALSE, | |
| 3462 quote = FALSE | |
| 3463 ) | |
| 3464 | |
| 3465 | |
| 3466 ``` | |
| 3467 | |
| 3468 ```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'} | |
| 3469 cat("\\leavevmode\n\n\n") | |
| 3470 | |
| 3471 # write parameters to report | |
| 3472 | |
| 3473 param_unlist <- unlist(as.list(params)) | |
| 3474 param_df <- data.frame( | |
| 3475 parameter = paste0("\\verb@", names(param_unlist), "@"), | |
| 3476 value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@") | |
| 3477 ) | |
| 3478 | |
| 3479 data_frame_latex( | |
| 3480 x = param_df, | |
| 3481 justification = "p{0.35\\linewidth} p{0.6\\linewidth}", | |
| 3482 centered = TRUE, | |
| 3483 caption = "Input parameters", | |
| 3484 anchor = const_table_anchor_bp, | |
| 3485 underscore_whack = FALSE | |
| 3486 ) | |
| 3487 | |
| 3488 # write parameters to SQLite output | |
| 3489 | |
| 3490 mqppep_anova_script_param_df <- data.frame( | |
| 3491 script = "mqppep_anova_script.Rmd", | |
| 3492 parameter = names(param_unlist), | |
| 3493 value = param_unlist | |
| 3494 ) | |
| 3495 ddl_exec(db, " | |
| 3496 DROP TABLE IF EXISTS script_parameter; | |
| 3497 " | |
| 3498 ) | |
| 3499 ddl_exec(db, " | |
| 3500 CREATE TABLE IF NOT EXISTS script_parameter( | |
| 3501 script TEXT, | |
| 3502 parameter TEXT, | |
| 3503 value ANY, | |
| 3504 UNIQUE (script, parameter) ON CONFLICT REPLACE | |
| 3505 ) | |
| 3506 ; | |
| 3507 " | |
| 3508 ) | |
| 3509 RSQLite::dbWriteTable( | |
| 3510 conn = db, | |
| 3511 name = "script_parameter", | |
| 3512 value = mqppep_anova_script_param_df, | |
| 3513 append = TRUE | |
| 3514 ) | |
| 3515 | |
| 3516 # We are done with output | |
| 3517 RSQLite::dbDisconnect(db) | |
| 3518 ``` | |
| 3519 <!-- | |
| 3520 There's gotta be a better way... | |
| 3521 | |
| 3522 loaded_packages_df <- sessioninfo::package_info("loaded") | |
| 3523 loaded_packages_df[, "library"] <- as.character(loaded_packages_df$library) | |
| 3524 loaded_packages_df <- data.frame( | |
| 3525 package = loaded_packages_df$package, | |
| 3526 version = loaded_packages_df$loadedversion, | |
| 3527 date = loaded_packages_df$date | |
| 3528 ) | |
| 3529 data_frame_latex( | |
| 3530 x = loaded_packages_df, | |
| 3531 justification = "l | l l", | |
| 3532 centered = FALSE, | |
| 3533 caption = "Loaded R packages", | |
| 3534 anchor = const_table_anchor_bp | |
| 3535 ) | |
| 3536 --> |
