# HG changeset patch
# User eschen42
# Date 1656605792 0
# Node ID 61adb8801b73bb72c0d592db7326190dd2b0df28
# Parent f3deca1a3c84ebee61b14a271ea59d24a97d6890
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 0c7ca054e77e042c8a584c9903073da064df7d8b
diff -r f3deca1a3c84 -r 61adb8801b73 macros.xml
--- a/macros.xml Wed Apr 13 19:48:32 2022 +0000
+++ b/macros.xml Thu Jun 30 16:16:32 2022 +0000
@@ -1,17 +1,30 @@
- 0.1.12
+ 0.1.13
0
- r-sass
- r-data.table
bioconductor-preprocesscore
- perl
+ numpy
openblas
- numpy
+ pandas
+ perl-dbd-sqlite
+ perl
+ pyahocorasick
+ python
+ r-data.table
+ r-dbi
r-ggplot2
+ r-gplots
+ r-latex2exp
+ r-optparse
+ r-reshape2
+ r-rmarkdown
+ r-rsqlite
+ r-sass
+ r-sqldf
+ r-stringr
+ r-tinytex
r-vioplot
- r-tinytex
- python
- r-latex2exp
- r-stringr
- perl-dbd-sqlite
- pyahocorasick
- r-rmarkdown
- pandas
- r-optparse
+
diff -r f3deca1a3c84 -r 61adb8801b73 mqppep_anova.R
--- a/mqppep_anova.R Wed Apr 13 19:48:32 2022 +0000
+++ b/mqppep_anova.R Thu Jun 30 16:16:32 2022 +0000
@@ -3,12 +3,6 @@
library(optparse)
library(data.table)
library(stringr)
-# bioconductor-preprocesscore
-# - libopenblas
-# - r-data.table
-# - r-rmarkdown
-# - r-ggplot2
-# - texlive-core
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
@@ -30,6 +24,20 @@
" path to text file having one column and no header")
),
make_option(
+ c("-S", "--preproc_sqlite"),
+ action = "store",
+ default = NA,
+ type = "character",
+ help = "Path to 'preproc_sqlite' produced by `mqppep_mrgfltr.py`"
+ ),
+ make_option(
+ c("-K", "--ksea_sqlite"),
+ action = "store",
+ default = NA,
+ type = "character",
+ help = "Path to 'ksea_sqlite' output produced by this tool"
+ ),
+ make_option(
c("-f", "--firstDataColumn"),
action = "store",
default = "^Intensity[^_]",
@@ -99,6 +107,28 @@
default = "QuantDataProcessingScript.html",
type = "character",
help = "HTML report file path"
+ ),
+ make_option(
+ c("-k", "--ksea_cutoff_statistic"),
+ action = "store",
+ default = "FDR",
+ type = "character",
+ help = paste0("Method for missing-value imputation,",
+ " one of c('FDR','p.value'), but don't expect 'p.value' to work well.")
+ ),
+ make_option(
+ c("-t", "--ksea_cutoff_threshold"),
+ action = "store",
+ default = 0.05,
+ type = "double",
+ help = paste0("Maximum score to be used to score a kinase enrichment as significant")
+ ),
+ make_option(
+ c("-M", "--anova_ksea_metadata"),
+ action = "store",
+ default = "anova_ksea_metadata.tsv",
+ type = "character",
+ help = "Phosphopeptide metadata, ANOVA FDR, and KSEA enribhments"
)
)
args <- parse_args(OptionParser(option_list = option_list))
@@ -112,18 +142,27 @@
}
input_file <- args$inputFile
alpha_file <- args$alphaFile
+preproc_sqlite <- args$preproc_sqlite
imputed_data_file_name <- args$imputedDataFile
imp_qn_lt_data_filenm <- args$imputedQNLTDataFile
+anova_ksea_metadata <- args$anova_ksea_metadata
report_file_name <- args$reportFile
+ksea_sqlite <- args$ksea_sqlite
+ksea_cutoff_statistic <- args$ksea_cutoff_statistic
+ksea_cutoff_threshold <- args$ksea_cutoff_threshold
+if (
+ sum(
+ grepl(
+ pattern = ksea_cutoff_statistic,
+ x = c("FDR", "p.value")
+ )
+ ) < 1
+ ) {
+ print(sprintf("bad ksea_cutoff_statistic argument: %s", ksea_cutoff_statistic))
+ return(-1)
+ }
imputation_method <- args$imputationMethod
-print(
- grepl(
- pattern = imputation_method,
- x = c("group-median", "median", "mean", "random")
- )
- )
-
if (
sum(
grepl(
@@ -219,6 +258,7 @@
rmarkdown_params <- list(
inputFile = input_file
, alphaFile = alpha_file
+ , preprocDb = preproc_sqlite
, firstDataColumn = first_data_column
, imputationMethod = imputation_method
, meanPercentile = mean_percentile
@@ -227,6 +267,10 @@
, regexSampleGrouping = regex_sample_grouping
, imputedDataFilename = imputed_data_file_name
, imputedQNLTDataFile = imp_qn_lt_data_filenm
+ , anovaKseaMetadata = anova_ksea_metadata
+ , kseaAppPrepDb = ksea_sqlite
+ , kseaCutoffThreshold = ksea_cutoff_threshold
+ , kseaCutoffStatistic = ksea_cutoff_statistic
)
print("rmarkdown_params")
diff -r f3deca1a3c84 -r 61adb8801b73 mqppep_anova.xml
--- a/mqppep_anova.xml Wed Apr 13 19:48:32 2022 +0000
+++ b/mqppep_anova.xml Thu Jun 30 16:16:32 2022 +0000
@@ -2,13 +2,25 @@
id="mqppep_anova"
name="MaxQuant Phosphopeptide ANOVA"
version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@"
- python_template_version="3.5"
profile="21.05"
>
- Perform ANOVA on merged and filtered data from phospho-peptide enrichment/MaxQuant pipeline
+ Runs ANOVA and KSEA for phosphopeptides.
macros.xml
+
+ topic_0121
+ topic_3520
+
+
+ operation_0276
+ operation_0531
+ operation_2938
+ operation_2938
+ operation_3435
+ operation_3501
+ operation_3658
+
@@ -48,19 +63,22 @@
-
+
@@ -73,16 +91,16 @@
@@ -91,7 +109,7 @@
@@ -99,34 +117,106 @@
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
@@ -134,20 +224,29 @@
+
10.3791/57996
+
+ 10.1093/bioinformatics/btx415
diff -r f3deca1a3c84 -r 61adb8801b73 mqppep_anova_script.Rmd
--- a/mqppep_anova_script.Rmd Wed Apr 13 19:48:32 2022 +0000
+++ b/mqppep_anova_script.Rmd Thu Jun 30 16:16:32 2022 +0000
@@ -1,40 +1,100 @@
---
-title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA"
-author: "Larry Cheng; Art Eschenlauer"
-date: "May 28, 2018; Mar 16, 2022"
+title: "MaxQuant Phosphoproteomic Enrichment Pipeline ANOVA/KSEA"
+author:
+- "Nick Graham^[ORCiD 0000-0002-6811-1941, University of Southern California: Los Angeles, CA, US]"
+- "Larry Cheng^[ORCiD 0000-0002-6922-6433, Rutgers School of Graduate Studies: New Brunswick, NJ, US]"
+- "Art Eschenlauer^[ORCiD 0000-0002-2882-0508, University of Minnesota: Minneapolis, Minnesota, US]"
+date:
+- "May 28, 2018"
+- "; revised June 23, 2022"
output:
pdf_document:
toc: true
- latex_document:
- toc: true
+ toc_depth: 3
+ keep_tex: true
+header-includes:
+ - \usepackage{longtable}
+ - \newcommand\T{\rule{0pt}{2.6ex}} % Top strut
+ - \newcommand\B{\rule[-1.2ex]{0pt}{0pt}} % Bottom strut
params:
- alphaFile: "test-data/alpha_levels.tabular"
- inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular"
- firstDataColumn: "^Intensity[^_]"
- imputationMethod: !r c("group-median", "median", "mean", "random")[4]
- meanPercentile: 1
- sdPercentile: 1.0
- regexSampleNames: "\\.\\d+[A-Z]$"
- regexSampleGrouping: "\\d+"
- imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
- imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
- show_toc: true
+ alphaFile: "test-data/alpha_levels.tabular"
+ inputFile: "test-data/test_input_for_anova.tabular"
+ preprocDb: "test-data/test_input_for_anova.sqlite"
+ kseaAppPrepDb: !r c(":memory:", "test-data/mqppep.sqlite")[2]
+ show_toc: true
+ firstDataColumn: "^Intensity[^_]"
+ imputationMethod: !r c("group-median", "median", "mean", "random")[1]
+ meanPercentile: 1
+ sdPercentile: 1.0
+ regexSampleNames: "\\.\\d+[A-Z]$"
+ regexSampleGrouping: "\\d+"
+ imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
+ imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
+ anovaKseaMetadata: "test-data/limbo/anovaKseaMetadata.txt"
+ oneWayManyCategories: !r c("aov", "kruskal.test", "oneway.test")[1]
+ oneWayTwoCategories: !r c("aov", "kruskal.test", "oneway.test")[3]
+ kseaCutoffStatistic: !r c("p.value", "FDR")[2]
+ kseaCutoffThreshold: !r c( 0.1, 0.05)[2]
+ kseaMinKinaseCount: 1
+ intensityHeatmapRows: 75
---
```{r setup, include = FALSE}
+#ref for debugging: https://yihui.org/tinytex/r/#debugging
+options(tinytex.verbose = TRUE)
+
# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+# ref for top and bottom struts: https://tex.stackexchange.com/a/50355
knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
# freeze the random number generator so the same results will be produced
# from run to run
set.seed(28571)
+### LIBRARIES
+library(gplots)
+library(DBI)
+library(RSQLite)
+# Suppress "Warning: no DISPLAY variable so Tk is not available"
+suppressWarnings(suppressMessages(library(sqldf)))
+
+# required but not added to search list:
+# - DBI
+# - RSQLite
+# - ggplot2
+# - knitr
+# - latex2exp
+# - preprocessCore
+# - reshape2
+# - vioplot
+
### CONSTANTS
const_parfin <- par("fin")
@@ -44,20 +104,54 @@
sqrt(const_stripchart_cex * const_stripchart_cex / 2)
const_stripchart_jitter <- 0.3
const_write_debug_files <- FALSE
-const_table_anchor <- "tbp"
+const_table_anchor_bp <- "bp"
+const_table_anchor_ht <- "ht"
+const_table_anchor_p <- "p"
+const_table_anchor_tbp <- "tbp"
+
+
+const_ksea_astrsk_kinases <- 1
+const_ksea_nonastrsk_kinases <- 2
+const_ksea_all_kinases <- 3
+
+const_log10_e <- log10(exp(1))
### FUNCTIONS
-#ANOVA filter function
-anova_func <- function(x, grouping_factor) {
- x_aov <- aov(as.numeric(x) ~ grouping_factor)
- pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1]
- pvalue
+# from `demo(error.catching)`
+##' Catch *and* save both errors and warnings, and in the case of
+##' a warning, also keep the computed result.
+##'
+##' @title tryCatch both warnings (with value) and errors
+##' @param expr an \R expression to evaluate
+##' @return a list with 'value' and 'warning', where
+##' 'value' may be an error caught.
+##' @author Martin Maechler;
+##' Copyright (C) 2010-2012 The R Core Team
+try_catch_w_e <- function(expr) {
+ wrn <- NULL
+ # warning handler
+ w_handler <- function(w) {
+ wrn <<- w
+ invokeRestart("muffleWarning")
+ }
+ list(
+ value = withCallingHandlers(
+ tryCatch(
+ expr,
+ error = function(e) e
+ ),
+ warning = w_handler
+ ),
+ warning = wrn
+ )
}
+
write_debug_file <- function(s) {
if (const_write_debug_files) {
s_path <- sprintf("test-data/%s.txt", deparse(substitute(s)))
+ print(sprintf("DEBUG writing file %s", spath))
write.table(
s,
file = s_path,
@@ -69,28 +163,72 @@
}
}
-latex_collapsed_vector <- function(collapse_string, v) {
+# ref: http://adv-r.had.co.nz/Environments.html
+# "When creating your own environment, note that you should set its parent
+# environment to be the empty environment. This ensures you don't
+# accidentally inherit objects from somewhere else."
+# Caution: this prevents `with(my_env, expr)` from working when `expr`
+# contains anything from the global environment, even operators!
+# Hence, `x <- 1; get("x", new_env())` fails by design.
+new_env <- function() {
+ new.env(parent = emptyenv())
+}
+
+### numerical/statistical helper functions
+
+any_nan <- function(x) {
+ !any(x == "NaN")
+}
+
+# determine standard deviation of quantile to impute
+sd_finite <- function(x) {
+ ok <- is.finite(x)
+ sd(x[ok])
+}
+
+anova_func <- function(x, grouping_factor, one_way_f) {
+ subject <- data.frame(
+ intensity = x
+ )
+ x_aov <-
+ one_way_f(
+ formula = intensity ~ grouping_factor,
+ data = subject
+ )
+ pvalue <-
+ if (identical(one_way_f, aov))
+ summary(x_aov)[[1]][["Pr(>F)"]][1]
+ else
+ pvalue <- x_aov$p.value
+ pvalue
+}
+
+
+### LaTeX functions
+
+latex_collapsed_vector <- function(collapse_string, v, underscore_whack = TRUE) {
+ v_sub <- if (underscore_whack) gsub("_", "\\\\_", v) else v
cat(
paste0(
- gsub("_", "\\\\_", v),
+ v_sub,
collapse = collapse_string
)
)
}
-latex_itemized_collapsed <- function(collapse_string, v) {
+latex_itemized_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
cat("\\begin{itemize}\n\\item ")
- latex_collapsed_vector(collapse_string, v)
+ latex_collapsed_vector(collapse_string, v, underscore_whack)
cat("\n\\end{itemize}\n")
}
-latex_itemized_list <- function(v) {
- latex_itemized_collapsed("\n\\item ", v)
+latex_itemized_list <- function(v, underscore_whack = TRUE) {
+ latex_itemized_collapsed("\n\\item ", v, underscore_whack)
}
-latex_enumerated_collapsed <- function(collapse_string, v) {
+latex_enumerated_collapsed <- function(collapse_string, v, underscore_whack = TRUE) {
cat("\\begin{enumerate}\n\\item ")
- latex_collapsed_vector(collapse_string, v)
+ latex_collapsed_vector(collapse_string, v, underscore_whack)
cat("\n\\end{enumerate}\n")
}
@@ -98,8 +236,9 @@
latex_enumerated_collapsed("\n\\item ", v)
}
-latex_table_row <- function(v) {
- latex_collapsed_vector(" & ", v)
+latex_table_row <- function(v, extra = "", underscore_whack = TRUE) {
+ latex_collapsed_vector(" & ", v, underscore_whack)
+ cat(extra)
cat(" \\\\\n")
}
@@ -118,10 +257,12 @@
justification = NULL,
# TRUE to center on page
centered = TRUE,
- # optional capttion
+ # optional caption
caption = NULL,
# h(inline); b(bottom); t (top) or p (separate page)
- anchor = "h"
+ anchor = "h",
+ # set underscore_whack to TRUE to escape underscores
+ underscore_whack = TRUE
) {
if (is.null(justification))
justification <-
@@ -144,12 +285,13 @@
"\n",
sep = ""
)
- }
- else if (n == 0L) {
+ } else if (n == 0L) {
cat("0 rows for:\n")
- latex_itemized_list(names(x))
- }
- else {
+ latex_itemized_list(
+ v = names(x),
+ underscore_whack = underscore_whack
+ )
+ } else {
if (is.null(max))
max <- getOption("max.print", 99999L)
if (!is.finite(max))
@@ -178,12 +320,22 @@
sep = ""
)
)
+ # ref: https://tex.stackexchange.com/a/50353
+ # Describes use of \rule{0pt}{3ex}
if (!is.null(caption))
- cat(" \\hline\\hline\n")
- latex_table_row(colnames(m))
+ cat("\\B \\\\ \\hline\\hline\n")
+ # ref for top and bottom struts: https://tex.stackexchange.com/a/50355
+ latex_table_row(
+ v = colnames(m),
+ extra = "\\T\\B",
+ underscore_whack = underscore_whack
+ )
cat("\\hline\n")
for (i in seq_len(length(m[, 1]))) {
- latex_table_row(m[i, ])
+ latex_table_row(
+ v = m[i, ],
+ underscore_whack = underscore_whack
+ )
}
cat(
paste(
@@ -199,13 +351,835 @@
invisible(x)
}
+hypersub <-
+ function(s) {
+ hyper <- tolower(s)
+ hyper <- gsub("[^a-z0-9]+", "-", hyper)
+ hyper <- gsub("[-]+", "-", hyper)
+ hyper <- sub("^[-]", "", hyper)
+ hyper <- sub("[-]$", "", hyper)
+ return(hyper)
+ }
+
+subsection_header <-
+ function(s) {
+ hyper <- hypersub(s)
+ cat(
+ sprintf(
+ "\\hypertarget{%s}\n{\\subsection{%s}\\label{%s}}\n",
+ hyper, s, hyper
+ )
+ )
+ }
+
+subsubsection_header <-
+ function(s) {
+ hyper <- hypersub(s)
+ cat(
+ sprintf(
+ "\\hypertarget{%s}\n{\\subsubsection{%s}\\label{%s}}\n",
+ hyper, s, hyper
+ )
+ )
+ }
+
+### SQLite functions
+
+ddl_exec <- function(db, sql) {
+ discard <- DBI::dbExecute(conn = db, statement = sql)
+ if (FALSE && discard != 0) {
+ need_newpage <- TRUE
+ if (need_newpage) {
+ need_newpage <<- FALSE
+ cat("\\newpage\n")
+ }
+ o_file <- stdout()
+ cat("\n\\begin{verbatim}\n")
+ cat(sql, file = o_file)
+ cat(sprintf("\n%d rows affected by DDL\n", discard), file = o_file)
+ cat("\n\\end{verbatim}\n")
+ }
+}
+
+dml_no_rows_exec <- function(db, sql) {
+ discard <- DBI::dbExecute(conn = db, statement = sql)
+ if (discard != 0) {
+ need_newpage <- TRUE
+ if (need_newpage) {
+ need_newpage <<- FALSE
+ cat("\\newpage\n")
+ }
+ cat("\n\\begin{verbatim}\n")
+ o_file <- stdout()
+ cat(sql, file = o_file)
+ cat(sprintf("\n%d rows affected by DML\n", discard), file = o_file)
+ cat("\n\\end{verbatim}\n")
+ }
+}
+
+### KSEA functions and helpers
+
+# Adapted from KSEAapp::KSEA.Scores to allow retrieval of:
+# - maximum log2(FC)
+ksea_scores <- function(
+
+ # For human data, typically, ksdata = KSEAapp::ksdata
+ ksdata,
+
+ # Input data file having columns:
+ # - Protein : abbreviated protein name
+ # - Gene : HUGO gene name
+ # - Peptide : peptide sequence without indications of phosphorylation
+ # - Reside.Both : position(s) of phosphorylation within Gene sequence
+ # - First letter designates AA that is modified
+ # - Numbers indicate position within Gene
+ # - Multiple values are separated by semicolons
+ # - p : p-value
+ # - FC : fold-change
+ px,
+
+ # A binary input of TRUE or FALSE, indicating whether or not to include
+ # NetworKIN predictions
+ networkin,
+
+ # A numeric value between 1 and infinity setting the minimum NetworKIN
+ # score (can be left out if networkin = FALSE)
+ networkin_cutoff
+
+) {
+ if (length(grep(";", px$Residue.Both)) == 0) {
+ # There are no Residue.Both entries having semicolons, so new is
+ # simply px except two columns are renamed and a column is added
+ # for log2(abs(fold-change))
+ new <- px
+ colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+ new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
+ new <- new[complete.cases(new$log2_fc), ]
+ } else {
+ # Split each row having semicolons in Residue.Both into rows that are
+ # duplicated in all respects except that each row has a single
+ # member of the set "split-on-semicolon-Residue.Both"
+ px_double <- px[grep(";", px$Residue.Both), ]
+ residues <- as.character(px_double$Residue.Both)
+ residues <- as.matrix(residues, ncol = 1)
+ split <- strsplit(residues, split = ";")
+ # x gets count of residues in each row,
+ # i.e., 1 + count of semicolons
+ x <- sapply(split, length)
+ # Here is the set of split rows
+ px_single <- data.frame(
+ Protein = rep(px_double$Protein, x),
+ Gene = rep(px_double$Gene, x),
+ Peptide = rep(px_double$Peptide, x),
+ Residue.Both = unlist(split),
+ p = rep(px_double$p, x),
+ FC = rep(px_double$FC, x)
+ )
+ # new first gets the split rows
+ new <- px[-grep(";", px$Residue.Both), ]
+ # to new, append the rows that didn't need splitting in the first place
+ new <- rbind(new, px_single)
+ # map Gene to SUB_GENE
+ # map Residue.Both to SUB_MOD_RSD
+ colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+ # Eliminate any non-positive values to prevent introduction of
+ # infinite or NaN values
+ new[(0 <= new$log2_fc), "log2_fc"] <- NA
+ # Because of preceding step, there is no need for abs in the next line
+ new$log2_fc <- log2(as.numeric(as.character(new$FC)))
+ # Convert any illegal values from NaN to NA
+ new[is.nan(new$log2_fc), "log2_fc"] <- NA
+ # Eliminate rows having missing values (e.g., non-imputed data)
+ new <- new[complete.cases(new$log2_fc), ]
+ }
+ if (networkin == TRUE) {
+ # When NetworKIN is true, filter on NetworKIN.cutoff which includes
+ # PhosphoSitePlus data *because its networkin_score is set to Inf*
+ ksdata_filtered <- ksdata[grep("[a-z]", ksdata$Source), ]
+ ksdata_filtered <- ksdata_filtered[
+ (ksdata_filtered$networkin_score >= networkin_cutoff), ]
+ } else {
+ # Otherwise, simply use PhosphSitePlus rows
+ ksdata_filtered <- ksdata[
+ grep("PhosphoSitePlus", ksdata$Source), ]
+ }
+ # Join the two data.frames on common columns SUB_GENE and SUB_MOD_RSD
+ # colnames of ksdata_filtered:
+ # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE" "SUB_GENE_ID"
+ # "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD" "SITE_GRP_ID"
+ # "SITE_...7_AA" "networkin_score" "Source"
+ # colnames of new:
+ # "Protein" "SUB_GENE" "Peptide" "SUB_MOD_RSD" "p" "FC" "log2_fc"
+ # Equivalent to:
+ # SELECT a.*. b.Protein, b.Peptide, b.p, b.FC, b.log2_fc
+ # FROM ksdata_filtered a
+ # INNER JOIN new b
+ # ON a.SUB_GENE = b.SUB_GENE
+ # AND a.SUB_MOD_RSD = b.SUB_MOD_RSD
+ ksdata_dataset <- base::merge(ksdata_filtered, new)
+ # colnames of ksdata_dataset:
+ # "KINASE" "KIN_ACC_ID" "GENE" "KIN_ORGANISM" "SUBSTRATE"
+ # "SUB_GENE_ID" "SUB_ACC_ID" "SUB_GENE" "SUB_ORGANISM" "SUB_MOD_RSD"
+ # "SITE_GRP_ID" "SITE_...7_AA" "networkin_score" "Source" "Protein"
+ # "Peptide" "p" "FC" "log2_fc" (uniprot_no_isoform)
+ # Re-order dataset; prior to accounting for isoforms
+ ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
+ # Extract non-isoform accession in UniProtKB
+ ksdata_dataset$uniprot_no_isoform <- sapply(
+ ksdata_dataset$KIN_ACC_ID,
+ function(x) unlist(strsplit(as.character(x), split = "-"))[1]
+ )
+ # Discard previous results while selecting interesting columns ...
+ ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
+ # Column names are now:
+ # "GENE" "SUB_GENE" "SUB_MOD_RSD" "Peptide" "p"
+ # "FC" "log2_fc" "Source"
+ # Make column names human-readable
+ colnames(ksdata_dataset_abbrev) <- c(
+ "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p",
+ "FC", "log2FC", "Source"
+ )
+ # SELECT * FROM ksdata_dataset_abbrev
+ # ORDER BY Kinase.Gene, Substrate.Gene, Substrate.Mod, p
+ ksdata_dataset_abbrev <-
+ ksdata_dataset_abbrev[
+ order(
+ ksdata_dataset_abbrev$Kinase.Gene,
+ ksdata_dataset_abbrev$Substrate.Gene,
+ ksdata_dataset_abbrev$Substrate.Mod,
+ ksdata_dataset_abbrev$p),
+ ]
+ # First aggregation step to account for multiply phosphorylated peptides
+ # and differing peptide sequences; the goal here is to combine results
+ # for all measurements of the same substrate.
+ # SELECT `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+ # `Source`, avg(log2FC) AS log2FC
+ # FROM ksdata_dataset_abbrev
+ # GROUP BY `Kinase.Gene`, `Substrate.Gene`, `Substrate.Mod`,
+ # `Source`
+ # ORDER BY `Kinase.Gene`;
+ # in two steps:
+ # (1) compute average log_2(fold-change)
+ ksdata_dataset_abbrev <- aggregate(
+ log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source,
+ data = ksdata_dataset_abbrev,
+ FUN = mean
+ )
+ # (2) order by Kinase.Gene
+ ksdata_dataset_abbrev <-
+ ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
+ # SELECT `Kinase.Gene`, count(*)
+ # FROM ksdata_dataset_abbrev
+ # GROUP BY `Kinase.Gene`;
+ # in two steps:
+ # (1) Extract the list of Kinase.Gene names
+ kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
+ # (2) Convert to a named list of counts of kinases in ksdata_dataset_abrev,
+ # named by Kinase.Gene
+ kinase_list <- as.matrix(table(kinase_list))
+ # Second aggregation step to account for all substrates per kinase
+ # CREATE TABLE mean_fc
+ # AS
+ # SELECT `Kinase.Gene`, avg(log2FC) AS log2FC
+ # FROM ksdata_dataset_abbrev
+ # GROUP BY `Kinase.Gene`
+ mean_fc <- aggregate(
+ log2FC ~ Kinase.Gene,
+ data = ksdata_dataset_abbrev,
+ FUN = mean
+ )
+ # mean_fc columns: "Kinase.Gene", "log2FC"
+ if (FALSE) {
+ # I need to re-think this; I was trying to find the most-represented
+ # peptide, but that horse has already left the barn
+ # SELECT `Kinase.Gene`, max(abs(log2FC)) AS log2FC
+ # FROM ksdata_dataset_abbrev
+ # GROUP BY `Kinase.Gene`
+ max_fc <- aggregate(
+ log2FC ~ Kinase.Gene,
+ data = ksdata_dataset_abbrev,
+ FUN = function(r) max(abs(r))
+ )
+ }
+
+ # Create column 3: mS
+ mean_fc$m_s <- mean_fc[, 2]
+ # Create column 4: Enrichment
+ mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = TRUE))
+ # Create column 5: m, count of substrates
+ mean_fc$m <- kinase_list
+ # Create column 6: z-score
+ mean_fc$z_score <- (
+ (mean_fc$m_s - mean(new$log2_fc, na.rm = TRUE)) *
+ sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = TRUE)
+ # Create column 7: p-value, deduced from z-score
+ mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+ # Create column 8: FDR, deduced by Benjamini-Hochberg adustment from p-value
+ mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
+
+ # Remove log2FC column, which is duplicated as mS
+ mean_fc <- mean_fc[order(mean_fc$Kinase.Gene), -2]
+ # Correct the column names which we had to hack because of the linter...
+ colnames(mean_fc) <- c(
+ "Kinase.Gene", "mS", "Enrichment", "m", "z.score", "p.value", "FDR"
+ )
+ return(mean_fc)
+}
+
+low_fdr_barplot <- function(
+ rslt,
+ i_cntrst,
+ i,
+ a_level,
+ b_level,
+ fold_change,
+ caption
+) {
+ rslt_score_list_i <- rslt$score_list[[i]]
+ if (!is.null(rslt_score_list_i)) {
+ rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
+ k <- data.frame(
+ contrast = as.integer(i_cntrst),
+ a_level = rep.int(a_level, rslt_score_list_i_nrow),
+ b_level = rep.int(b_level, rslt_score_list_i_nrow),
+ kinase_gene = rslt_score_list_i$Kinase.Gene,
+ mean_log2_fc = rslt_score_list_i$mS,
+ enrichment = rslt_score_list_i$Enrichment,
+ substrate_count = rslt_score_list_i$m,
+ z_score = rslt_score_list_i$z.score,
+ p_value = rslt_score_list_i$p.value,
+ fdr = rslt_score_list_i$FDR
+ )
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ k$fdr
+ },
+ "p.value" = {
+ k$p_value
+ },
+ stop(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ )
+
+ k <- k[selector < ksea_cutoff_threshold, ]
+
+ if (nrow(k) > 1) {
+ op <- par(mai = c(1, 1.5, 0.4, 0.4))
+ numeric_z_score <- as.numeric(k$z_score)
+ z_score_order <- order(numeric_z_score)
+ kinase_name <- k$kinase_gene
+ long_caption <-
+ sprintf(
+ "Kinase z-score, %s < %s, %s",
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold,
+ caption
+ )
+ my_cex_caption <- 65.0 / max(65.0, nchar(long_caption))
+ cat("\n\\clearpage\n")
+ barplot(
+ height = numeric_z_score[z_score_order],
+ border = NA,
+ xpd = FALSE,
+ cex.names = 1.0,
+ cex.axis = 1.0,
+ main = long_caption,
+ cex.main = my_cex_caption,
+ names.arg = kinase_name[z_score_order],
+ horiz = TRUE,
+ srt = 45,
+ las = 1)
+ par(op)
+ }
+ }
+}
+
+# note that this adds elements to the global variable `ksea_asterisk_hash`
+
+low_fdr_print <- function(
+ rslt,
+ i_cntrst,
+ i,
+ a_level,
+ b_level,
+ fold_change,
+ caption
+) {
+ rslt_score_list_i <- rslt$score_list[[i]]
+ if (!is.null(rslt_score_list_i)) {
+ rslt_score_list_i_nrow <- nrow(rslt_score_list_i)
+ k <- contrast_ksea_scores <- data.frame(
+ contrast = as.integer(i_cntrst),
+ a_level = rep.int(a_level, rslt_score_list_i_nrow),
+ b_level = rep.int(b_level, rslt_score_list_i_nrow),
+ kinase_gene = rslt_score_list_i$Kinase.Gene,
+ mean_log2_fc = rslt_score_list_i$mS,
+ enrichment = rslt_score_list_i$Enrichment,
+ substrate_count = rslt_score_list_i$m,
+ z_score = rslt_score_list_i$z.score,
+ p_value = rslt_score_list_i$p.value,
+ fdr = rslt_score_list_i$FDR
+ )
+
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ k$fdr
+ },
+ "p.value" = {
+ k$p_value
+ },
+ stop(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ )
+
+ k <- k[selector < ksea_cutoff_threshold, ]
+ # save kinase names to ksea_asterisk_hash
+ for (kinase_name in k$kinase_gene) {
+ ksea_asterisk_hash[[kinase_name]] <- 1
+ }
+
+ db_write_table_overwrite <- (i_cntrst < 2)
+ db_write_table_append <- !db_write_table_overwrite
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "contrast_ksea_scores",
+ value = contrast_ksea_scores,
+ append = db_write_table_append
+ )
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ contrast_ksea_scores$fdr
+ },
+ "p.value" = {
+ contrast_ksea_scores$p_value
+ },
+ stop(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ )
+ output_df <- contrast_ksea_scores[
+ selector < ksea_cutoff_threshold,
+ c("kinase_gene", "mean_log2_fc", "enrichment", "substrate_count",
+ "z_score", "p_value", "fdr")
+ ]
+ output_order <- with(output_df, order(mean_log2_fc, kinase_gene))
+ output_df <- output_df[output_order, ]
+ colnames(output_df) <-
+ c(
+ colnames(output_df)[1],
+ colnames(output_df)[2],
+ "enrichment",
+ "m_s",
+ "z_score",
+ "p_value",
+ "fdr"
+ )
+ output_df$fdr <- sprintf("%0.4f", output_df$fdr)
+ output_df$p_value <- sprintf("%0.2e", output_df$p_value)
+ output_df$z_score <- sprintf("%0.2f", output_df$z_score)
+ output_df$m_s <- sprintf("%d", output_df$m_s)
+ output_df$enrichment <- sprintf("%0.2f", output_df$enrichment)
+ output_ncol <- ncol(output_df)
+ colnames(output_df) <-
+ c(
+ "Kinase",
+ "\\(\\overline{\\log_2 (|\\text{fold-change}|)}\\)",
+ "Enrichment",
+ "Substrates",
+ "z-score",
+ "p-value",
+ "FDR"
+ )
+ selector <- switch(
+ ksea_cutoff_statistic,
+ "FDR" = {
+ rslt$score_list[[i]]$FDR
+ },
+ "p.value" = {
+ rslt$score_list[[i]]$p.value
+ },
+ stop(
+ sprintf(
+ "Unexpected cutoff statistic %s rather than 'FDR' or 'p.value'",
+ ksea_cutoff_statistic
+ )
+ )
+ )
+ if (sum(selector < ksea_cutoff_threshold) > 0) {
+ math_caption <- gsub("{", "\\{", caption, fixed = TRUE)
+ math_caption <- gsub("}", "\\}", math_caption, fixed = TRUE)
+ data_frame_latex(
+ x = output_df,
+ justification = "l c c c c c c",
+ centered = TRUE,
+ caption = sprintf(
+ "\\text{%s}, %s < %s",
+ math_caption,
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold
+ ),
+ anchor = const_table_anchor_p
+ )
+ } else {
+ cat(
+ sprintf(
+ "\\break
+ No kinases had
+ \\(\\text{%s}_\\text{enrichment} < %s\\)
+ for contrast %s\\hfill\\break\n",
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold,
+ caption
+ )
+ )
+ }
+ }
+}
+
+# create_breaks is a helper for ksea_heatmap
+create_breaks <- function(merged_scores) {
+ if (min(merged_scores, na.rm = TRUE) < -1.6) {
+ breaks_neg <- seq(-1.6, 0, length.out = 30)
+ breaks_neg <-
+ append(
+ seq(min(merged_scores, na.rm = TRUE), -1.6, length.out = 10),
+ breaks_neg
+ )
+ breaks_neg <- sort(unique(breaks_neg))
+ } else {
+ breaks_neg <- seq(-1.6, 0, length.out = 30)
+ }
+ if (max(merged_scores, na.rm = TRUE) > 1.6) {
+ breaks_pos <- seq(0, 1.6, length.out = 30)
+ breaks_pos <-
+ append(
+ breaks_pos,
+ seq(1.6, max(merged_scores, na.rm = TRUE),
+ length.out = 10)
+ )
+ breaks_pos <- sort(unique(breaks_pos))
+ } else {
+ breaks_pos <- seq(0, 1.6, length.out = 30)
+ }
+ breaks_all <- unique(append(breaks_neg, breaks_pos))
+ mycol_neg <-
+ gplots::colorpanel(n = length(breaks_neg),
+ low = "blue",
+ high = "white")
+ mycol_pos <-
+ gplots::colorpanel(n = length(breaks_pos) - 1,
+ low = "white",
+ high = "red")
+ mycol <- unique(append(mycol_neg, mycol_pos))
+ color_breaks <- list(breaks_all, mycol)
+ return(color_breaks)
+}
+
+# draw_kseaapp_summary_heatmap is a helper function for ksea_heatmap
+draw_kseaapp_summary_heatmap <- function(
+ x,
+ sample_cluster,
+ merged_asterisk,
+ my_cex_row,
+ color_breaks,
+ margins,
+ ...
+) {
+ merged_scores <- x
+ if (!is.matrix(x)) {
+ cat(
+ paste0(
+ "No plot because \\texttt{typeof(x)} is '",
+ typeof(x),
+ "' rather than 'matrix'.\n\n"
+ )
+ )
+ } else if (nrow(x) < 2) {
+ cat("No plot because matrix x has ", nrow(x), " rows.\n\n")
+ cat("\\begin{verbatim}\n")
+ str(x)
+ cat("\\end{verbatim}\n")
+ } else if (ncol(x) < 2) {
+ cat("No plot because matrix x has ", ncol(x), " columns.\n\n")
+ cat("\\begin{verbatim}\n")
+ str(x)
+ cat("\\end{verbatim}\n")
+ } else {
+ gplots::heatmap.2(
+ x = merged_scores,
+ Colv = sample_cluster,
+ scale = "none",
+ cellnote = merged_asterisk,
+ notecol = "white",
+ cexCol = 0.9,
+ # Heuristically assign size of row labels
+ cexRow = min(1.0, ((3 * my_cex_row) ^ 1.7) / 2.25),
+ srtCol = 45,
+ srtRow = 45,
+ notecex = 3 * my_cex_row,
+ col = color_breaks[[2]],
+ density.info = "none",
+ trace = "none",
+ breaks = color_breaks[[1]],
+ lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
+ lhei = c(0.4, 8.0, 1.1),
+ lwid = c(0.5, 3),
+ key = FALSE,
+ margins = margins,
+ ...
+ )
+ }
+}
+
+# Adapted from KSEAapp::KSEA.Heatmap
+ksea_heatmap <- function(
+ # the data frame outputs from the KSEA.Scores() function, in list format
+ score_list,
+ # a character vector of all the sample names for heatmap annotation:
+ # - the names must be in the same order as the data in score_list
+ # - please avoid long names, as they may get cropped in the final image
+ sample_labels,
+ # character string of either "p.value" or "FDR" indicating the data column
+ # to use for marking statistically significant scores
+ stats,
+ # a numeric value between 0 and infinity indicating the min. number of
+ # substrates a kinase must have to be included in the heatmap
+ m_cutoff,
+ # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+ # for indicating significant kinases in the heatmap
+ p_cutoff =
+ stop("argument 'p_cutoff' is required for function 'ksea_heatmap'"),
+ # a binary input of TRUE or FALSE, indicating whether or not to perform
+ # hierarchical clustering of the sample columns
+ sample_cluster,
+ # a binary input of TRUE or FALSE, indicating whether or not to export
+ # the heatmap as a .png image into the working directory
+ export = FALSE,
+ # bottom and right margins; adjust as needed if contrast names are too long
+ margins = c(6, 20),
+ # print which kinases?
+ # - Mandatory argument, must be one of const_ksea_.*_kinases
+ which_kinases,
+ # additional arguments to gplots::heatmap.2, such as:
+ # - main: main title of plot
+ # - xlab: x-axis label
+ # - ylab: y-axis label
+ ...
+) {
+ filter_m <- function(dataset, m_cutoff) {
+ filtered <- dataset[(dataset$m >= m_cutoff), ]
+ return(filtered)
+ }
+ score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff))
+ for (i in seq_len(length(score_list_m))) {
+ names <- colnames(score_list_m[[i]])[c(2:7)]
+ colnames(score_list_m[[i]])[c(2:7)] <-
+ paste(names, i, sep = ".")
+ }
+ master <-
+ Reduce(
+ f = function(...) {
+ base::merge(..., by = "Kinase.Gene", all = FALSE)
+ },
+ x = score_list_m
+ )
+
+ row.names(master) <- master$Kinase.Gene
+ columns <- as.character(colnames(master))
+ merged_scores <- as.matrix(master[, grep("z.score", columns), drop = FALSE])
+ colnames(merged_scores) <- sample_labels
+ merged_stats <- as.matrix(master[, grep(stats, columns)])
+ asterisk <- function(mtrx, p_cutoff) {
+ new <- data.frame()
+ for (i in seq_len(nrow(mtrx))) {
+ for (j in seq_len(ncol(mtrx))) {
+ my_value <- mtrx[i, j]
+ if (!is.na(my_value) && my_value < p_cutoff) {
+ new[i, j] <- "*"
+ } else {
+ new[i, j] <- ""
+ }
+ }
+ }
+ return(new)
+ }
+ merged_asterisk <- as.matrix(asterisk(merged_stats, p_cutoff))
+
+ # begin hack to print only significant rows
+ asterisk_rows <- rowSums(merged_asterisk == "*") > 0
+ all_rows <- rownames(merged_stats)
+ names(asterisk_rows) <- all_rows
+ non_asterisk_rows <- names(asterisk_rows[asterisk_rows == FALSE])
+ asterisk_rows <- names(asterisk_rows[asterisk_rows == TRUE])
+ merged_scores_asterisk <- merged_scores[names(asterisk_rows), ]
+ merged_scores_non_asterisk <- merged_scores[names(non_asterisk_rows), ]
+ # end hack to print only significant rows
+
+ row_list <- list()
+ row_list[[const_ksea_astrsk_kinases]] <- asterisk_rows
+ row_list[[const_ksea_all_kinases]] <- all_rows
+ row_list[[const_ksea_nonastrsk_kinases]] <- non_asterisk_rows
+
+ i <- which_kinases
+ my_row_names <- row_list[[i]]
+ scrs <- merged_scores[my_row_names, ]
+ stts <- merged_stats[my_row_names, ]
+ merged_asterisk <- as.matrix(asterisk(stts, p_cutoff))
+
+ color_breaks <- create_breaks(scrs)
+ plot_height <- nrow(scrs) ^ 0.55
+ plot_width <- ncol(scrs) ^ 0.7
+ my_cex_row <- 0.25 * 16 / plot_height
+ if (export == "TRUE") {
+ png(
+ "KSEA.Merged.Heatmap.png",
+ width = plot_width * 300,
+ height = 2 * plot_height * 300,
+ res = 300,
+ pointsize = 14
+ )
+ }
+ draw_kseaapp_summary_heatmap(
+ x = scrs,
+ sample_cluster = sample_cluster,
+ merged_asterisk = merged_asterisk,
+ my_cex_row = my_cex_row,
+ color_breaks = color_breaks,
+ margins = margins
+ )
+ if (export == "TRUE") {
+ dev.off()
+ }
+ return(my_row_names)
+}
+
+# helper for heatmaps of phosphopeptide intensities
+
+draw_intensity_heatmap <-
+ function(
+ m, # matrix with rownames already formatted
+ cutoff, # cutoff used by hm_heading_function
+ hm_heading_function, # construct and cat heading from m and cutoff
+ hm_main_title, # main title for plot (drawn below heading)
+ suppress_row_dendrogram = TRUE, # set to false to show dendrogram
+ max_peptide_count # experimental:
+ = intensity_hm_rows, # values of 50 and 75 worked well
+ ... # passthru parameters for heatmap
+ ) {
+ peptide_count <- 0
+ # emit the heading for the heatmap
+ if (hm_heading_function(m, cutoff)) {
+ peptide_count <- min(max_peptide_count, nrow(m))
+ if (nrow(m) > 1) {
+ m_margin <- m[peptide_count:1, ]
+ # Margin setting was heuristically derived
+ margins <-
+ c(0.5, # col
+ max(80, sqrt(nchar(rownames(m_margin)))) * 5 / 16 # row
+ )
+ }
+ if (nrow(m) > 1) {
+ tryCatch(
+ {
+ old_oma <- par("oma")
+ par(cex.main = 0.6)
+ # Heuristically determined character size adjustment formula
+ char_contractor <-
+ 250000 / (
+ max(4500, (nchar(rownames(m_margin)))^2) * intensity_hm_rows
+ )
+ heatmap(
+ m[peptide_count:1, ],
+ Rowv = if (suppress_row_dendrogram) NA else NULL,
+ Colv = NA,
+ cexRow = char_contractor,
+ cexCol = char_contractor * 50 / max_peptide_count,
+ scale = "row",
+ margins = margins,
+ main =
+ "Unimputed, unnormalized log(intensities)",
+ xlab = "",
+ las = 1,
+ ...
+ )
+ },
+ error = function(e) {
+ cat(
+ sprintf(
+ "\nCould not draw heatmap, possibly because of too many missing values. Internal message: %s\n",
+ e$message
+ )
+ )
+ },
+ finally = par(old_oma)
+ )
+ }
+ }
+ return(peptide_count)
+ }
```
-## Purpose
-
-Perform imputation of missing values, quantile normalization, and ANOVA.
+```{r, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\listoftables\n")
+```
+# Purpose
+
+To perform for phosphopeptides:
+
+- imputation of missing values,
+- quantile normalization,
+- ANOVA (using the R stats::`r params$oneWayManyCategories` function), and
+- KSEA (Kinase-Substrate Enrichment Analysis) using code adapted from the CRAN `KSEAapp` package to search for kinase substrates from the following databases:
+ - PhosphoSitesPlus [https://www.phosphosite.org](https://www.phosphosite.org)
+ - The Human Proteome Database [http://hprd.org](http://hprd.org)
+ - NetworKIN [http://networkin.science/](http://networkin.science/)
+ - Phosida [http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx](http://pegasus.biochem.mpg.de/phosida/help/motifs.aspx)
```{r include = FALSE}
+
+### GLOBAL VARIABLES
+
+# parameters for KSEA
+
+ksea_cutoff_statistic <- params$kseaCutoffStatistic
+ksea_cutoff_threshold <- params$kseaCutoffThreshold
+ksea_min_kinase_count <- params$kseaMinKinaseCount
+
+ksea_heatmap_titles <- list()
+ksea_heatmap_titles[[const_ksea_astrsk_kinases]] <-
+ sprintf(
+ "Summary for all kinases enriched in one or more contrasts at %s < %s",
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold
+ )
+ksea_heatmap_titles[[const_ksea_all_kinases]] <-
+ "Summary figure for all contrasts and all kinases"
+ksea_heatmap_titles[[const_ksea_nonastrsk_kinases]] <-
+ sprintf(
+ "Summary for all kinases not enriched at %s < %s in any contrast",
+ ksea_cutoff_statistic,
+ ksea_cutoff_threshold
+ )
+# hash to hold names of significantly enriched kinases
+ksea_asterisk_hash <- new_env()
+
+# READ PARAMETERS (mostly)
+
+intensity_hm_rows <- params$intensityHeatmapRows
# Input Filename
input_file <- params$inputFile
@@ -220,7 +1194,7 @@
# False discovery rate adjustment for ANOVA
# Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
val_fdr <-
- read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")
+ read.table(file = params$alphaFile, sep = "\t", header = FALSE, quote = "")
if (
ncol(val_fdr) != 1 ||
@@ -235,8 +1209,8 @@
#Imputed Data filename
imputed_data_filename <- params$imputedDataFilename
imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
-
-#ANOVA data filename
+anova_ksea_mtdt_file <- params$anovaKseaMetadata
+
```
```{r echo = FALSE}
@@ -255,32 +1229,71 @@
regex_sample_names <- params$regexSampleNames
# Regular expression to extract Sample Grouping from Sample Name;
-# if error occurs, compare sample_factor_levels and sample_name_matches
+# if error occurs, compare sample_treatment_levels vs. sample_name_matches
# to see if groupings/pairs line up
# e.g., "(\\d+)"
regex_sample_grouping <- params$regexSampleGrouping
+one_way_all_categories_fname <- params$oneWayManyCategories
+one_way_all_categories <- try_catch_w_e(
+ match.fun(one_way_all_categories_fname))
+if (!is.function(one_way_all_categories$value)) {
+ write("fatal error for parameter oneWayManyCategories:", stderr())
+ write(one_way_all_categories$value$message, stderr())
+ if (sys.nframe() > 0) quit(save = "no", status = 1)
+ stop("Cannot continue. Goodbye.")
+}
+one_way_all_categories <- one_way_all_categories$value
+
+one_way_two_categories_fname <- params$oneWayManyCategories
+one_way_two_categories <- try_catch_w_e(
+ match.fun(one_way_two_categories_fname))
+if (!is.function(one_way_two_categories$value)) {
+ cat("fatal error for parameter oneWayTwoCategories: \n")
+ cat(one_way_two_categories$value$message, fill = TRUE)
+ if (sys.nframe() > 0) quit(save = "no", status = 1)
+ stop("Cannot continue. Goodbye.")
+}
+one_way_two_categories <- one_way_two_categories$value
+
+preproc_db <- params$preprocDb
+ksea_app_prep_db <- params$kseaAppPrepDb
+result <- file.copy(
+ from = preproc_db,
+ to = ksea_app_prep_db,
+ overwrite = TRUE
+ )
+if (!result) {
+ write(
+ sprintf(
+ "fatal error copying initial database '%s' to output '%s'",
+ preproc_db,
+ ksea_app_prep_db,
+ ),
+ stderr()
+ )
+ if (sys.nframe() > 0) quit(save = "no", status = 1)
+ stop("Cannot continue. Goodbye.")
+}
```
```{r echo = FALSE}
### READ DATA
-library(data.table)
-
# read.table reads a file in table format and creates a data frame from it.
# - note that `quote = ""` means that quotation marks are treated literally.
full_data <- read.table(
file = input_file,
sep = "\t",
- header = T,
+ header = TRUE,
quote = "",
check.names = FALSE
)
```
-## Extract Sample Names and Factor Levels
-
-Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2.
+# Extract Sample Names and Treatment Levels
+
+Column names parsed from input file are shown in Table 1; sample names and treatment levels, in Table 2.
```{r echo = FALSE, results = 'asis'}
@@ -297,7 +1310,7 @@
cat(
sprintf(
paste(
- "\n\nPeptide-intensity data for each sample is",
+ "\n\nThe input data file has peptide-intensity data for each sample",
"in one of columns %d through %d.\n\n"
),
min(data_column_indices),
@@ -308,29 +1321,30 @@
# Write column names as a LaTeX enumerated list.
column_name_df <- data.frame(
column = seq_len(length(colnames(full_data))),
- name = colnames(full_data)
+ name = paste0("\\verb@", colnames(full_data), "@")
)
data_frame_latex(
x = column_name_df,
justification = "l l",
centered = TRUE,
- caption = "Input data column name",
- anchor = const_table_anchor
+ caption = "Input data column names",
+ anchor = const_table_anchor_bp,
+ underscore_whack = FALSE
)
```
```{r echo = FALSE, results = 'asis'}
-#```{r echo = FALSE, results = 'asis'}
quant_data <- full_data[first_data_column:length(full_data)]
quant_data[quant_data == 0] <- NA
-rownames(quant_data) <- full_data$Phosphopeptide
-# Get factors -> group replicates (as indicated by terminal letter)
-# by the preceding digits;
-# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$"
-# get factors ->
-# group runs (samples) by ignoring terminal [A-Z] in sample names
-# e.g.
+rownames(quant_data) <- rownames(full_data) <- full_data$Phosphopeptide
+# Extract factors and trt-replicates using regular expressions.
+# Typically:
+# regex_sample_names is "\\.\\d+[A-Z]$"
+# regex_sample_grouping is "\\d+"
+# This would distinguish trt-replicates by terminal letter [A-Z]
+# in sample names and group them into trts by the preceding digits.
+# e.g.:
# group .1A .1B .1C into group 1;
# group .2A .2B .2C, into group 2;
# etc.
@@ -340,26 +1354,28 @@
write_debug_file(quant_data)
-m2 <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
-sample_factor_levels <- as.factor(regmatches(sample_name_matches, m2))
+rx_match <- regexpr(regex_sample_grouping, sample_name_matches, perl = TRUE)
+sample_treatment_levels <- as.factor(regmatches(sample_name_matches, rx_match))
number_of_samples <- length(sample_name_matches)
-sample_factor_df <- data.frame(
- sample = sample_name_matches,
- level = sample_factor_levels
+sample_treatment_df <- data.frame(
+ level = sample_treatment_levels,
+ sample = sample_name_matches
)
data_frame_latex(
- x = sample_factor_df,
- justification = "c c",
+ x = sample_treatment_df,
+ justification = "rp{0.2\\linewidth} lp{0.3\\linewidth}",
centered = TRUE,
- caption = "Factor level",
- anchor = const_table_anchor
+ caption = "Treatment levels",
+ anchor = const_table_anchor_tbp,
+ underscore_whack = FALSE
)
```
+
```{r echo = FALSE, results = 'asis'}
cat("\\newpage\n")
```
-### Are the log-transformed sample distributions similar?
+## Are the log-transformed sample distributions similar?
```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
@@ -394,24 +1410,18 @@
```
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
-library(ggplot2)
if (nrow(quant_data_log) > 1) {
quant_data_log_stack <- stack(quant_data_log)
- ggplot(
- quant_data_log_stack,
- aes(x = values)
- ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
- ylab("Probability density") +
- geom_density(
- aes(group = ind, colour = ind),
- na.rm = TRUE
- )
+ ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+ ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+ ggplot2::ylab("Probability density") +
+ ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
} else {
cat("No density plot because there are too few peptides.\n\n")
}
```
-### Globally, are peptide intensities are approximately unimodal?
+## Globally, are peptide intensities are approximately unimodal?
```{r echo = FALSE, results = 'asis'}
-library(preprocessCore)
if (nrow(quant_data_imp) > 0) {
- quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp))
+ quant_data_imp_qn <- preprocessCore::normalize.quantiles(
+ as.matrix(quant_data_imp), keep.names = TRUE
+ )
} else {
quant_data_imp_qn <- as.matrix(quant_data_imp)
}
quant_data_imp_qn <- as.data.frame(quant_data_imp_qn)
-names(quant_data_imp_qn) <- names(quant_data_imp)
write_debug_file(quant_data_imp_qn)
quant_data_imp_qn_log <- log10(quant_data_imp_qn)
-rownames(quant_data_imp_qn_log) <- rownames(quant_data_imp)
write_debug_file(quant_data_imp_qn_log)
quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn))))
-any_nan <- function(x) {
- !any(x == "NaN")
-}
sel <- apply(quant_data_imp_qn_ls, 1, any_nan)
-quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ]
+quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls
+
+quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls2[which(sel), ]
quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2)
-rownames(quant_data_imp_qn_ls2) <- rownames(quant_data_imp)[sel]
quant_data_imp_qn_ls <- as.data.frame(quant_data_imp_qn_ls)
-rownames(quant_data_imp_qn_ls) <- rownames(quant_data_imp)
write_debug_file(quant_data_imp_qn_ls)
write_debug_file(quant_data_imp_qn_ls2)
-#output quantile normalized data
+# Create data.frame used by ANOVA analysis
data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log)
-write.table(
- data_table_imp_qn_lt,
- file = imp_qn_lt_data_filenm,
- sep = "\t",
- col.names = TRUE,
- row.names = FALSE,
- quote = FALSE
-)
-
```
-### Checking that normalized, imputed, log-transformed sample distributions are similar:
+## Are normalized, imputed, log-transformed sample distributions similar?
```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'}
@@ -919,18 +2021,6 @@
quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001
quant_data_log <- log10(quant_data_imp_qn)
-# Output with imputed, un-normalized data
-
-data_table_imputed <- cbind(full_data[1:9], quant_data_imp)
-write.table(
- data_table_imputed
- , file = imputed_data_filename
- , sep = "\t"
- , col.names = TRUE
- , row.names = FALSE
- , quote = FALSE
- )
-
how_many_peptides <- nrow(quant_data_log)
if ((how_many_peptides) > 0) {
@@ -970,15 +2060,10 @@
```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), results = 'asis'}
if (nrow(quant_data_log) > 1) {
quant_data_log_stack <- stack(quant_data_log)
- ggplot(
- quant_data_log_stack,
- aes(x = values)
- ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
- ylab("Probability density") +
- geom_density(
- aes(group = ind, colour = ind),
- na.rm = TRUE
- )
+ ggplot2::ggplot(quant_data_log_stack, ggplot2::aes(x = values)) +
+ ggplot2::xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+ ggplot2::ylab("Probability density") +
+ ggplot2::geom_density(ggplot2::aes(group = ind, colour = ind), na.rm = TRUE)
} else {
cat("No density plot because there are fewer than two peptides to plot.\n\n")
}
@@ -987,7 +2072,7 @@
cat("\\leavevmode\\newpage\n")
```
-## Perform ANOVA Filters
+# ANOVA Analysis
```{r, echo = FALSE}
# Make new data frame containing only Phosphopeptides
@@ -1000,8 +2085,8 @@
```
```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
-count_of_factor_levels <- length(levels(sample_factor_levels))
-if (count_of_factor_levels < 2) {
+count_of_treatment_levels <- length(levels(sample_treatment_levels))
+if (count_of_treatment_levels < 2) {
nuke_control_sequences <-
function(s) {
s <- gsub("[\\]", "xyzzy_plugh", s)
@@ -1054,16 +2139,18 @@
cat("\n\n\n")
cat("Sample group assignments are:\n",
"\\begin{quote}\n",
- paste(regmatches(sample_name_matches, m2), collapse = "\n\n\n"),
+ paste(regmatches(sample_name_matches, rx_match), collapse = "\n\n\n"),
"\n\\end{quote}\n\n")
} else {
+
p_value_data_anova_ps <-
apply(
quant_data_imp_qn_log,
1,
anova_func,
- grouping_factor = sample_factor_levels
+ grouping_factor = sample_treatment_levels,
+ one_way_f = one_way_all_categories
)
p_value_data_anova_ps_fdr <-
@@ -1081,8 +2168,9 @@
# becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
# Re-output datasets to include p-values
+ metadata_plus_p <- cbind(full_data[1:9], p_value_data[, 2:3])
write.table(
- cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp),
+ cbind(metadata_plus_p, quant_data_imp),
file = imputed_data_filename,
sep = "\t",
col.names = TRUE,
@@ -1091,7 +2179,7 @@
)
write.table(
- cbind(full_data[1:9], p_value_data[, 2:3], quant_data_imp_qn_log),
+ cbind(metadata_plus_p, quant_data_imp_qn_log),
file = imp_qn_lt_data_filenm,
sep = "\t",
col.names = TRUE,
@@ -1135,15 +2223,22 @@
if (nrow(filtered_p) && nrow(filtered_data_filtered) > 0) {
if (first_page_suppress == 1) {
first_page_suppress <- 0
- }
- else {
+ } else {
cat("\\newpage\n")
}
- cat(sprintf(
- "Intensities for %d peptides whose adjusted p-value < %0.2f:\n",
- nrow(filtered_data_filtered),
- cutoff
- ))
+ if (nrow(filtered_data_filtered) > 1) {
+ subsection_header(sprintf(
+ "Intensity distributions for %d phosphopeptides whose adjusted p-value < %0.2f\n",
+ nrow(filtered_data_filtered),
+ cutoff
+ ))
+ } else {
+ subsection_header(sprintf(
+ "Intensity distribution for one phosphopeptide (%s) whose adjusted p-value < %0.2f\n",
+ rownames(filtered_data_filtered)[1],
+ cutoff
+ ))
+ }
cat("\n\n\n")
cat("\n\n\n")
@@ -1158,12 +2253,15 @@
# ref: https://r-charts.com/distribution/add-points-boxplot/
# Vertical plot
colnames(filtered_data_filtered) <- sample_name_matches
- boxplot(
- filtered_data_filtered,
- main = "Imputed, normalized intensities", # no line plot
- las = 1,
- col = const_boxplot_fill,
- ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ tryCatch(
+ boxplot(
+ filtered_data_filtered,
+ main = "Imputed, normalized intensities", # no line plot
+ las = 1,
+ col = const_boxplot_fill,
+ ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+ ),
+ error = function(e) print(e)
)
par(old_par)
} else {
@@ -1175,8 +2273,12 @@
}
if (nrow(filtered_data_filtered) > 0) {
- #Add Phosphopeptide column to anova_filtered table
- anova_filtered_merge <- merge(
+ # Add Phosphopeptide column to anova_filtered table
+ # The assumption here is that the first intensity is unique;
+ # this is a hokey assumption but almost definitely will
+ # be true in the real world unless there is a computation
+ # error upstream.
+ anova_filtered_merge <- base::merge(
x = connect_df,
y = filtered_data_filtered,
by.x = "Intensity",
@@ -1184,6 +2286,25 @@
)
anova_filtered_merge_order <- rownames(filtered_p)
+ anova_filtered <- data.frame(
+ ppep = anova_filtered_merge$Phosphopeptide,
+ intense = anova_filtered_merge$Intensity,
+ data = anova_filtered_merge[, 2:number_of_samples + 1]
+ )
+ colnames(anova_filtered) <-
+ c("Phosphopeptide", colnames(filtered_data_filtered))
+
+ # Merge qualitative columns into the ANOVA data
+ output_table <- data.frame(anova_filtered$Phosphopeptide)
+ output_table <- base::merge(
+ x = output_table,
+ y = data_table_imp_qn_lt,
+ by.x = "anova_filtered.Phosphopeptide",
+ by.y = "Phosphopeptide"
+ )
+
+ # Produce heatmap to visualize significance and the effect of imputation
+
anova_filtered_merge_format <- sapply(
X = filtered_p$fdr_adjusted_anova_p
,
@@ -1195,35 +2316,36 @@
}
)
- anova_filtered <- data.table(
- anova_filtered_merge$Phosphopeptide
- ,
- anova_filtered_merge$Intensity
- ,
- anova_filtered_merge[, 2:number_of_samples + 1]
- )
- colnames(anova_filtered) <-
- c("Phosphopeptide", colnames(filtered_data_filtered))
-
- # Merge qualitative columns into the ANOVA data
- output_table <- data.frame(anova_filtered$Phosphopeptide)
- output_table <- merge(
- x = output_table,
- y = data_table_imp_qn_lt,
- by.x = "anova_filtered.Phosphopeptide",
- by.y = "Phosphopeptide"
- )
-
- # Produce heatmap to visualize significance and the effect of imputation
+ cat_hm_heading <- function(m, cutoff) {
+ cat("\\newpage\n")
+ if (nrow(m) > intensity_hm_rows) {
+ subsection_header(
+ paste(
+ sprintf("Heatmap for the %d most-significant peptides",
+ intensity_hm_rows),
+ sprintf("whose adjusted p-value < %0.2f\n", cutoff)
+ )
+ )
+ } else {
+ if (nrow(m) == 1) {
+ return(FALSE)
+ } else {
+ subsection_header(
+ paste(
+ sprintf("Heatmap for %d usable peptides whose", nrow(m)),
+ sprintf("adjusted p-value < %0.2f\n", cutoff)
+ )
+ )
+ }
+ }
+ cat("\n\n\n")
+ cat("\n\n\n")
+ return(TRUE)
+ }
+
+ # construct matrix with appropriate rownames
m <-
as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ])
- m_nan_rows <- rowSums(
- matrix(
- as.integer(is.na(m)),
- nrow = nrow(m)
- )
- )
- m <- m[!m_nan_rows, , drop = FALSE]
if (nrow(m) > 0) {
rownames_m <- rownames(m)
rownames(m) <- sapply(
@@ -1237,57 +2359,1178 @@
)
}
)
- how_many_peptides <- min(50, nrow(m))
- number_of_peptides_found <- how_many_peptides
- if (nrow(m) > 1) {
- m_margin <- m[how_many_peptides:1, ]
- margins <-
- c(max(nchar(colnames(m_margin))) * 10 / 16 # col
- , max(nchar(rownames(m_margin))) * 5 / 16 # row
- )
- }
-
- cat("\\newpage\n")
- if (nrow(m) > 50) {
- cat("Heatmap for the 50 most-significant peptides",
- sprintf(
- "whose adjusted p-value < %0.2f\n",
- cutoff)
- )
- } else {
- if (nrow(m) == 1) {
- next
- } else {
- cat(
- sprintf("Heatmap for %d usable peptides whose", nrow(m)),
- sprintf("adjusted p-value < %0.2f\n", cutoff)
+ }
+ # draw the heading and heatmap
+ if (nrow(m) > 0) {
+ number_of_peptides_found <-
+ draw_intensity_heatmap(
+ m = m,
+ cutoff = cutoff,
+ hm_heading_function = cat_hm_heading,
+ hm_main_title = "Unimputed, unnormalized log(intensities)",
+ suppress_row_dendrogram = FALSE
)
- }
- }
- cat("\n\n\n")
- cat("\n\n\n")
- try(
- if (nrow(m) > 1) {
- old_oma <- par("oma")
- par(cex.main = 0.6)
- heatmap(
- m[how_many_peptides:1, ],
- Rowv = NA,
- Colv = NA,
- cexRow = 0.7,
- cexCol = 0.8,
- scale = "row",
- margins = margins,
- main =
- "Unimputed, unnormalized intensities",
- xlab = "",
- las = 1 #, fin = c(9, 5.5)
- )
- }
- )
}
}
}
}
cat("\\leavevmode\n\n\n")
```
+
+```{r sqlite, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+if (count_of_treatment_levels > 1) {
+ # Prepare two-way contrasts with adjusted p-values
+ # Strategy:
+ # - use imputed, log-transformed data:
+ # - remember this when computing log2(fold-change)
+ # - each contrast is between a combination of trt levels
+ # - for each contrast, compute samples that are members
+ # - compute one-way test:
+ # - use `oneway.test` (Welch test) if numbers of samples
+ # are not equivalent between trt levels
+ # - otherwise, aov is fine but offers no advantage
+ # - adjust p-value, assuming that
+ # (# of pppeps)*(# of contrasts) tests were performed
+
+ # Each contrast is between a combination of trt levels
+ m2 <- combn(
+ x = seq_len(length(levels(sample_treatment_levels))),
+ m = 2,
+ simplify = TRUE
+ )
+ contrast_count <- ncol(m2)
+
+ # For each contrast, compute samples that are members
+ # - local function to construct a data.frame for each contrast
+ # - the contrast in the first "column"
+ f_m2 <-
+ function(cntrst, lvl1, lvl2) {
+ return(
+ data.frame(
+ contrast = cntrst,
+ level = sample_treatment_levels[
+ sample_treatment_levels %in%
+ levels(sample_treatment_levels)[c(lvl1, lvl2)]
+ ],
+ label = sample_name_matches[
+ sample_treatment_levels %in%
+ levels(sample_treatment_levels)[c(lvl1, lvl2)]
+ ]
+ )
+ )
+ }
+ # - compute a df for each contrast
+ sample_level_dfs <- lapply(
+ X = 1:contrast_count,
+ FUN = function(i) f_m2(i, m2[1, i], m2[2, i])
+ )
+ # - compute a single df for all contrasts
+ combined_contrast_df <- Reduce(f = rbind, x = sample_level_dfs)
+
+ # - dispose objects to free resources
+ rm(sample_level_dfs)
+
+ # - write the df to a DB for later join-per-contrast
+ db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "contrast",
+ value = combined_contrast_df,
+ overwrite = TRUE
+ )
+
+ # Create UK for insert
+ ddl_exec(db, "
+ CREATE UNIQUE INDEX IF NOT EXISTS contrast__uk__idx
+ ON contrast(contrast, label);
+ "
+ )
+ # Create indexes for join
+ ddl_exec(db, "
+ -- index for join in contrast_ppep_smpl_qnlt on a.label < b.label
+ CREATE INDEX IF NOT EXISTS contrast__label__idx
+ ON contrast(label);
+ "
+ )
+ ddl_exec(db, "
+ -- index for joining two contrast_lvl_ppep_avg_quant on contrast
+ CREATE INDEX IF NOT EXISTS contrast__contrast__idx
+ ON contrast(contrast);
+ "
+ )
+ ddl_exec(db, "
+ -- index for joining two contrast_lvl_ppep_avg_quant on phophospep
+ CREATE INDEX IF NOT EXISTS contrast__level__idx
+ ON contrast(level);
+ "
+ )
+ # - dispose objects to free resources
+ rm(combined_contrast_df)
+
+ # Use imputed, log-transformed data
+ # - remember that this was donoe when computing log2(fold-change)
+ # - melt data matrix for use in later join-per-contrast
+ casted <- cbind(
+ data.frame(vrbl = rownames(quant_data_imp_qn_log)),
+ quant_data_imp_qn_log
+ )
+ quant_data_imp_qn_log_melted <- reshape2::melt(
+ casted,
+ id.vars = "vrbl"
+ )
+ colnames(quant_data_imp_qn_log_melted) <-
+ c("phosphopep", "sample", "quant")
+ # - dispose objects to free resources
+ rm(casted)
+
+ # - write the df to a DB for use in later join-per-contrast
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "ppep_smpl_qnlt",
+ value = quant_data_imp_qn_log_melted,
+ overwrite = TRUE
+ )
+ # Create UK for insert
+ ddl_exec(db, "
+ CREATE UNIQUE INDEX IF NOT EXISTS ppep_smpl_qnlt__uk__idx
+ ON ppep_smpl_qnlt(phosphopep, sample);
+ "
+ )
+ # Create index for join
+ ddl_exec(db, "
+ -- index for join in contrast_ppep_smpl_qnlt
+ CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__sample__idx
+ ON ppep_smpl_qnlt(sample);
+ "
+ )
+ ddl_exec(db, "
+ -- index for joining two contrast_lvl_ppep_avg_quant on phopho.pep
+ CREATE INDEX IF NOT EXISTS ppep_smpl_qnlt__phosphopep__idx
+ ON ppep_smpl_qnlt(phosphopep);
+ "
+ )
+ # - dispose objects to free resources
+ rm(quant_data_imp_qn_log_melted)
+
+ # - drop views if exist
+ ddl_exec(db, "
+ -- drop view dependent on contrast_lvl_ppep_avg_quant
+ DROP VIEW IF EXISTS v_contrast_log2_fc;
+ "
+ )
+ ddl_exec(db, "
+ -- drop table dependent on contrast_ppep_smpl_qnlt
+ DROP TABLE IF EXISTS contrast_lvl_ppep_avg_quant;
+ "
+ )
+ ddl_exec(db, "
+ DROP TABLE IF EXISTS contrast_lvl_lvl_metadata;
+ "
+ )
+ ddl_exec(db, "
+ DROP VIEW IF EXISTS v_contrast_lvl_metadata;
+ "
+ )
+ ddl_exec(db, "
+ -- drop view dependent on contrast_ppep_smpl_qnlt
+ DROP VIEW IF EXISTS v_contrast_lvl_ppep_avg_quant;
+ "
+ )
+ ddl_exec(db, "
+ DROP VIEW IF EXISTS v_contrast_lvl_lvl;
+ "
+ )
+ ddl_exec(db, "
+ -- drop view upon which other views depend
+ DROP VIEW IF EXISTS contrast_ppep_smpl_qnlt;
+ "
+ )
+ # - create view
+ dml_no_rows_exec(db, "
+ -- view contrast_ppep_smpl_qnlt is used for each phopshopep to
+ -- compute p-value for test of trt effect for two trt levels
+ CREATE VIEW contrast_ppep_smpl_qnlt
+ AS
+ SELECT contrast,
+ level,
+ phosphopep,
+ sample,
+ quant
+ FROM contrast AS c,
+ ppep_smpl_qnlt AS q
+ WHERE q.sample = c.label
+ ORDER BY contrast, level, phosphopep
+ ;
+ "
+ )
+ # - create simplification views
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_contrast_lvl_metadata
+ AS
+ SELECT contrast,
+ level,
+ group_concat(label, ';') AS samples
+ FROM contrast
+ GROUP BY contrast, level
+ /* view v_contrast_lvl_metadata is used
+ to simplify creation of table contrast_lvl_lvl_metadata */
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_contrast_lvl_ppep_avg_quant
+ AS
+ SELECT contrast,
+ level,
+ phosphopep,
+ avg(quant) AS avg_quant
+ FROM contrast_ppep_smpl_qnlt
+ GROUP BY contrast, level, phosphopep
+ /* view v_contrast_lvl_ppep_avg_quant is used
+ to simplify view v_contrast_log2_fc */
+ ;
+ "
+ )
+
+ # - create contrast-metadata table
+ dml_no_rows_exec(db, "
+ CREATE TABLE contrast_lvl_lvl_metadata
+ AS
+ SELECT DISTINCT
+ a.contrast AS ab_contrast,
+ a.level AS a_level,
+ b.level AS b_level,
+ a.samples AS a_samples,
+ b.samples AS b_samples,
+ 'log2(level_'||a.level||'/level_'||b.level||')'
+ AS fc_description
+ FROM v_contrast_lvl_metadata AS a,
+ v_contrast_lvl_metadata AS b
+ WHERE a.contrast = b.contrast
+ AND a.level > b.level
+ /* view v_contrast_lvl_lvl is used
+ to simplify view v_contrast_log2_fc */
+ ;
+ "
+ )
+ # - create pseudo-materialized view table
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_contrast_lvl_lvl
+ AS
+ SELECT DISTINCT
+ a.contrast AS ab_contrast,
+ a.level AS a_level,
+ b.level AS b_level
+ FROM contrast AS a,
+ contrast AS b
+ WHERE a.contrast = b.contrast
+ AND a.level > b.level
+ /* view v_contrast_lvl_lvl is used
+ to simplify view v_contrast_log2_fc */
+ ;
+ "
+ )
+
+ # - create view to compute log2(fold-change)
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_contrast_log2_fc
+ AS
+ SELECT ab.ab_contrast AS contrast,
+ m.a_level AS a_level,
+ c.avg_quant AS a_quant,
+ m.a_samples AS a_samples,
+ ab.b_level AS b_level,
+ d.avg_quant AS b_quant,
+ m.b_samples AS b_samples,
+ m.fc_description AS fc_description,
+ 3.32193 * ( d.avg_quant - c.avg_quant) AS log2_fc,
+ d.phosphopep AS phosphopep
+ FROM contrast_lvl_lvl_metadata AS m,
+ v_contrast_lvl_ppep_avg_quant AS d,
+ v_contrast_lvl_lvl AS ab
+ INNER JOIN v_contrast_lvl_ppep_avg_quant AS c
+ ON c.contrast = ab.ab_contrast
+ AND c.level = ab.a_level
+ WHERE d.contrast = ab.ab_contrast
+ AND m.ab_contrast = ab.ab_contrast
+ AND d.level = ab.b_level
+ AND d.phosphopep = c.phosphopep
+ /* view to compute log2(fold-change) */
+ ;
+ "
+ )
+
+ # For each contrast, compute samples that are members
+ # compute one-way test:
+ # - use `oneway.test` (Welch test) if numbers of samples
+ # are not equivalent between trt levels
+ # - otherwise, aov is fine but offers no advantage
+ for (contrast in contrast_count:2) {
+ invisible(contrast)
+ }
+ for (contrast in 1:contrast_count) {
+ contrast_df <- sqldf::sqldf(
+ x = paste0("
+ SELECT level, phosphopep, sample, quant
+ FROM contrast_ppep_smpl_qnlt
+ WHERE contrast = ", contrast, "
+ ORDER BY phosphopep, level, sample
+ "),
+ connection = db
+ )
+ contrast_cast <- reshape2::dcast(
+ data = contrast_df,
+ formula = phosphopep ~ sample,
+ value.var = "quant"
+ )
+ contrast_cast_ncol <- ncol(contrast_cast)
+ contrast_cast_data <- contrast_cast[, 2:contrast_cast_ncol]
+ contrast_cast_samples <- colnames(contrast_cast_data)
+
+ # - order grouping_factor by order of sample columns of contrast_cast_data
+ grouping_factor <- sqldf::sqldf(
+ x = paste0("
+ SELECT sample, level
+ FROM contrast_ppep_smpl_qnlt
+ WHERE contrast = ", contrast, "
+ ORDER BY phosphopep, level, sample
+ LIMIT ", contrast_cast_ncol - 1
+ ),
+ connection = db
+ )
+ rownames(grouping_factor) <- grouping_factor$sample
+ grouping_factor <- grouping_factor[, "level", drop = FALSE]
+
+ # - run the two-level (one-way) test
+ p_value_data_contrast_ps <-
+ apply(
+ X = contrast_cast_data,
+ MARGIN = 1, # apply to rows
+ FUN = anova_func,
+ grouping_factor =
+ as.factor(as.numeric(grouping_factor$level)), # anova_func arg2
+ one_way_f = one_way_two_categories, # anova_func arg3
+ simplify = TRUE # TRUE is the default for simplify
+ )
+ contrast_data_adj_p_values <- p.adjust(
+ p = p_value_data_contrast_ps,
+ method = "fdr",
+ n = length(p_value_data_contrast_ps) # this is the default, length(p)
+ )
+ # - compute the fold-change
+ contrast_p_df <-
+ data.frame(
+ contrast = contrast,
+ phosphopep = contrast_cast$phosphopep,
+ p_value_raw = p_value_data_contrast_ps,
+ p_value_adj = contrast_data_adj_p_values
+ )
+ db_write_table_overwrite <- (contrast < 2)
+ db_write_table_append <- !db_write_table_overwrite
+ RSQLite::dbWriteTable(
+ conn = db,
+ name = "contrast_ppep_p_val",
+ value = contrast_p_df,
+ append = db_write_table_append
+ )
+ # Create UK for insert
+ ddl_exec(db, "
+ CREATE UNIQUE INDEX IF NOT EXISTS contrast_ppep_p_val__uk__idx
+ ON contrast_ppep_p_val(phosphopep, contrast);
+ "
+ )
+ }
+ # Perhaps this could be done more elegantly using unique keys
+ # or creating the tables before saving data to them, but this
+ # is fast and, if the database exists on disk rather than in
+ # memory, it doesn't stress memory.
+ dml_no_rows_exec(db, "
+ CREATE TEMP table contrast_log2_fc
+ AS
+ SELECT *
+ FROM v_contrast_log2_fc
+ ORDER BY contrast, phosphopep
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE TEMP table ppep_p_val
+ AS
+ SELECT p_value_raw,
+ p_value_adj,
+ contrast AS p_val_contrast,
+ phosphopep AS p_val_ppep
+ FROM contrast_ppep_p_val
+ ORDER BY contrast, phosphopep
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ DROP TABLE IF EXISTS contrast_log2_fc_p_val
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE TABLE contrast_log2_fc_p_val
+ AS
+ SELECT a.*,
+ b.p_value_raw,
+ b.p_value_adj,
+ b.p_val_contrast,
+ b.p_val_ppep
+ FROM contrast_log2_fc a, ppep_p_val b
+ WHERE a.rowid = b.rowid
+ AND a.phosphopep = b.p_val_ppep
+ ;
+ "
+ )
+ # Create UK
+ ddl_exec(db, "
+ CREATE UNIQUE INDEX IF NOT EXISTS contrast_log2_fc_p_val__uk__idx
+ ON contrast_log2_fc_p_val(phosphopep, contrast);
+ "
+ )
+ # Create indices for future queries
+ ddl_exec(db, "
+ CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__contrast__idx
+ ON contrast_log2_fc_p_val(contrast);
+ "
+ )
+ ddl_exec(db, "
+ CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__phosphopep__idx
+ ON contrast_log2_fc_p_val(phosphopep);
+ "
+ )
+ ddl_exec(db, "
+ CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_raw__idx
+ ON contrast_log2_fc_p_val(p_value_raw);
+ "
+ )
+ ddl_exec(db, "
+ CREATE INDEX IF NOT EXISTS contrast_log2_fc_p_val__p_value_adj__idx
+ ON contrast_log2_fc_p_val(p_value_adj);
+ "
+ )
+ dml_no_rows_exec(db, "
+ DROP VIEW IF EXISTS v_contrast_log2_fc_p_val
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_contrast_log2_fc_p_val
+ AS
+ SELECT contrast,
+ a_level,
+ a_samples,
+ b_level,
+ b_samples,
+ a_quant,
+ b_quant,
+ fc_description,
+ log2_fc,
+ p_value_raw,
+ p_value_adj,
+ phosphopep
+ FROM contrast_log2_fc_p_val
+ ORDER BY contrast, phosphopep
+ ;
+ "
+ )
+ ddl_exec(db, "
+ DROP TABLE IF EXISTS kseaapp_metadata
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE TABLE kseaapp_metadata
+ AS
+ WITH extended(deppep, ppep, gene_name, uniprot_id, phosphoresidue) AS (
+ SELECT DISTINCT
+ deppep.seq,
+ ppep.seq,
+ GeneName||';',
+ UniProtID||';',
+ PhosphoResidue||';'
+ FROM
+ ppep, deppep, mrgfltr_metadata
+ WHERE
+ mrgfltr_metadata.ppep_id = ppep.id
+ AND
+ ppep.deppep_id = deppep.id
+ )
+ SELECT
+ ppep AS `ppep`,
+ SUBSTR(uniprot_id, 1, INSTR(uniprot_id,';') - 1 ) AS `Protein`,
+ SUBSTR(gene_name, 1, INSTR(gene_name,';') - 1 ) AS `Gene`,
+ deppep AS `Peptide`,
+ REPLACE(
+ REPLACE(
+ SUBSTR(phosphoresidue, 1, INSTR(phosphoresidue,';') - 1 ),
+ 'p',
+ ''
+ ),
+ ', ',
+ ';'
+ ) AS `Residue.Both`
+ FROM extended
+ ;
+ "
+ )
+ # Create indexes for join
+ ddl_exec(db, "
+ CREATE INDEX IF NOT EXISTS kseaapp_metadata__ppep__idx
+ ON kseaapp_metadata(ppep);
+ "
+ )
+ ddl_exec(db, "
+ DROP VIEW IF EXISTS v_kseaapp_contrast
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_kseaapp_contrast
+ AS
+ SELECT a.*, b.Protein, b.Gene, b.Peptide, b.`Residue.Both`
+ FROM v_contrast_log2_fc_p_val a, kseaapp_metadata b
+ WHERE b.ppep = a.phosphopep
+ ;
+ "
+ )
+ ddl_exec(db, "
+ DROP VIEW IF EXISTS v_kseaapp_input
+ ;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE VIEW v_kseaapp_input
+ AS
+ SELECT v.contrast,
+ v.phosphopep,
+ m.`Protein`,
+ m.`Gene`,
+ m.`Peptide`,
+ m.`Residue.Both`,
+ v.p_value_raw AS `p`,
+ v.log2_fc AS `FC`
+ FROM kseaapp_metadata AS m,
+ v_contrast_log2_fc_p_val AS v
+ WHERE m.ppep = v.phosphopep
+ AND NOT m.`Gene` = 'No_Gene_Name'
+ AND NOT v.log2_fc = 0
+ ;
+ "
+ )
+}
+```
+
+```{r echo = FALSE, results = 'asis'}
+cat("\\newpage\n")
+```
+
+# KSEA Analysis
+
+Results of Kinase-Substrate Enrichment Analysis are presented here, if the substrates for any kinases are relatively enriched. Enrichments are found by the CRAN `KSEAapp` package:
+
+- The package is available on CRAN, at https:/cran.r-project.org/package=KSEAapp
+- The method used is described in Casado et al. (2013) [doi:10.1126/scisignal.2003573](https:/doi.org/10.1126/scisignal.2003573) and Wiredja et al (2017) [doi:10.1093/bioinformatics/btx415](https:/doi.org/10.1093/bioinformatics/btx415).
+- An online alternative (supporting only analysis of human data) is available at [https:/casecpb.shinyapps.io/ksea/](https:/casecpb.shinyapps.io/ksea/).
+
+For each kinase, $i$, and each two-way contrast of treatments, $j$, an enrichment $z$-score is computed as:
+
+$$
+\text{kinase enrichment score}_{j,i} = \frac{(\overline{s}_{j,i} - \overline{p}_j)\sqrt{m_{j,i}}}{\delta_j}
+$$
+
+and fold-enrichment is computed as:
+
+$$
+\text{Enrichment}_{j,i} = \frac{\overline{s}_{j,i}}{\overline{p}_j}
+$$
+
+where:
+
+- $\overline{s}_{j,i}$ is the mean $\log_2 (|\text{fold-change|})$ in intensities (for contrast $j$) of known substrates of the kinase $i$,
+- $\overline{p}_j$ is the mean $\log_2 (|\text{fold-change}|)$ of all phosphosites identified in contrast $j$, and
+- $m_{j,i}$ is the total number of phosphosite substrates of kinase $i$ identified in contrast $j$,
+- $\delta_j$ is the standard deviation of the $\log_2 (|\text{fold-change}|)$ for contrast $j$ across all phosphosites in the dataset.
+- Note that the absolute value of fold-change is used so that both increased and decreased substrates of a kinase will contribute to its enrichment score.
+
+$\text{FDR}_{j,i}$ is computed from the $p$-value for the z-score using the R `stats::p.adjust` function, applying the False Discovery Rate correction from Benjamini and Hochberg (1995) [doi:10.1111/j.2517-6161.1995.tb02031.x](https:/doi.org/10.1111/j.2517-6161.1995.tb02031.x)
+
+Color intensity in heatmaps reflects magnitude of $z$-score for enrichment of respective kinase in respective contrast; hue reflects the sign of the $z$-score (blue, negative; red, positive).
+
+Asterisks in heatmaps reflect enrichments that are significant at `r ksea_cutoff_statistic` < `r ksea_cutoff_threshold`.
+
+- Kinase names are generally as presented at Phospho.ELM [http://phospho.elm.eu.org/kinases.html](http://phospho.elm.eu.org/kinases.html) (when available), although Phospho.ELM data are not yet incorporated into this analysis.
+- Kinase names having the suffix '(HPRD)' are as presented at [http://hprd.org/serine_motifs](http://hprd.org/serine_motifs) and [http://hprd.org/tyrosine_motifs](http://hprd.org/tyrosine_motifs) and are as originally reported in the Amanchy et al., 2007 (doi: [10.1038/nbt0307-285](https://doi.org/10.1038/nbt0307-285)).
+- Kinase-strate deata were also taken from [http://networkin.science/download.shtml](http://networkin.science/download.shtml) and from PhosphoSitePlus [https://www.phosphosite.org/staticDownloads](https://www.phosphosite.org/staticDownloads).
+
+```{r ksea, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+db <- RSQLite::dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+
+# -- eliminate the table that's about to be defined
+ddl_exec(db, "
+DROP TABLE IF EXISTS site_metadata;
+")
+
+# -- define the site_metadata table
+ddl_exec(db, "
+CREATE TABLE site_metadata(
+ id INTEGER PRIMARY KEY
+, site_type_id INTEGER REFERENCES site_type(id)
+, full TEXT UNIQUE ON CONFLICT IGNORE
+, abbrev TEXT
+, pattern TEXT
+, motif TEXT
+);
+")
+
+# -- populate the table with initial values
+ddl_exec(db, "
+INSERT INTO site_metadata(full, abbrev, site_type_id)
+ SELECT DISTINCT kinase_map, kinase_map, site_type_id
+ FROM ppep_gene_site
+ ORDER BY kinase_map;
+")
+
+# -- drop bogus KSData view if exists
+ddl_exec(db, "
+DROP VIEW IF EXISTS ks_data_v;
+")
+
+# -- create view to serve as an impostor for KSEAapp::KSData
+ddl_exec(db, "
+CREATE VIEW IF NOT EXISTS ks_data_v
+AS
+SELECT
+ 'NA' AS KINASE,
+ 'NA' AS KIN_ACC_ID,
+ kinase_map AS GENE,
+ 'NA' AS KIN_ORGANISM,
+ 'NA' AS SUBSTRATE,
+ 0 AS SUB_GENE_ID,
+ 'NA' AS SUB_ACC_ID,
+ gene_names AS SUB_GENE,
+ 'NA' AS SUB_ORGANISM,
+ phospho_peptide AS SUB_MOD_RSD,
+ 0 AS SITE_GROUP_ID,
+ 'NA' AS 'SITE_7AA',
+ 2 AS networkin_score,
+ type_name AS Source
+FROM ppep_gene_site_view;
+")
+
+contrast_metadata_df <-
+ sqldf::sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
+rslt <- new_env()
+rslt$score_list <- list()
+rslt$name_list <- list()
+rslt$longname_list <- list()
+
+ddl_exec(db, "
+ DROP TABLE IF EXISTS contrast_ksea_scores;
+ "
+)
+
+next_index <- 0
+err_na_subscr_df_const <-
+ "missing values are not allowed in subscripted assignments of data frames"
+
+for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
+ cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+ cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+ cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+ contrast_longlabel <- (
+ sprintf(
+ "Trt %s {%s} -> Trt %s {%s}",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ gsub(
+ pattern = ";",
+ replacement = ", ",
+ x = contrast_metadata_df[i_cntrst, "b_samples"],
+ fixed = TRUE
+ ),
+ contrast_metadata_df[i_cntrst, "a_level"],
+ gsub(
+ pattern = ";",
+ replacement = ", ",
+ x = contrast_metadata_df[i_cntrst, "a_samples"],
+ fixed = TRUE
+ )
+ )
+ )
+ kseaapp_input <-
+ sqldf::sqldf(
+ x = sprintf("
+ SELECT `Protein`, `Gene`, `Peptide`, phosphopep AS `Residue.Both`, `p`, `FC`
+ FROM v_kseaapp_input
+ WHERE contrast = %d
+ ",
+ i_cntrst
+ ),
+ connection = db
+ )
+
+ pseudo_ksdata <- dbReadTable(db, "ks_data_v")
+
+ # This hack is because SQL table has the log2-transformed values
+ kseaapp_input[, "FC"] <- 2 ** kseaapp_input[, "FC", drop = TRUE]
+ main_title <- (
+ sprintf(
+ "Change from treatment %s to treatment %s",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ contrast_metadata_df[i_cntrst, "a_level"]
+ )
+ )
+ sub_title <- contrast_longlabel
+ tryCatch(
+ expr = {
+ ksea_scores_rslt <-
+ ksea_scores(
+ ksdata = pseudo_ksdata, # KSEAapp::KSData,
+ px = kseaapp_input,
+ networkin = TRUE,
+ networkin_cutoff = 2
+ )
+
+ if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+ next_index <- 1 + next_index
+ rslt$score_list[[next_index]] <- ksea_scores_rslt
+ rslt$name_list[[next_index]] <- contrast_label
+ rslt$longname_list[[next_index]] <- contrast_longlabel
+ low_fdr_print(
+ rslt = rslt,
+ i_cntrst = i_cntrst,
+ i = next_index,
+ a_level = cntrst_a_level,
+ b_level = cntrst_b_level,
+ fold_change = cntrst_fold_change,
+ caption = contrast_longlabel
+ )
+ }
+ },
+ error = function(e) str(e)
+ )
+}
+
+plotted_kinases <- NULL
+if (length(rslt$score_list) > 1) {
+ for (i in seq_len(length(ksea_heatmap_titles))) {
+ hdr <- ksea_heatmap_titles[[i]]
+ which_kinases <- i
+
+ cat("\\clearpage\n\\begin{center}\n")
+ if (i == const_ksea_astrsk_kinases) {
+ subsection_header(hdr)
+ } else {
+ subsection_header(hdr)
+ }
+ cat("\\end{center}\n")
+
+ plotted_kinases <- ksea_heatmap(
+ # the data frame outputs from the KSEA.Scores() function, in list format
+ score_list = rslt$score_list,
+ # a character vector of all the sample names for heatmap annotation:
+ # - the names must be in the same order as the data in score_list
+ # - please avoid long names, as they may get cropped in the final image
+ sample_labels = rslt$name_list,
+ # character string of either "p.value" or "FDR" indicating the data column
+ # to use for marking statistically significant scores
+ stats = c("p.value", "FDR")[2],
+ # a numeric value between 0 and infinity indicating the min. number of
+ # substrates a kinase must have to be included in the heatmap
+ m_cutoff = 1,
+ # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+ # for indicating significant kinases in the heatmap
+ p_cutoff = 0.05,
+ # a binary input of TRUE or FALSE, indicating whether or not to perform
+ # hierarchical clustering of the sample columns
+ sample_cluster = TRUE,
+ # a binary input of TRUE or FALSE, indicating whether or not to export
+ # the heatmap as a .png image into the working directory
+ export = FALSE,
+ # additional arguments to gplots::heatmap.2, such as:
+ # - main: main title of plot
+ # - xlab: x-axis label
+ # - ylab: y-axis label
+ xlab = "Contrast",
+ ylab = "Kinase",
+ # print which kinases:
+ # - 1 : all kinases
+ # - 2 : significant kinases
+ # - 3 : non-significant kinases
+ which_kinases = which_kinases
+ )
+ cat("\\begin{center}\n")
+ cat("Color intensities reflects $z$-score magnitudes; hue reflects $z$-score sign. Asterisks reflect significance.\n")
+ cat("\\end{center}\n")
+ } # end for (i in ...
+} # end if (length ...
+
+for (i_cntrst in seq_len(length(rslt$score_list))) {
+ next_index <- i_cntrst
+ cntrst_a_level <- contrast_metadata_df[i_cntrst, "a_level"]
+ cntrst_b_level <- contrast_metadata_df[i_cntrst, "b_level"]
+ cntrst_fold_change <- contrast_metadata_df[i_cntrst, 6]
+ contrast_label <- sprintf("%s -> %s", cntrst_b_level, cntrst_a_level)
+ contrast_longlabel <- (
+ sprintf(
+ "Trt %s {%s} -> Trt %s {%s}",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ gsub(
+ pattern = ";",
+ replacement = ", ",
+ x = contrast_metadata_df[i_cntrst, "b_samples"],
+ fixed = TRUE
+ ),
+ contrast_metadata_df[i_cntrst, "a_level"],
+ gsub(
+ pattern = ";",
+ replacement = ", ",
+ x = contrast_metadata_df[i_cntrst, "a_samples"],
+ fixed = TRUE
+ )
+ )
+ )
+ main_title <- (
+ sprintf(
+ "Change from treatment %s to treatment %s",
+ contrast_metadata_df[i_cntrst, "b_level"],
+ contrast_metadata_df[i_cntrst, "a_level"]
+ )
+ )
+ sub_title <- contrast_longlabel
+ tryCatch(
+ expr = {
+ ksea_scores_rslt <- rslt$score_list[[next_index]]
+
+ if (0 < sum(!is.nan(ksea_scores_rslt$FDR))) {
+ low_fdr_barplot(
+ rslt = rslt,
+ i_cntrst = i_cntrst,
+ i = next_index,
+ a_level = cntrst_a_level,
+ b_level = cntrst_b_level,
+ fold_change = cntrst_fold_change,
+ caption = contrast_longlabel
+ )
+ }
+ },
+ error = function(e) str(e)
+ )
+}
+```
+
+```{r enriched, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+
+# Use enriched kinases to find enriched kinase-substrate pairs
+enriched_kinases <- data.frame(kinase = ls(ksea_asterisk_hash))
+all_enriched_substrates <- sqldf("
+ SELECT
+ gene AS kinase,
+ ppep,
+ '('||group_concat(gene||'-'||sub_gene)||') '||ppep AS label
+ FROM (
+ SELECT DISTINCT gene, sub_gene, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ )
+ GROUP BY ppep
+ ")
+
+# helper used to label per-kinase substrate enrichment figure
+cat_enriched_heading <- function(m, cut_args) {
+ cutoff <- cut_args$cutoff
+ kinase <- cut_args$kinase
+ statistic <- cut_args$statistic
+ threshold <- cut_args$threshold
+ cat("\\newpage\n")
+ if (nrow(m) > intensity_hm_rows) {
+ subsection_header(
+ paste(
+ sprintf(
+ "Lowest p-valued %d (of %d) enriched %s-substrates,",
+ intensity_hm_rows,
+ nrow(m),
+ kinase
+ ),
+ sprintf(" KSEA %s < %0.2f\n", statistic, threshold)
+ )
+ )
+ } else {
+ if (nrow(m) == 1) {
+ return(FALSE)
+ } else {
+ subsection_header(
+ paste(
+ sprintf(
+ "%d enriched %s-substrates,",
+ nrow(m),
+ kinase
+ ),
+ sprintf(
+ " KSEA %s < %0.2f\n",
+ statistic,
+ threshold
+ )
+ )
+ )
+ }
+ }
+ cat("\n\n\n")
+ cat("\n\n\n")
+ return(TRUE)
+}
+
+# Disabling heatmaps for substrates pending decision whether to eliminate them altogether
+if (FALSE)
+ for (kinase_name in sort(enriched_kinases$kinase)) {
+ enriched_substrates <-
+ all_enriched_substrates[
+ all_enriched_substrates$kinase == kinase_name,
+ ,
+ drop = FALSE
+ ]
+ # Get the intensity values for the heatmap
+ enriched_intensities <-
+ as.matrix(unimputed_quant_data_log[enriched_substrates$ppep, , drop = FALSE])
+ # Remove rows having too many NA values to be relevant
+ na_counter <- is.na(enriched_intensities)
+ na_counts <- apply(na_counter, 1, sum)
+ enriched_intensities <-
+ enriched_intensities[na_counts < ncol(enriched_intensities) / 2, , drop = FALSE]
+ # Rename the rows with the display-name for the heatmap
+ rownames(enriched_intensities) <-
+ sapply(
+ X = rownames(enriched_intensities),
+ FUN = function(rn) {
+ enriched_substrates[enriched_substrates$ppep == rn, "label"]
+ }
+ )
+ # Format as matrix for heatmap
+ m <- as.matrix(enriched_intensities)
+ # Draw the heading and heatmap
+ if (nrow(m) > 0) {
+ cut_args <- new_env()
+ cut_args$cutoff <- cutoff
+ cut_args$kinase <- kinase_name
+ cut_args$statistic <- ksea_cutoff_statistic
+ cut_args$threshold <- ksea_cutoff_threshold
+ number_of_peptides_found <-
+ draw_intensity_heatmap(
+ m = m,
+ cutoff = cut_args,
+ hm_heading_function = cat_enriched_heading,
+ hm_main_title
+ = "Unnormalized (zero-imputed) intensities of enriched kinase-substrates",
+ suppress_row_dendrogram = FALSE
+ )
+ }
+ }
+
+# Write output tabular files
+
+# get kinase, ppep, concat(kinase) tuples for enriched kinases
+
+kinase_ppep_label <- sqldf("
+ WITH
+ t(ppep, label) AS
+ (
+ SELECT DISTINCT
+ SUB_MOD_RSD AS ppep,
+ group_concat(gene, '; ') AS label
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ GROUP BY ppep
+ ),
+ k(kinase, ppep_join) AS
+ (
+ SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep_join
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ )
+ SELECT k.kinase, t.ppep, t.label
+ FROM t, k
+ WHERE t.ppep = k.ppep_join
+ ORDER BY k.kinase, t.ppep
+ ")
+
+# extract what we need from full_data
+impish <- cbind(rownames(quant_data_imp), quant_data_imp)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed_sql <- "
+ SELECT
+ f.*,
+ k.label AS KSEA_enrichments,
+ q.*
+ FROM
+ metadata_plus_p f
+ LEFT JOIN kinase_ppep_label k
+ ON f.Phosphopeptide = k.ppep,
+ impish q
+ WHERE
+ f.Phosphopeptide = q.Phosphopeptide
+ "
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+data_table_imputed <-
+ data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+
+# Output with imputed, un-normalized data
+
+write.table(
+ data_table_imputed
+ , file = imputed_data_filename
+ , sep = "\t"
+ , col.names = TRUE
+ , row.names = FALSE
+ , quote = FALSE
+ )
+
+
+#output quantile normalized data
+impish <- cbind(rownames(quant_data_imp_qn_log), quant_data_imp_qn_log)
+colnames(impish)[1] <- "Phosphopeptide"
+data_table_imputed <- sqldf(data_table_imputed_sql)
+# Zap the duplicated 'Phosphopeptide' column named 'ppep'
+data_table_imputed <-
+ data_table_imputed[, c(1:12, 14:ncol(data_table_imputed))]
+write.table(
+ data_table_imputed,
+ file = imp_qn_lt_data_filenm,
+ sep = "\t",
+ col.names = TRUE,
+ row.names = FALSE,
+ quote = FALSE
+)
+
+ppep_kinase <- sqldf("
+ SELECT DISTINCT k.ppep, k.kinase
+ FROM (
+ SELECT DISTINCT gene AS kinase, SUB_MOD_RSD AS ppep
+ FROM pseudo_ksdata
+ WHERE GENE IN (SELECT kinase FROM enriched_kinases)
+ ) k
+ ORDER BY k.ppep, k.kinase
+ ")
+
+RSQLite::dbWriteTable(
+ conn = db,
+ name = "ksea_enriched_ks",
+ value = ppep_kinase,
+ append = FALSE
+ )
+
+RSQLite::dbWriteTable(
+ conn = db,
+ name = "anova_signif",
+ value = p_value_data,
+ append = FALSE
+ )
+
+ ddl_exec(db, "
+ DROP VIEW IF EXISTS stats_metadata_v;
+ "
+ )
+ dml_no_rows_exec(db, "
+ CREATE VIEW stats_metadata_v
+ AS
+ SELECT DISTINCT m.*,
+ p.raw_anova_p,
+ p.fdr_adjusted_anova_p,
+ kek.kinase AS ksea_enrichments
+ FROM
+ mrgfltr_metadata_view m
+ LEFT JOIN anova_signif p
+ ON m.phospho_peptide = p.phosphopeptide
+ LEFT JOIN ksea_enriched_ks kek
+ ON m.phospho_peptide = kek.ppep
+ ;
+ "
+ )
+
+write.table(
+ dbReadTable(db, "stats_metadata_v"),
+ file = anova_ksea_mtdt_file,
+ sep = "\t",
+ col.names = TRUE,
+ row.names = FALSE,
+ quote = FALSE
+ )
+
+
+```
+
+```{r parmlist, echo = FALSE, fig.dim = c(9, 10), results = 'asis'}
+cat("\\leavevmode\n\n\n")
+
+# write parameters to report
+
+param_unlist <- unlist(as.list(params))
+param_df <- data.frame(
+ parameter = paste0("\\verb@", names(param_unlist), "@"),
+ value = paste0("\\verb@", gsub("$", "\\$", param_unlist, fixed = TRUE), "@")
+ )
+
+data_frame_latex(
+ x = param_df,
+ justification = "p{0.35\\linewidth} p{0.6\\linewidth}",
+ centered = TRUE,
+ caption = "Input parameters",
+ anchor = const_table_anchor_bp,
+ underscore_whack = FALSE
+ )
+
+# write parameters to SQLite output
+
+mqppep_anova_script_param_df <- data.frame(
+ script = "mqppep_anova_script.Rmd",
+ parameter = names(param_unlist),
+ value = param_unlist
+ )
+ddl_exec(db, "
+ DROP TABLE IF EXISTS script_parameter;
+ "
+)
+ddl_exec(db, "
+ CREATE TABLE IF NOT EXISTS script_parameter(
+ script TEXT,
+ parameter TEXT,
+ value ANY,
+ UNIQUE (script, parameter) ON CONFLICT REPLACE
+ )
+ ;
+ "
+)
+RSQLite::dbWriteTable(
+ conn = db,
+ name = "script_parameter",
+ value = mqppep_anova_script_param_df,
+ append = TRUE
+)
+
+# We are done with output
+RSQLite::dbDisconnect(db)
+```
+
diff -r f3deca1a3c84 -r 61adb8801b73 test-data/test_input_for_anova.tabular
--- a/test-data/test_input_for_anova.tabular Wed Apr 13 19:48:32 2022 +0000
+++ b/test-data/test_input_for_anova.tabular Thu Jun 30 16:16:32 2022 +0000
@@ -1,23 +1,24 @@
Phosphopeptide Sequence10 Sequence7 Gene_Name Phosphoresidue UniProt_ID Description Function Phosphoresidue(PSP=PhosphoSitePlus.org) Putative Upstream Kinases(PSP=PhosphoSitePlus.org)/Phosphatases/Binding Domains Intensity.shL.1A Intensity.shL.1B Intensity.shL.1C Intensity.shR.2A Intensity.shR.2B Intensity.shR.2C
-AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | BARD1 BRCT domain binding | PKA | CK1 | CK2 38150000 39445000 56305000 55338000 7010600 70203000
+AAAAPDSRVpSEEENLK MAAAAPDSRVpSEEENLKKTPK AAPDSRVsEEENLKK RRP15 pS11 Q9Y3B9 RRP15_HUMAN RRP15-like protein OS=Homo sapiens OX=9606 GN=RRP15 PE=1 SV=2 N/A CK2alpha | BARD1 Q99728 38150000 39445000 56305000 55338000 7010600 70203000
AAAITDMADLEELSRLpSPLPPGpSPGSAAR MADLEELSRLpSPLPPGSPGSA; LSRLSPLPPGpSPGSAARGRAE LEELSRLsPLPPGSP | LSPLPPGsPGSAARG AEBP2; AEBP2 pS18, pS24; pS18, pS24 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A N/A 5416400 7101800 385280000 208060000 41426000 352400000
ADALQAGASQFETpSAAK LQAGASQFETpSAAKLKRKYWW GASQFETsAAKLKRK VAMP2; VAMP3 pS80; pS63 P63027; Q15836 VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKD3 | PKCiota 44627000 41445000 69094000 42521000 5738000 61819000
-DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Pyruvate dehydrogenase kinase substrate 75542000 44814000 32924000 35016000 11023000 4669900
-EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | Casein kinase II substrate | G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate | CK2 | GSK3 12562000 16302000 23000000 7857800 0 18830000
-EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | Casein kinase II substrate | b-Adrenergic Receptor kinase substrate | Pyruvate dehydrogenase kinase substrate 0 0 0 0 0 0
-IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | b-Adrenergic Receptor kinase substrate | WW domain binding | ERK/MAPK | CK2 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000
-IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | CK2 22562000 18225000 9119700 11689000 0 0
+DQKLpSELDDR DKVLERDQKLpSELDDRADALQ LERDQKLsELDDRAD VAMP1; VAMP1; VAMP1; VAMP2; VAMP3 pS63; pS63; pS63; pS61; pS44 P23763; P23763-2; P23763-3; P63027; Q15836 VAMP1_HUMAN_Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1 PE=1 SV=1; VAMP1_HUMAN_Isoform 3 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP1_HUMAN_Isoform 2 of Vesicle-associated membrane protein 1 OS=Homo sapiens OX=9606 GN=VAMP1; VAMP2_HUMAN_Vesicle-associated membrane protein 2 OS=Homo sapiens OX=9606 GN=VAMP2 PE=1 SV=3; VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A CK2alpha | PKAbeta | PKAgamma | PKCiota | PDHK1 75542000 44814000 32924000 35016000 11023000 4669900
+EFVpSSDESSSGENK SESFKSKEFVpSSDESSSGENK FKSKEFVsSDESSSG SSRP1 pS667 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | GSK3 12562000 16302000 23000000 7857800 0 18830000
+EGMNPSYDEYADpSDEDQHDAYLER MNPSYDEYADpSDEDQHDAYLE SYDEYADsDEDQHDA SSRP1 pS444 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2alpha | CK2a2 | CDK7 | CK1alpha | GRK-2 | PDHK1 0 0 0 0 0 0
+IGNEEpSDLEEACILPHpSPINVDK DDEEKIGNEEpSDLEEACILPH; DLEEACILPHpSPINVDKRPIA EKIGNEEsDLEEACI | EACILPHsPINVDKR HERC2 pS1577, pS1588 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha | GRK-2 | DOC_WW_Pin1_4 | NEK6 167764000 121218000 155736000 140640000 83642000 128468000
+IRAEEEDLAAVPFLApSDNEEEEDEK EDLAAVPFLApSDNEEEEDEKG AAVPFLAsDNEEEED HERC2 pS2928 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A CK2alpha 22562000 18225000 9119700 11689000 0 0
KGLLApTpSGNDGTIR VWCNKKGLLApTSGNDGTIRVW; WCNKKGLLATpSGNDGTIRVWN NKKGLLAtSGNDGTI | KKGLLATsGNDGTIR HERC1 pT3445, pS3446 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A N/A 7843600 0 241700000 0 0 10042600
-KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A G protein-coupled receptor kinase 1 substrate 0 0 18629000 0 0 0
-KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | Casein Kinase I substrate | MDC1 BRCT domain binding | GSK3 | AURORA 7090300 8341200 9691500 10030000 1675200 9952100
-LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 0 11706000 12495000 0 7273000 8877800
-NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | Casein kinase II substrate | Casein Kinase I substrate | b-Adrenergic Receptor kinase substrate | BARD1 BRCT domain binding | CK2 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000
-pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A ATM kinase substrate | PKC kinase substrate | PKA kinase substrate 0 0 8765300 0 2355900 14706000
-QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | PKC kinase substrate | PKA kinase substrate | NEK6 68201000 87774000 138300000 95357000 19966000 149110000
-RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A Casein Kinase II substrate | G protein-coupled receptor kinase 1 substrate | PKC kinase substrate | PKA kinase substrate | PKA | GSK3 | AURORA 19262000 11103000 19454000 0 1816900 22028000
-SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A Casein kinase II substrate | Plk1 kinase substrate | Pyruvate dehydrogenase kinase substrate | CK1 | PLK | PLK1 31407000 17665000 20892000 23194000 5132400 54893000
-SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A b-Adrenergic Receptor kinase substrate 11766000 13176000 20540000 16963000 4364700 21308000
-STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta | PKC kinase substrate | PKA kinase substrate 3057100 4718800 12052000 5047700 1070900 8333500
-TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion G protein-coupled receptor kinase 1 substrate | Casein Kinase I substrate 20400000 9738500 7862300 0 0 76518000
-TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | Casein Kinase II substrate | ERK1, ERK2 Kinase substrate | GSK-3, ERK1, ERK2, CDK5 substrate | WW domain binding 29764000 20957000 24855000 30752000 8304800 23771000
-YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) b-Adrenergic Receptor kinase substrate 323250000 127970000 0 67123000 12790000 71378000
+KpSSLVTSK PTPQDLPQRKpSSLVTSKLAGG; PTPQDLPQRKpSSLVTSKLAG QDLPQRKsSLVTSKL ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS108; pS108; pS124; pS131; pS104; pS104; pS120; pS124 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 18629000 0 0 0
+KSpSLVTSK TPQDLPQRKSpSLVTSKLAGGQ; TPQDLPQRKSpSLVTSKLAG DLPQRKSsLVTSKLA ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS109; pS109; pS125; pS132; pS105; pS105; pS121; pS125 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; protein conformation; SNCA(DISRUPTS) MDC1 FHA | GSK3 | PLK1 PBD 7090300 8341200 9691500 10030000 1675200 9952100
+LpSPNPWQEK MLAVDIEDRLpSPNPWQEKREI VDIEDRLsPNPWQEK HERC2 pS3462 O95714 HERC2_HUMAN E3 ubiquitin-protein ligase HERC2 OS=Homo sapiens OX=9606 GN=HERC2 PE=1 SV=2 N/A DOC_WW_Pin1_4 0 11706000 12495000 0 7273000 8877800
+NLLEDDpSDEEEDFFLR SERRNLLEDDpSDEEEDFFLRG RNLLEDDsDEEEDFF VAMP4 pS30 O75379 VAMP4_HUMAN_Vesicle-associated membrane protein 4 OS=Homo sapiens OX=9606 GN=VAMP4 PE=1 SV=2 N/A CK2alpha | GRK-2 | BARD1 Q99728 | Csnk2a1 1592100000 973800000 1011600000 1450300000 631970000 878760000
+pSQKQEEENPAEETGEEK MpSQKQEEENPAE ______MsQKQEEEN ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS2; pS2; pS2; pS2; pS2; pS2 O43768; O43768-2; O43768-3; O43768-4; O43768-8; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 8 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA N/A N/A 0 0 8765300 0 2355900 14706000
+pTYVDPFTpYEDPNQAVR EEKHLNQGVRpTYVDPFTYEDP; GVRTYVDPFTpYEDPNQAVREF HLNQGVRtYVDPFTY | TYVDPFTyEDPNQAV EPHA4; EPHA4 pT595, pY602; pT544, pY551 P54764; P54764-2 EPHA4_HUMAN Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 PE=1 SV=1; EPHA4_HUMAN Isoform 2 of Ephrin type-A receptor 4 OS=Homo sapiens OX=9606 GN=EPHA4 N/A EPHA4 | EphA1 | EphA2 | EphA3 | EphA5 | EphA7 | EphA6 | Abl | EphA8 | Fgr | Yes | BLK | HCK | EphB6 | EphB3 725460 0 1651300 655850 646420 0
+QLSEpSFK SKSSSRQLSEpSFKSKEFVSSD SSRQLSEsFKSKEFV SSRP1 pS659 Q08945 SSRP1_HUMAN FACT complex subunit SSRP1 OS=Homo sapiens OX=9606 GN=SSRP1 PE=1 SV=1 N/A CK2a2 | CDK7 | PKCalpha | PKCbeta | DNAPK | NEK6 68201000 87774000 138300000 95357000 19966000 149110000
+RGpSLEMSSDGEPLSR SSATSGGRRGpSLEMSSDGEPL TSGGRRGsLEMSSDG AEBP2; AEBP2 pS206; pS206 Q6ZN18; Q6ZN18-2 AEBP2_HUMAN Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 PE=1 SV=2; AEBP2_HUMAN Isoform 2 of Zinc finger protein AEBP2 OS=Homo sapiens OX=9606 GN=AEBP2 N/A GSK3 19262000 11103000 19454000 0 1816900 22028000
+SDGpSLEDGDDVHR IEDGGARSDGpSLEDGDDVHRA GGARSDGsLEDGDDV SERINC1 pS364 Q9NRX5 SERC1_HUMAN Serine incorporator 1 OS=Homo sapiens OX=9606 GN=SERINC1 PE=1 SV=1 N/A PLK1 | PDHK1 31407000 17665000 20892000 23194000 5132400 54893000
+SEpSLTAESR EGGGLMTRSEpSLTAESRLVHT GLMTRSEsLTAESRL HERC1 pS1491 Q15751 HERC1_HUMAN Probable E3 ubiquitin-protein ligase HERC1 OS=Homo sapiens OX=9606 GN=HERC1 PE=1 SV=2 N/A GRK-2 11766000 13176000 20540000 16963000 4364700 21308000
+STGPTAATGpSNRR MSTGPTAATGpSNRRLQQTQNQ GPTAATGsNRRLQQT VAMP3 pS11 Q15836 VAMP3_HUMAN_Vesicle-associated membrane protein 3 OS=Homo sapiens OX=9606 GN=VAMP3 PE=1 SV=3 N/A PKCalpha | PKCbeta | PKCzeta 3057100 4718800 12052000 5047700 1070900 8333500
+TEDLEATpSEHFK RNKTEDLEATpSEHFKTTSQKV TEDLEATsEHFKTTS VAMP8 pS55 Q9BV40 VAMP8_HUMAN_Vesicle-associated membrane protein 8 OS=Homo sapiens OX=9606 GN=VAMP8 PE=1 SV=1 activity, inhibited; abolish function in SNARE complex during mast cell secretion, reduces in vitro ensemble vesicle fusion N/A 20400000 9738500 7862300 0 0 76518000
+TFWpSPELK SSMNSIKTFWpSPELKKERVLR NSIKTFWsPELKKER ERC2 pS187 O15083 ERC2_HUMAN ERC protein 2 OS=Homo sapiens OX=9606 GN=ERC2 PE=1 SV=3 N/A IKKalpha | IKKbeta | HIPK2 | DOC_WW_Pin1_4 29764000 20957000 24855000 30752000 8304800 23771000
+YFDpSGDYNMAK CADEMQKYFDpSGDYNMAKAKM; RLQKGQKYFDpSGDYNMAKAKM; MKSVEQKYFDpSGDYNMAKAKM EMQKYFDsGDYNMAK | KGQKYFDsGDYNMAK | VEQKYFDsGDYNMAK ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA; ENSA pS67; pS67; pS83; pS90; pS63; pS63; pS79; pS83 O43768; O43768-2; O43768-3; O43768-4; O43768-5; O43768-6; O43768-7; O43768-9 ENSA_HUMAN Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA PE=1 SV=1; ENSA_HUMAN Isoform 2 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 3 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 4 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 5 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 6 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 7 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA; ENSA_HUMAN Isoform 9 of Alpha-endosulfine OS=Homo sapiens OX=9606 GN=ENSA molecular association, regulation; cell cycle regulation; PPP2CA(INDUCES) GRK-2 323250000 127970000 0 67123000 12790000 71378000
diff -r f3deca1a3c84 -r 61adb8801b73 workflow/ppenrich_suite_wf.ga
--- a/workflow/ppenrich_suite_wf.ga Wed Apr 13 19:48:32 2022 +0000
+++ b/workflow/ppenrich_suite_wf.ga Thu Jun 30 16:16:32 2022 +0000
@@ -390,7 +390,7 @@
},
"11": {
"annotation": "Transform the output of MaxQuant for phosphoproteome-enriched samples to prepare it for statistical anlaysis.",
- "content_id": "testtoolshed.g2.bx.psu.edu/repos/eschen42/mqppep_preproc/mqppep_preproc/0.1.9+galaxy0",
+ "content_id": "mqppep_preproc",
"errors": null,
"id": 11,
"input_connections": {
@@ -792,7 +792,7 @@
},
"13": {
"annotation": "Perform ANOVA. For imputing missing values, create random values.",
- "content_id": "testtoolshed.g2.bx.psu.edu/repos/eschen42/mqppep_anova/mqppep_anova/0.1.9+galaxy0",
+ "content_id": "mqppep_anova",
"errors": null,
"id": 13,
"input_connections": {