diff kseaapp_demo.R @ 27:2e35f925c469 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit be6750cb1d7ae18e71b60ae6f2c55e5331105c4f
author eschen42
date Thu, 27 Oct 2022 18:54:25 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kseaapp_demo.R	Thu Oct 27 18:54:25 2022 +0000
@@ -0,0 +1,481 @@
+# KSEAapp demo using SQLite data source
+library(sqldf)   # implies that package RSQLite will be available
+library(KSEAapp) # implies that package gplots  will be available
+
+# ref: http://adv-r.had.co.nz/Environments.html
+# "When creating your own environment, note that you should set its parent
+#   environment to be the empty environment. This ensures you don't
+#   accidentally inherit objects from somewhere else."
+# Caution: this prevents `with(my_env, expr)` from working when `expr`
+#   contains anything from the global environment, even operators!
+#   Hence, `x <- 1; get("x", new_env())` fails by design.
+new_env <- function() {
+  new.env(parent = emptyenv())
+}
+
+# Adapted from KSEAapp::KSEA.Heatmap to allow specification of output file
+ksea_heatmap <-
+function(
+  # the data frame outputs from the KSEA.Scores() function, in list format
+  score_list,
+  # a character vector of all the sample names for heatmap annotation:
+  # - the names must be in the same order as the data in score_list
+  # - please avoid long names, as they may get cropped in the final image
+  sample_labels,
+  # character string of either "p.value" or "FDR" indicating the data column
+  #   to use for marking statistically significant scores
+  stats,
+  # a numeric value between 0 and infinity indicating the min. number of
+  #   substrates a kinase must have to be included in the heatmap
+  m_cutoff,
+  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+  #   for indicating significant kinases in the heatmap
+  p_cutoff,
+  # a binary input of TRUE or FALSE, indicating whether or not to perform
+  #   hierarchical clustering of the sample columns
+  sample_cluster,
+  # a binary input of TRUE or FALSE, indicating whether or not to export
+  #   the bar plot as a .tiff image into the working directory
+  export = FALSE
+) {
+  filter_m <- function(dataset, m_cutoff) {
+    filtered <- dataset[(dataset$m >= m_cutoff), ]
+    return(filtered)
+  }
+  score_list_m <- lapply(score_list, function(...)
+    filter_m(...,
+             m_cutoff))
+  for (i in seq_len(length(score_list_m))) {
+    names <- colnames(score_list_m[[i]])[c(2:7)]
+    colnames(score_list_m[[i]])[c(2:7)] <-
+      paste(names, i, sep = ".")
+  }
+  master <-
+    Reduce(function(...)
+      merge(..., by = "Kinase.Gene", all = F), score_list_m)
+  #ACE if (is.null(master)) return ("No significant kinase genes found") 
+  row.names(master) <- master$Kinase.Gene
+  columns <- as.character(colnames(master))
+  merged_scores <- as.matrix(master[, grep("z.score", columns)])
+  colnames(merged_scores) <- sample_labels
+  merged_stats <- as.matrix(master[, grep(stats, columns)])
+  asterisk <- function(matrix) {
+    new <- data.frame()
+    for (i in seq_len(nrow(matrix))) {
+      for (j in seq_len(ncol(matrix))) {
+        if (matrix[i, j] < p_cutoff) {
+          new[i, j] <- "*"
+        }
+        else {
+          new[i, j] <- ""
+        }
+      }
+    }
+    return(new)
+  }
+  merged_asterisk <- as.matrix(asterisk(merged_stats))
+  create_breaks <- function(merged_scores) {
+    if (min(merged_scores) < -1.6) {
+      breaks_neg <- seq(-1.6, 0, length.out = 30)
+      breaks_neg <-
+        append(seq(min(merged_scores), -1.6, length.out = 10), breaks_neg)
+      breaks_neg <- sort(unique(breaks_neg))
+    }
+    else {
+      breaks_neg <- seq(-1.6, 0, length.out = 30)
+    }
+    if (max(merged_scores) > 1.6) {
+      breaks_pos <- seq(0, 1.6, length.out = 30)
+      breaks_pos <-
+        append(breaks_pos, seq(1.6, max(merged_scores), length.out = 10))
+      breaks_pos <- sort(unique(breaks_pos))
+    }
+    else {
+      breaks_pos <- seq(0, 1.6, length.out = 30)
+    }
+    breaks_all <- unique(append(breaks_neg, breaks_pos))
+    mycol_neg <-
+      gplots::colorpanel(n = length(breaks_neg),
+                 low = "blue",
+                 high = "white")
+    mycol_pos <-
+      gplots::colorpanel(n = length(breaks_pos) - 1,
+                 low = "white",
+                 high = "red")
+    mycol <- unique(append(mycol_neg, mycol_pos))
+    color_breaks <- list(breaks_all, mycol)
+    return(color_breaks)
+  }
+  color_breaks <- create_breaks(merged_scores)
+  plot_height <- nrow(merged_scores) ^ 0.55
+  plot_width <- ncol(merged_scores) ^ 0.7
+  draw_heatmap <- function() gplots::heatmap.2(
+    merged_scores,
+    Colv = sample_cluster,
+    scale = "none",
+    cellnote = merged_asterisk,
+    notecol = "white",
+    cexCol = 0.9,
+    cexRow = 0.9,
+    srtCol = 45,
+    notecex = 1.4,
+    col = color_breaks[[2]],
+    density.info = "none",
+    trace = "none",
+    key = F,
+    breaks = color_breaks[[1]],
+    lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
+    lhei = c(0.4, 9.5, 0.6),
+    lwid = c(0.5, 3),
+    margins = c(2, 6)
+  )
+  if (export == "TRUE") {
+    png(
+      "KSEA.Merged.Heatmap.png",
+      width = plot_width * 300,
+      height = plot_height * 300,
+      res = 300,
+      pointsize = 14
+    )
+    draw_heatmap()
+    dev.off()
+  }
+  else {
+    draw_heatmap()
+  }
+}
+
+# Adapted from KSEAapp::KSEA.Barplot to allow:
+# - printing main and sub titles
+ksea_barplot <- function(
+  # The Kinase-Substrate dataset uploaded from the file prefaced with
+  #   "PSP&networkin_" available from github.com/casecpb/KSEA/
+  ksdata = KSEAapp::KSData,
+  # The experimental data file formatted as described in the
+  #   KSEAapp::KSEA.Complete() documentation
+  px,
+  # networkin = TRUE means inclusion of networkin predictions
+  # networkin = FALSE means not to include and to ignore networkin_cutoff
+  networkin,
+  # A numeric value between 1 and infinity setting the minimum networkin score
+  #   (can be left out if networkin = FALSE)
+  networkin_cutoff,
+  # A numeric value between 0 and infinity indicating the min. # of substrates
+  #   a kinase must have to be included in the bar plot output
+  m_cutoff,
+  # A numeric value greater than zero indicating the p-value cutoff for
+  #   indicating significant kinases in the bar plot
+  p_cutoff,
+  # A numeric value between 0 and 1 indicating the minimum magnitude of
+  #   z-value cutoff for adding a kinase to the bar plot
+  z_threshold = 1.0,
+  # A binary input of TRUE or FALSE, indicating whether or not to export
+  #   the bar plot as a .tiff image into the working directory
+  export = FALSE,
+  # This remaing arguments are passed through to barplot
+  main = "",
+  sub = ""
+) {
+  if (length(grep(";", px$Residue.Both)) == 0) {
+    new <- px
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
+    new <- new[complete.cases(new$log2_fc), ]
+  }
+  else {
+    double <- px[grep(";", px$Residue.Both), ]
+    residues <- as.character(double$Residue.Both)
+    residues <- as.matrix(residues, ncol = 1)
+    split <- strsplit(residues, split = ";")
+    x <- sapply(split, length)
+    single <- data.frame(
+      Protein = rep(double$Protein, x),
+      Gene = rep(double$Gene, x),
+      Peptide = rep(double$Peptide,
+                    x),
+      Residue.Both = unlist(split),
+      p = rep(double$p,
+              x),
+      FC = rep(double$FC, x)
+    )
+    new <- px[-grep(";", px$Residue.Both), ]
+    new <- rbind(new, single)
+    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
+    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
+    new <- new[complete.cases(new$log2_fc), ]
+  }
+  if (networkin == TRUE) {
+    ksdata_filterd <- ksdata[grep("[a-z]", ksdata$Source), ]
+    ksdata_filterd <- ksdata_filterd[
+      (ksdata_filterd$networkin_score >= networkin_cutoff),
+      ]
+  }
+  else {
+    ksdata_filterd <- ksdata[grep("PhosphoSitePlus", ksdata$Source), ]
+  }
+  ksdata_dataset <- merge(ksdata_filterd, new)
+  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
+  ksdata_dataset$uniprot_no_isoform <- sapply(
+    ksdata_dataset$KIN_ACC_ID,
+    function(x) unlist(strsplit(as.character(x), split = "-"))[1])
+  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
+  colnames(ksdata_dataset_abbrev) <- c(
+    "Kinase.Gene",
+    "Substrate.Gene",
+    "Substrate.Mod",
+    "Peptide",
+    "p",
+    "FC",
+    "log2FC",
+    "Source"
+  )
+  ksdata_dataset_abbrev <- ksdata_dataset_abbrev[
+    order(
+      ksdata_dataset_abbrev$Kinase.Gene,
+      ksdata_dataset_abbrev$Substrate.Gene,
+      ksdata_dataset_abbrev$Substrate.Mod,
+      ksdata_dataset_abbrev$p
+    ),
+    ]
+  ksdata_dataset_abbrev <- aggregate(
+    log2FC ~ Kinase.Gene +
+      Substrate.Gene + Substrate.Mod + Source,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+  )
+  ksdata_dataset_abbrev <-
+    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
+  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
+  kinase_list <- as.matrix(table(kinase_list))
+  mean_fc <- aggregate(
+    log2FC ~ Kinase.Gene,
+    data = ksdata_dataset_abbrev,
+    FUN = mean
+  )
+  mean_fc <- mean_fc[order(mean_fc[, 1]), ]
+  mean_fc$m_s <- mean_fc[, 2]
+  mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = T))
+  mean_fc$m <- kinase_list
+  mean_fc$z_score <- ((mean_fc$m_s - mean(new$log2_fc, na.rm = T)) *
+                        sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = T)
+  mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
+  mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
+  mean_fc_filtered <- mean_fc[(mean_fc$m >= m_cutoff), -2]
+  mean_fc_filtered <-
+    mean_fc_filtered[abs(mean_fc_filtered$z_score) > z_threshold, -2]
+  mean_fc_filtered <- mean_fc_filtered[order(mean_fc_filtered$z_score), ]
+  plot_height <- length(mean_fc_filtered$z_score) ^ 0.55
+  mean_fc_filtered$color <- "black"
+  mean_fc_filtered[
+    (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score < 0),
+    ncol(mean_fc_filtered)
+  ] <- "blue"
+  mean_fc_filtered[
+    (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score > 0),
+    ncol(mean_fc_filtered)
+  ] <- "red"
+  draw_barplot <- function() {
+    barplot(
+      as.numeric(mean_fc_filtered$z_score),
+      col = mean_fc_filtered$color,
+      border = NA,
+      xpd = F,
+      cex.names = 0.6,
+      cex.axis = 0.8,
+      xlab = "Kinase z-score",
+      names.arg = mean_fc_filtered$Kinase.Gene,
+      horiz = T,
+      las = 1,
+      main = main,
+      sub = sub
+    )
+  }
+  if (export == "TRUE") {
+    tiff(
+      "KSEA Bar Plot.tiff",
+      width = 6 * 300,
+      height = 300 * plot_height,
+      res = 300,
+      pointsize = 13
+    )
+    par(mai = c(1, 1, 0.4, 0.4))
+    draw_barplot()
+    dev.off()
+  }
+  else {
+    draw_barplot()
+  }
+}
+
+ksea_app_prep_db <- c(
+    "mqppep.UT_Phospho_ST_Sites.sqlite",
+    "mqppep.pST_Sites_NancyDu.sqlite"
+  )[1]
+
+db <- dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
+contrast_metadata_df <-
+  sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
+rslt <- new_env()
+rslt$score_list <- list()
+rslt$name_list  <- list()
+#ACE rslt$usable_contrast_list <- list()
+next_index <- 0
+low_fdr_print <- function(i) {
+ #ACE i <- rslt$usable_contrast_list[[i]]
+ if (!is.null(rslt$score_list[[i]])) {
+    display_metadata <-
+      contrast_metadata_df[i, c(4, 5, 6), drop = FALSE]
+    colnames(display_metadata) <-
+      c(paste0(contrast_metadata_df[i, 2], "_trt"),
+        paste0(contrast_metadata_df[i, 3], "_trt"),
+        "fold-change"
+      )
+    print(display_metadata, row.names = FALSE)
+    #ACE print(
+    #ACE   contrast_metadata_df[i, c(2, 4, 3, 5, 6), drop = FALSE],
+    #ACE   row.names = FALSE)
+    #ACE print(contrast_metadata_df[i, ])
+    if (sum(rslt$score_list[[i]]$fdr < 0.05) > 0) {
+        print(
+          rslt$score_list[[i]][rslt$score_list[[i]]$fdr < 0.05, ],
+            row.names = FALSE)
+    } else {
+      cat("No kinases enriched with FDR < 0.05\n")
+    }
+  }
+}
+err_na_subscr_df_const <-
+ "missing values are not allowed in subscripted assignments of data frames"
+
+for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
+  contrast_label <- sprintf(
+    "%s -> %s",
+    contrast_metadata_df[i_cntrst, "b_level"],
+    contrast_metadata_df[i_cntrst, "a_level"]
+  )
+  kseaapp_input <-
+    sqldf(
+      x = sprintf("
+        SELECT `Protein`, `Gene`, `Peptide`, `Residue.Both`, `p`, `FC`
+          FROM v_kseaapp_input
+          WHERE contrast = %d
+        ", i_cntrst
+      ),
+    connection = db
+    )
+  #ACE K_S_links <-
+  #ACE   KSEA.KS_table(
+  #ACE     KSEAapp::KSData,
+  #ACE     kseaapp_input,
+  #ACE     networkin = TRUE,
+  #ACE     networkin_cutoff = 2
+  #ACE     )
+  #ACE print(sprintf("contrast = %d", i_cntrst))
+  tryCatch(
+    expr = {
+        main_title <- (
+            sprintf(
+            "Change from treatment %s to treatment %s",
+            contrast_metadata_df[i_cntrst, "b_level"],
+            contrast_metadata_df[i_cntrst, "a_level"]
+          )
+        )
+        sub_title <- (
+            sprintf(
+            "Trt %s = {%s}.  Trt %s = {%s}.",
+            contrast_metadata_df[i_cntrst, "b_level"],
+            gsub(
+              pattern = ";",
+              replacement = ", ",
+              x = contrast_metadata_df[i_cntrst, "b_samples"],
+              fixed = TRUE
+            ),
+            contrast_metadata_df[i_cntrst, "a_level"],
+            gsub(
+              pattern = ";",
+              replacement = "",
+              x = contrast_metadata_df[i_cntrst, "a_samples"],
+              fixed = TRUE
+            )
+          )
+        )
+        ksea_barplot(
+          KSEAapp::KSData,
+          kseaapp_input,
+          networkin = TRUE,
+          networkin_cutoff = 2,
+          m_cutoff = 2,
+          p_cutoff = 0.05,
+          export = FALSE,
+          main = main_title,
+          sub = sub_title
+          )
+      },
+    error = function(e) {
+      if (tolower(e$message) == err_na_subscr_df_const) {
+        cat(
+          sprintf("\nNo scores or plot for contrast '%s' because:\n  '%s'\n",
+                  contrast_label,
+                  "There are too many missing or invalid values."
+                  ),
+          file = stdout()
+        )
+      } else {
+        cat(
+          sprintf("\nNo scores or plot for contrast '%s' because:\n  '%s'\n",
+                  contrast_label,
+                  e$message
+                  ),
+          file = stdout()
+        )
+      }
+    }
+  )
+  tryCatch(
+    expr = {
+        ksea_scores <-
+          KSEA.Scores(
+            KSEAapp::KSData,
+            kseaapp_input,
+            NetworKIN = TRUE,
+            NetworKIN.cutoff = 2
+            )
+
+        if (0 < sum(!is.nan(ksea_scores$FDR))) {
+          next_index <- 1 + next_index
+          rslt$score_list[[next_index]] <- ksea_scores
+          #ACE rslt$usable_contrast_list[[next_index]] <- i_cntrst
+          rslt$name_list[[next_index]] <- contrast_label
+          cat("\n", rslt$name_list[[next_index]], "\n")
+          low_fdr_print(next_index)
+        }
+      },
+    error = function(e) str(e)
+  )
+}
+if (length(rslt$score_list) > 0) {
+  ksea_heatmap(
+    # the data frame outputs from the KSEA.Scores() function, in list format
+    score_list = rslt$score_list,
+    # a character vector of all the sample names for heatmap annotation:
+    # - the names must be in the same order as the data in score_list
+    # - please avoid long names, as they may get cropped in the final image
+    sample_labels = rslt$name_list,
+    # character string of either "p.value" or "FDR" indicating the data column
+    #   to use for marking statistically significant scores
+    stats = c("p.value", "FDR")[2],
+    # a numeric value between 0 and infinity indicating the min. number of
+    #   substrates a kinase must have to be included in the heatmap
+    m_cutoff = 1,
+    # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
+    #   for indicating significant kinases in the heatmap
+    p_cutoff = 0.05,
+    # a binary input of TRUE or FALSE, indicating whether or not to perform
+    #   hierarchical clustering of the sample columns
+    sample_cluster = TRUE,
+    # a binary input of TRUE or FALSE, indicating whether or not to export
+    #   the bar plot as a .tiff image into the working directory
+    export = FALSE
+  )
+}
+dbDisconnect(db)