view kseaapp_demo.R @ 27:2e35f925c469 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit be6750cb1d7ae18e71b60ae6f2c55e5331105c4f
author eschen42
date Thu, 27 Oct 2022 18:54:25 +0000
parents
children
line wrap: on
line source

# KSEAapp demo using SQLite data source
library(sqldf)   # implies that package RSQLite will be available
library(KSEAapp) # implies that package gplots  will be available

# ref: http://adv-r.had.co.nz/Environments.html
# "When creating your own environment, note that you should set its parent
#   environment to be the empty environment. This ensures you don't
#   accidentally inherit objects from somewhere else."
# Caution: this prevents `with(my_env, expr)` from working when `expr`
#   contains anything from the global environment, even operators!
#   Hence, `x <- 1; get("x", new_env())` fails by design.
new_env <- function() {
  new.env(parent = emptyenv())
}

# Adapted from KSEAapp::KSEA.Heatmap to allow specification of output file
ksea_heatmap <-
function(
  # the data frame outputs from the KSEA.Scores() function, in list format
  score_list,
  # a character vector of all the sample names for heatmap annotation:
  # - the names must be in the same order as the data in score_list
  # - please avoid long names, as they may get cropped in the final image
  sample_labels,
  # character string of either "p.value" or "FDR" indicating the data column
  #   to use for marking statistically significant scores
  stats,
  # a numeric value between 0 and infinity indicating the min. number of
  #   substrates a kinase must have to be included in the heatmap
  m_cutoff,
  # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
  #   for indicating significant kinases in the heatmap
  p_cutoff,
  # a binary input of TRUE or FALSE, indicating whether or not to perform
  #   hierarchical clustering of the sample columns
  sample_cluster,
  # a binary input of TRUE or FALSE, indicating whether or not to export
  #   the bar plot as a .tiff image into the working directory
  export = FALSE
) {
  filter_m <- function(dataset, m_cutoff) {
    filtered <- dataset[(dataset$m >= m_cutoff), ]
    return(filtered)
  }
  score_list_m <- lapply(score_list, function(...)
    filter_m(...,
             m_cutoff))
  for (i in seq_len(length(score_list_m))) {
    names <- colnames(score_list_m[[i]])[c(2:7)]
    colnames(score_list_m[[i]])[c(2:7)] <-
      paste(names, i, sep = ".")
  }
  master <-
    Reduce(function(...)
      merge(..., by = "Kinase.Gene", all = F), score_list_m)
  #ACE if (is.null(master)) return ("No significant kinase genes found") 
  row.names(master) <- master$Kinase.Gene
  columns <- as.character(colnames(master))
  merged_scores <- as.matrix(master[, grep("z.score", columns)])
  colnames(merged_scores) <- sample_labels
  merged_stats <- as.matrix(master[, grep(stats, columns)])
  asterisk <- function(matrix) {
    new <- data.frame()
    for (i in seq_len(nrow(matrix))) {
      for (j in seq_len(ncol(matrix))) {
        if (matrix[i, j] < p_cutoff) {
          new[i, j] <- "*"
        }
        else {
          new[i, j] <- ""
        }
      }
    }
    return(new)
  }
  merged_asterisk <- as.matrix(asterisk(merged_stats))
  create_breaks <- function(merged_scores) {
    if (min(merged_scores) < -1.6) {
      breaks_neg <- seq(-1.6, 0, length.out = 30)
      breaks_neg <-
        append(seq(min(merged_scores), -1.6, length.out = 10), breaks_neg)
      breaks_neg <- sort(unique(breaks_neg))
    }
    else {
      breaks_neg <- seq(-1.6, 0, length.out = 30)
    }
    if (max(merged_scores) > 1.6) {
      breaks_pos <- seq(0, 1.6, length.out = 30)
      breaks_pos <-
        append(breaks_pos, seq(1.6, max(merged_scores), length.out = 10))
      breaks_pos <- sort(unique(breaks_pos))
    }
    else {
      breaks_pos <- seq(0, 1.6, length.out = 30)
    }
    breaks_all <- unique(append(breaks_neg, breaks_pos))
    mycol_neg <-
      gplots::colorpanel(n = length(breaks_neg),
                 low = "blue",
                 high = "white")
    mycol_pos <-
      gplots::colorpanel(n = length(breaks_pos) - 1,
                 low = "white",
                 high = "red")
    mycol <- unique(append(mycol_neg, mycol_pos))
    color_breaks <- list(breaks_all, mycol)
    return(color_breaks)
  }
  color_breaks <- create_breaks(merged_scores)
  plot_height <- nrow(merged_scores) ^ 0.55
  plot_width <- ncol(merged_scores) ^ 0.7
  draw_heatmap <- function() gplots::heatmap.2(
    merged_scores,
    Colv = sample_cluster,
    scale = "none",
    cellnote = merged_asterisk,
    notecol = "white",
    cexCol = 0.9,
    cexRow = 0.9,
    srtCol = 45,
    notecex = 1.4,
    col = color_breaks[[2]],
    density.info = "none",
    trace = "none",
    key = F,
    breaks = color_breaks[[1]],
    lmat = rbind(c(0, 3), c(2, 1), c(0, 4)),
    lhei = c(0.4, 9.5, 0.6),
    lwid = c(0.5, 3),
    margins = c(2, 6)
  )
  if (export == "TRUE") {
    png(
      "KSEA.Merged.Heatmap.png",
      width = plot_width * 300,
      height = plot_height * 300,
      res = 300,
      pointsize = 14
    )
    draw_heatmap()
    dev.off()
  }
  else {
    draw_heatmap()
  }
}

# Adapted from KSEAapp::KSEA.Barplot to allow:
# - printing main and sub titles
ksea_barplot <- function(
  # The Kinase-Substrate dataset uploaded from the file prefaced with
  #   "PSP&networkin_" available from github.com/casecpb/KSEA/
  ksdata = KSEAapp::KSData,
  # The experimental data file formatted as described in the
  #   KSEAapp::KSEA.Complete() documentation
  px,
  # networkin = TRUE means inclusion of networkin predictions
  # networkin = FALSE means not to include and to ignore networkin_cutoff
  networkin,
  # A numeric value between 1 and infinity setting the minimum networkin score
  #   (can be left out if networkin = FALSE)
  networkin_cutoff,
  # A numeric value between 0 and infinity indicating the min. # of substrates
  #   a kinase must have to be included in the bar plot output
  m_cutoff,
  # A numeric value greater than zero indicating the p-value cutoff for
  #   indicating significant kinases in the bar plot
  p_cutoff,
  # A numeric value between 0 and 1 indicating the minimum magnitude of
  #   z-value cutoff for adding a kinase to the bar plot
  z_threshold = 1.0,
  # A binary input of TRUE or FALSE, indicating whether or not to export
  #   the bar plot as a .tiff image into the working directory
  export = FALSE,
  # This remaing arguments are passed through to barplot
  main = "",
  sub = ""
) {
  if (length(grep(";", px$Residue.Both)) == 0) {
    new <- px
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
    new <- new[complete.cases(new$log2_fc), ]
  }
  else {
    double <- px[grep(";", px$Residue.Both), ]
    residues <- as.character(double$Residue.Both)
    residues <- as.matrix(residues, ncol = 1)
    split <- strsplit(residues, split = ";")
    x <- sapply(split, length)
    single <- data.frame(
      Protein = rep(double$Protein, x),
      Gene = rep(double$Gene, x),
      Peptide = rep(double$Peptide,
                    x),
      Residue.Both = unlist(split),
      p = rep(double$p,
              x),
      FC = rep(double$FC, x)
    )
    new <- px[-grep(";", px$Residue.Both), ]
    new <- rbind(new, single)
    colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD")
    new$log2_fc <- log2(abs(as.numeric(as.character(new$FC))))
    new <- new[complete.cases(new$log2_fc), ]
  }
  if (networkin == TRUE) {
    ksdata_filterd <- ksdata[grep("[a-z]", ksdata$Source), ]
    ksdata_filterd <- ksdata_filterd[
      (ksdata_filterd$networkin_score >= networkin_cutoff),
      ]
  }
  else {
    ksdata_filterd <- ksdata[grep("PhosphoSitePlus", ksdata$Source), ]
  }
  ksdata_dataset <- merge(ksdata_filterd, new)
  ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ]
  ksdata_dataset$uniprot_no_isoform <- sapply(
    ksdata_dataset$KIN_ACC_ID,
    function(x) unlist(strsplit(as.character(x), split = "-"))[1])
  ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)]
  colnames(ksdata_dataset_abbrev) <- c(
    "Kinase.Gene",
    "Substrate.Gene",
    "Substrate.Mod",
    "Peptide",
    "p",
    "FC",
    "log2FC",
    "Source"
  )
  ksdata_dataset_abbrev <- ksdata_dataset_abbrev[
    order(
      ksdata_dataset_abbrev$Kinase.Gene,
      ksdata_dataset_abbrev$Substrate.Gene,
      ksdata_dataset_abbrev$Substrate.Mod,
      ksdata_dataset_abbrev$p
    ),
    ]
  ksdata_dataset_abbrev <- aggregate(
    log2FC ~ Kinase.Gene +
      Substrate.Gene + Substrate.Mod + Source,
    data = ksdata_dataset_abbrev,
    FUN = mean
  )
  ksdata_dataset_abbrev <-
    ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ]
  kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene)
  kinase_list <- as.matrix(table(kinase_list))
  mean_fc <- aggregate(
    log2FC ~ Kinase.Gene,
    data = ksdata_dataset_abbrev,
    FUN = mean
  )
  mean_fc <- mean_fc[order(mean_fc[, 1]), ]
  mean_fc$m_s <- mean_fc[, 2]
  mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = T))
  mean_fc$m <- kinase_list
  mean_fc$z_score <- ((mean_fc$m_s - mean(new$log2_fc, na.rm = T)) *
                        sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = T)
  mean_fc$p_value <- pnorm(-abs(mean_fc$z_score))
  mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr")
  mean_fc_filtered <- mean_fc[(mean_fc$m >= m_cutoff), -2]
  mean_fc_filtered <-
    mean_fc_filtered[abs(mean_fc_filtered$z_score) > z_threshold, -2]
  mean_fc_filtered <- mean_fc_filtered[order(mean_fc_filtered$z_score), ]
  plot_height <- length(mean_fc_filtered$z_score) ^ 0.55
  mean_fc_filtered$color <- "black"
  mean_fc_filtered[
    (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score < 0),
    ncol(mean_fc_filtered)
  ] <- "blue"
  mean_fc_filtered[
    (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score > 0),
    ncol(mean_fc_filtered)
  ] <- "red"
  draw_barplot <- function() {
    barplot(
      as.numeric(mean_fc_filtered$z_score),
      col = mean_fc_filtered$color,
      border = NA,
      xpd = F,
      cex.names = 0.6,
      cex.axis = 0.8,
      xlab = "Kinase z-score",
      names.arg = mean_fc_filtered$Kinase.Gene,
      horiz = T,
      las = 1,
      main = main,
      sub = sub
    )
  }
  if (export == "TRUE") {
    tiff(
      "KSEA Bar Plot.tiff",
      width = 6 * 300,
      height = 300 * plot_height,
      res = 300,
      pointsize = 13
    )
    par(mai = c(1, 1, 0.4, 0.4))
    draw_barplot()
    dev.off()
  }
  else {
    draw_barplot()
  }
}

ksea_app_prep_db <- c(
    "mqppep.UT_Phospho_ST_Sites.sqlite",
    "mqppep.pST_Sites_NancyDu.sqlite"
  )[1]

db <- dbConnect(RSQLite::SQLite(), ksea_app_prep_db)
contrast_metadata_df <-
  sqldf("select * from contrast_lvl_lvl_metadata", connection = db)
rslt <- new_env()
rslt$score_list <- list()
rslt$name_list  <- list()
#ACE rslt$usable_contrast_list <- list()
next_index <- 0
low_fdr_print <- function(i) {
 #ACE i <- rslt$usable_contrast_list[[i]]
 if (!is.null(rslt$score_list[[i]])) {
    display_metadata <-
      contrast_metadata_df[i, c(4, 5, 6), drop = FALSE]
    colnames(display_metadata) <-
      c(paste0(contrast_metadata_df[i, 2], "_trt"),
        paste0(contrast_metadata_df[i, 3], "_trt"),
        "fold-change"
      )
    print(display_metadata, row.names = FALSE)
    #ACE print(
    #ACE   contrast_metadata_df[i, c(2, 4, 3, 5, 6), drop = FALSE],
    #ACE   row.names = FALSE)
    #ACE print(contrast_metadata_df[i, ])
    if (sum(rslt$score_list[[i]]$fdr < 0.05) > 0) {
        print(
          rslt$score_list[[i]][rslt$score_list[[i]]$fdr < 0.05, ],
            row.names = FALSE)
    } else {
      cat("No kinases enriched with FDR < 0.05\n")
    }
  }
}
err_na_subscr_df_const <-
 "missing values are not allowed in subscripted assignments of data frames"

for (i_cntrst in seq_len(nrow(contrast_metadata_df))) {
  contrast_label <- sprintf(
    "%s -> %s",
    contrast_metadata_df[i_cntrst, "b_level"],
    contrast_metadata_df[i_cntrst, "a_level"]
  )
  kseaapp_input <-
    sqldf(
      x = sprintf("
        SELECT `Protein`, `Gene`, `Peptide`, `Residue.Both`, `p`, `FC`
          FROM v_kseaapp_input
          WHERE contrast = %d
        ", i_cntrst
      ),
    connection = db
    )
  #ACE K_S_links <-
  #ACE   KSEA.KS_table(
  #ACE     KSEAapp::KSData,
  #ACE     kseaapp_input,
  #ACE     networkin = TRUE,
  #ACE     networkin_cutoff = 2
  #ACE     )
  #ACE print(sprintf("contrast = %d", i_cntrst))
  tryCatch(
    expr = {
        main_title <- (
            sprintf(
            "Change from treatment %s to treatment %s",
            contrast_metadata_df[i_cntrst, "b_level"],
            contrast_metadata_df[i_cntrst, "a_level"]
          )
        )
        sub_title <- (
            sprintf(
            "Trt %s = {%s}.  Trt %s = {%s}.",
            contrast_metadata_df[i_cntrst, "b_level"],
            gsub(
              pattern = ";",
              replacement = ", ",
              x = contrast_metadata_df[i_cntrst, "b_samples"],
              fixed = TRUE
            ),
            contrast_metadata_df[i_cntrst, "a_level"],
            gsub(
              pattern = ";",
              replacement = "",
              x = contrast_metadata_df[i_cntrst, "a_samples"],
              fixed = TRUE
            )
          )
        )
        ksea_barplot(
          KSEAapp::KSData,
          kseaapp_input,
          networkin = TRUE,
          networkin_cutoff = 2,
          m_cutoff = 2,
          p_cutoff = 0.05,
          export = FALSE,
          main = main_title,
          sub = sub_title
          )
      },
    error = function(e) {
      if (tolower(e$message) == err_na_subscr_df_const) {
        cat(
          sprintf("\nNo scores or plot for contrast '%s' because:\n  '%s'\n",
                  contrast_label,
                  "There are too many missing or invalid values."
                  ),
          file = stdout()
        )
      } else {
        cat(
          sprintf("\nNo scores or plot for contrast '%s' because:\n  '%s'\n",
                  contrast_label,
                  e$message
                  ),
          file = stdout()
        )
      }
    }
  )
  tryCatch(
    expr = {
        ksea_scores <-
          KSEA.Scores(
            KSEAapp::KSData,
            kseaapp_input,
            NetworKIN = TRUE,
            NetworKIN.cutoff = 2
            )

        if (0 < sum(!is.nan(ksea_scores$FDR))) {
          next_index <- 1 + next_index
          rslt$score_list[[next_index]] <- ksea_scores
          #ACE rslt$usable_contrast_list[[next_index]] <- i_cntrst
          rslt$name_list[[next_index]] <- contrast_label
          cat("\n", rslt$name_list[[next_index]], "\n")
          low_fdr_print(next_index)
        }
      },
    error = function(e) str(e)
  )
}
if (length(rslt$score_list) > 0) {
  ksea_heatmap(
    # the data frame outputs from the KSEA.Scores() function, in list format
    score_list = rslt$score_list,
    # a character vector of all the sample names for heatmap annotation:
    # - the names must be in the same order as the data in score_list
    # - please avoid long names, as they may get cropped in the final image
    sample_labels = rslt$name_list,
    # character string of either "p.value" or "FDR" indicating the data column
    #   to use for marking statistically significant scores
    stats = c("p.value", "FDR")[2],
    # a numeric value between 0 and infinity indicating the min. number of
    #   substrates a kinase must have to be included in the heatmap
    m_cutoff = 1,
    # a numeric value between 0 and 1 indicating the p-value/FDR cutoff
    #   for indicating significant kinases in the heatmap
    p_cutoff = 0.05,
    # a binary input of TRUE or FALSE, indicating whether or not to perform
    #   hierarchical clustering of the sample columns
    sample_cluster = TRUE,
    # a binary input of TRUE or FALSE, indicating whether or not to export
    #   the bar plot as a .tiff image into the working directory
    export = FALSE
  )
}
dbDisconnect(db)