Mercurial > repos > eschen42 > mqppep_anova
diff kseaapp_demo.R @ 27:2e35f925c469 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit be6750cb1d7ae18e71b60ae6f2c55e5331105c4f
author | eschen42 |
---|---|
date | Thu, 27 Oct 2022 18:54:25 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kseaapp_demo.R Thu Oct 27 18:54:25 2022 +0000 @@ -0,0 +1,481 @@ +# KSEAapp demo using SQLite data source +library(sqldf) # implies that package RSQLite will be available +library(KSEAapp) # implies that package gplots will be available + +# ref: http://adv-r.had.co.nz/Environments.html +# "When creating your own environment, note that you should set its parent +# environment to be the empty environment. This ensures you don't +# accidentally inherit objects from somewhere else." +# Caution: this prevents `with(my_env, expr)` from working when `expr` +# contains anything from the global environment, even operators! +# Hence, `x <- 1; get("x", new_env())` fails by design. +new_env <- function() { + new.env(parent = emptyenv()) +} + +# Adapted from KSEAapp::KSEA.Heatmap to allow specification of output file +ksea_heatmap <- +function( + # the data frame outputs from the KSEA.Scores() function, in list format + score_list, + # a character vector of all the sample names for heatmap annotation: + # - the names must be in the same order as the data in score_list + # - please avoid long names, as they may get cropped in the final image + sample_labels, + # character string of either "p.value" or "FDR" indicating the data column + # to use for marking statistically significant scores + stats, + # a numeric value between 0 and infinity indicating the min. number of + # substrates a kinase must have to be included in the heatmap + m_cutoff, + # a numeric value between 0 and 1 indicating the p-value/FDR cutoff + # for indicating significant kinases in the heatmap + p_cutoff, + # a binary input of TRUE or FALSE, indicating whether or not to perform + # hierarchical clustering of the sample columns + sample_cluster, + # a binary input of TRUE or FALSE, indicating whether or not to export + # the bar plot as a .tiff image into the working directory + export = FALSE +) { + filter_m <- function(dataset, m_cutoff) { + filtered <- dataset[(dataset$m >= m_cutoff), ] + return(filtered) + } + score_list_m <- lapply(score_list, function(...) + filter_m(..., + m_cutoff)) + for (i in seq_len(length(score_list_m))) { + names <- colnames(score_list_m[[i]])[c(2:7)] + colnames(score_list_m[[i]])[c(2:7)] <- + paste(names, i, sep = ".") + } + master <- + Reduce(function(...) + merge(..., by = "Kinase.Gene", all = F), score_list_m) + #ACE if (is.null(master)) return ("No significant kinase genes found") + row.names(master) <- master$Kinase.Gene + columns <- as.character(colnames(master)) + merged_scores <- as.matrix(master[, grep("z.score", columns)]) + colnames(merged_scores) <- sample_labels + merged_stats <- as.matrix(master[, grep(stats, columns)]) + asterisk <- function(matrix) { + new <- data.frame() + for (i in seq_len(nrow(matrix))) { + for (j in seq_len(ncol(matrix))) { + if (matrix[i, j] < p_cutoff) { + new[i, j] <- "*" + } + else { + new[i, j] <- "" + } + } + } + return(new) + } + merged_asterisk <- as.matrix(asterisk(merged_stats)) + create_breaks <- function(merged_scores) { + if (min(merged_scores) < -1.6) { + breaks_neg <- seq(-1.6, 0, length.out = 30) + breaks_neg <- + append(seq(min(merged_scores), -1.6, length.out = 10), breaks_neg) + breaks_neg <- sort(unique(breaks_neg)) + } + else { + breaks_neg <- seq(-1.6, 0, length.out = 30) + } + if (max(merged_scores) > 1.6) { + breaks_pos <- seq(0, 1.6, length.out = 30) + breaks_pos <- + append(breaks_pos, seq(1.6, max(merged_scores), length.out = 10)) + breaks_pos <- sort(unique(breaks_pos)) + } + else { + breaks_pos <- seq(0, 1.6, length.out = 30) + } + breaks_all <- unique(append(breaks_neg, breaks_pos)) + mycol_neg <- + gplots::colorpanel(n = length(breaks_neg), + low = "blue", + high = "white") + mycol_pos <- + gplots::colorpanel(n = length(breaks_pos) - 1, + low = "white", + high = "red") + mycol <- unique(append(mycol_neg, mycol_pos)) + color_breaks <- list(breaks_all, mycol) + return(color_breaks) + } + color_breaks <- create_breaks(merged_scores) + plot_height <- nrow(merged_scores) ^ 0.55 + plot_width <- ncol(merged_scores) ^ 0.7 + draw_heatmap <- function() gplots::heatmap.2( + merged_scores, + Colv = sample_cluster, + scale = "none", + cellnote = merged_asterisk, + notecol = "white", + cexCol = 0.9, + cexRow = 0.9, + srtCol = 45, + notecex = 1.4, + col = color_breaks[[2]], + density.info = "none", + trace = "none", + key = F, + breaks = color_breaks[[1]], + lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), + lhei = c(0.4, 9.5, 0.6), + lwid = c(0.5, 3), + margins = c(2, 6) + ) + if (export == "TRUE") { + png( + "KSEA.Merged.Heatmap.png", + width = plot_width * 300, + height = plot_height * 300, + res = 300, + pointsize = 14 + ) + draw_heatmap() + dev.off() + } + else { + draw_heatmap() + } +} + +# Adapted from KSEAapp::KSEA.Barplot to allow: +# - printing main and sub titles +ksea_barplot <- function( + # The Kinase-Substrate dataset uploaded from the file prefaced with + # "PSP&networkin_" available from github.com/casecpb/KSEA/ + ksdata = KSEAapp::KSData, + # The experimental data file formatted as described in the + # KSEAapp::KSEA.Complete() documentation + px, + # networkin = TRUE means inclusion of networkin predictions + # networkin = FALSE means not to include and to ignore networkin_cutoff + networkin, + # A numeric value between 1 and infinity setting the minimum networkin score + # (can be left out if networkin = FALSE) + networkin_cutoff, + # A numeric value between 0 and infinity indicating the min. # of substrates + # a kinase must have to be included in the bar plot output + m_cutoff, + # A numeric value greater than zero indicating the p-value cutoff for + # indicating significant kinases in the bar plot + p_cutoff, + # A numeric value between 0 and 1 indicating the minimum magnitude of + # z-value cutoff for adding a kinase to the bar plot + z_threshold = 1.0, + # A binary input of TRUE or FALSE, indicating whether or not to export + # the bar plot as a .tiff image into the working directory + export = FALSE, + # This remaing arguments are passed through to barplot + main = "", + sub = "" +) { + if (length(grep(";", px$Residue.Both)) == 0) { + new <- px + colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") + new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) + new <- new[complete.cases(new$log2_fc), ] + } + else { + double <- px[grep(";", px$Residue.Both), ] + residues <- as.character(double$Residue.Both) + residues <- as.matrix(residues, ncol = 1) + split <- strsplit(residues, split = ";") + x <- sapply(split, length) + single <- data.frame( + Protein = rep(double$Protein, x), + Gene = rep(double$Gene, x), + Peptide = rep(double$Peptide, + x), + Residue.Both = unlist(split), + p = rep(double$p, + x), + FC = rep(double$FC, x) + ) + new <- px[-grep(";", px$Residue.Both), ] + new <- rbind(new, single) + colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") + new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) + new <- new[complete.cases(new$log2_fc), ] + } + if (networkin == TRUE) { + ksdata_filterd <- ksdata[grep("[a-z]", ksdata$Source), ] + ksdata_filterd <- ksdata_filterd[ + (ksdata_filterd$networkin_score >= networkin_cutoff), + ] + } + else { + ksdata_filterd <- ksdata[grep("PhosphoSitePlus", ksdata$Source), ] + } + ksdata_dataset <- merge(ksdata_filterd, new) + ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] + ksdata_dataset$uniprot_no_isoform <- sapply( + ksdata_dataset$KIN_ACC_ID, + function(x) unlist(strsplit(as.character(x), split = "-"))[1]) + ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] + colnames(ksdata_dataset_abbrev) <- c( + "Kinase.Gene", + "Substrate.Gene", + "Substrate.Mod", + "Peptide", + "p", + "FC", + "log2FC", + "Source" + ) + ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ + order( + ksdata_dataset_abbrev$Kinase.Gene, + ksdata_dataset_abbrev$Substrate.Gene, + ksdata_dataset_abbrev$Substrate.Mod, + ksdata_dataset_abbrev$p + ), + ] + ksdata_dataset_abbrev <- aggregate( + log2FC ~ Kinase.Gene + + Substrate.Gene + Substrate.Mod + Source, + data = ksdata_dataset_abbrev, + FUN = mean + ) + ksdata_dataset_abbrev <- + ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] + kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) + kinase_list <- as.matrix(table(kinase_list)) + mean_fc <- aggregate( + log2FC ~ Kinase.Gene, + data = ksdata_dataset_abbrev, + FUN = mean + ) + mean_fc <- mean_fc[order(mean_fc[, 1]), ] + mean_fc$m_s <- mean_fc[, 2] + mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = T)) + mean_fc$m <- kinase_list + mean_fc$z_score <- ((mean_fc$m_s - mean(new$log2_fc, na.rm = T)) * + sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = T) + mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) + mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") + mean_fc_filtered <- mean_fc[(mean_fc$m >= m_cutoff), -2] + mean_fc_filtered <- + mean_fc_filtered[abs(mean_fc_filtered$z_score) > z_threshold, -2] + mean_fc_filtered <- mean_fc_filtered[order(mean_fc_filtered$z_score), ] + plot_height <- length(mean_fc_filtered$z_score) ^ 0.55 + mean_fc_filtered$color <- "black" + mean_fc_filtered[ + (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score < 0), + ncol(mean_fc_filtered) + ] <- "blue" + mean_fc_filtered[ + (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score > 0), + ncol(mean_fc_filtered) + ] <- "red" + draw_barplot <- function() { + barplot( + as.numeric(mean_fc_filtered$z_score), + col = mean_fc_filtered$color, + border = NA, + xpd = F, + cex.names = 0.6, + cex.axis = 0.8, + xlab = "Kinase z-score", + names.arg = mean_fc_filtered$Kinase.Gene, + horiz = T, + las = 1, + main = main, + sub = sub + ) + } + if (export == "TRUE") { + tiff( + "KSEA Bar Plot.tiff", + width = 6 * 300, + height = 300 * plot_height, + res = 300, + pointsize = 13 + ) + par(mai = c(1, 1, 0.4, 0.4)) + draw_barplot() + dev.off() + } + else { + draw_barplot() + } +} + +ksea_app_prep_db <- c( + "mqppep.UT_Phospho_ST_Sites.sqlite", + "mqppep.pST_Sites_NancyDu.sqlite" + )[1] + +db <- dbConnect(RSQLite::SQLite(), ksea_app_prep_db) +contrast_metadata_df <- + sqldf("select * from contrast_lvl_lvl_metadata", connection = db) +rslt <- new_env() +rslt$score_list <- list() +rslt$name_list <- list() +#ACE rslt$usable_contrast_list <- list() +next_index <- 0 +low_fdr_print <- function(i) { + #ACE i <- rslt$usable_contrast_list[[i]] + if (!is.null(rslt$score_list[[i]])) { + display_metadata <- + contrast_metadata_df[i, c(4, 5, 6), drop = FALSE] + colnames(display_metadata) <- + c(paste0(contrast_metadata_df[i, 2], "_trt"), + paste0(contrast_metadata_df[i, 3], "_trt"), + "fold-change" + ) + print(display_metadata, row.names = FALSE) + #ACE print( + #ACE contrast_metadata_df[i, c(2, 4, 3, 5, 6), drop = FALSE], + #ACE row.names = FALSE) + #ACE print(contrast_metadata_df[i, ]) + if (sum(rslt$score_list[[i]]$fdr < 0.05) > 0) { + print( + rslt$score_list[[i]][rslt$score_list[[i]]$fdr < 0.05, ], + row.names = FALSE) + } else { + cat("No kinases enriched with FDR < 0.05\n") + } + } +} +err_na_subscr_df_const <- + "missing values are not allowed in subscripted assignments of data frames" + +for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { + contrast_label <- sprintf( + "%s -> %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + kseaapp_input <- + sqldf( + x = sprintf(" + SELECT `Protein`, `Gene`, `Peptide`, `Residue.Both`, `p`, `FC` + FROM v_kseaapp_input + WHERE contrast = %d + ", i_cntrst + ), + connection = db + ) + #ACE K_S_links <- + #ACE KSEA.KS_table( + #ACE KSEAapp::KSData, + #ACE kseaapp_input, + #ACE networkin = TRUE, + #ACE networkin_cutoff = 2 + #ACE ) + #ACE print(sprintf("contrast = %d", i_cntrst)) + tryCatch( + expr = { + main_title <- ( + sprintf( + "Change from treatment %s to treatment %s", + contrast_metadata_df[i_cntrst, "b_level"], + contrast_metadata_df[i_cntrst, "a_level"] + ) + ) + sub_title <- ( + sprintf( + "Trt %s = {%s}. Trt %s = {%s}.", + contrast_metadata_df[i_cntrst, "b_level"], + gsub( + pattern = ";", + replacement = ", ", + x = contrast_metadata_df[i_cntrst, "b_samples"], + fixed = TRUE + ), + contrast_metadata_df[i_cntrst, "a_level"], + gsub( + pattern = ";", + replacement = "", + x = contrast_metadata_df[i_cntrst, "a_samples"], + fixed = TRUE + ) + ) + ) + ksea_barplot( + KSEAapp::KSData, + kseaapp_input, + networkin = TRUE, + networkin_cutoff = 2, + m_cutoff = 2, + p_cutoff = 0.05, + export = FALSE, + main = main_title, + sub = sub_title + ) + }, + error = function(e) { + if (tolower(e$message) == err_na_subscr_df_const) { + cat( + sprintf("\nNo scores or plot for contrast '%s' because:\n '%s'\n", + contrast_label, + "There are too many missing or invalid values." + ), + file = stdout() + ) + } else { + cat( + sprintf("\nNo scores or plot for contrast '%s' because:\n '%s'\n", + contrast_label, + e$message + ), + file = stdout() + ) + } + } + ) + tryCatch( + expr = { + ksea_scores <- + KSEA.Scores( + KSEAapp::KSData, + kseaapp_input, + NetworKIN = TRUE, + NetworKIN.cutoff = 2 + ) + + if (0 < sum(!is.nan(ksea_scores$FDR))) { + next_index <- 1 + next_index + rslt$score_list[[next_index]] <- ksea_scores + #ACE rslt$usable_contrast_list[[next_index]] <- i_cntrst + rslt$name_list[[next_index]] <- contrast_label + cat("\n", rslt$name_list[[next_index]], "\n") + low_fdr_print(next_index) + } + }, + error = function(e) str(e) + ) +} +if (length(rslt$score_list) > 0) { + ksea_heatmap( + # the data frame outputs from the KSEA.Scores() function, in list format + score_list = rslt$score_list, + # a character vector of all the sample names for heatmap annotation: + # - the names must be in the same order as the data in score_list + # - please avoid long names, as they may get cropped in the final image + sample_labels = rslt$name_list, + # character string of either "p.value" or "FDR" indicating the data column + # to use for marking statistically significant scores + stats = c("p.value", "FDR")[2], + # a numeric value between 0 and infinity indicating the min. number of + # substrates a kinase must have to be included in the heatmap + m_cutoff = 1, + # a numeric value between 0 and 1 indicating the p-value/FDR cutoff + # for indicating significant kinases in the heatmap + p_cutoff = 0.05, + # a binary input of TRUE or FALSE, indicating whether or not to perform + # hierarchical clustering of the sample columns + sample_cluster = TRUE, + # a binary input of TRUE or FALSE, indicating whether or not to export + # the bar plot as a .tiff image into the working directory + export = FALSE + ) +} +dbDisconnect(db)