Mercurial > repos > eschen42 > mqppep_anova
view kseaapp_demo.R @ 27:2e35f925c469 draft
planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit be6750cb1d7ae18e71b60ae6f2c55e5331105c4f
author | eschen42 |
---|---|
date | Thu, 27 Oct 2022 18:54:25 +0000 |
parents | |
children |
line wrap: on
line source
# KSEAapp demo using SQLite data source library(sqldf) # implies that package RSQLite will be available library(KSEAapp) # implies that package gplots will be available # ref: http://adv-r.had.co.nz/Environments.html # "When creating your own environment, note that you should set its parent # environment to be the empty environment. This ensures you don't # accidentally inherit objects from somewhere else." # Caution: this prevents `with(my_env, expr)` from working when `expr` # contains anything from the global environment, even operators! # Hence, `x <- 1; get("x", new_env())` fails by design. new_env <- function() { new.env(parent = emptyenv()) } # Adapted from KSEAapp::KSEA.Heatmap to allow specification of output file ksea_heatmap <- function( # the data frame outputs from the KSEA.Scores() function, in list format score_list, # a character vector of all the sample names for heatmap annotation: # - the names must be in the same order as the data in score_list # - please avoid long names, as they may get cropped in the final image sample_labels, # character string of either "p.value" or "FDR" indicating the data column # to use for marking statistically significant scores stats, # a numeric value between 0 and infinity indicating the min. number of # substrates a kinase must have to be included in the heatmap m_cutoff, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap p_cutoff, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster, # a binary input of TRUE or FALSE, indicating whether or not to export # the bar plot as a .tiff image into the working directory export = FALSE ) { filter_m <- function(dataset, m_cutoff) { filtered <- dataset[(dataset$m >= m_cutoff), ] return(filtered) } score_list_m <- lapply(score_list, function(...) filter_m(..., m_cutoff)) for (i in seq_len(length(score_list_m))) { names <- colnames(score_list_m[[i]])[c(2:7)] colnames(score_list_m[[i]])[c(2:7)] <- paste(names, i, sep = ".") } master <- Reduce(function(...) merge(..., by = "Kinase.Gene", all = F), score_list_m) #ACE if (is.null(master)) return ("No significant kinase genes found") row.names(master) <- master$Kinase.Gene columns <- as.character(colnames(master)) merged_scores <- as.matrix(master[, grep("z.score", columns)]) colnames(merged_scores) <- sample_labels merged_stats <- as.matrix(master[, grep(stats, columns)]) asterisk <- function(matrix) { new <- data.frame() for (i in seq_len(nrow(matrix))) { for (j in seq_len(ncol(matrix))) { if (matrix[i, j] < p_cutoff) { new[i, j] <- "*" } else { new[i, j] <- "" } } } return(new) } merged_asterisk <- as.matrix(asterisk(merged_stats)) create_breaks <- function(merged_scores) { if (min(merged_scores) < -1.6) { breaks_neg <- seq(-1.6, 0, length.out = 30) breaks_neg <- append(seq(min(merged_scores), -1.6, length.out = 10), breaks_neg) breaks_neg <- sort(unique(breaks_neg)) } else { breaks_neg <- seq(-1.6, 0, length.out = 30) } if (max(merged_scores) > 1.6) { breaks_pos <- seq(0, 1.6, length.out = 30) breaks_pos <- append(breaks_pos, seq(1.6, max(merged_scores), length.out = 10)) breaks_pos <- sort(unique(breaks_pos)) } else { breaks_pos <- seq(0, 1.6, length.out = 30) } breaks_all <- unique(append(breaks_neg, breaks_pos)) mycol_neg <- gplots::colorpanel(n = length(breaks_neg), low = "blue", high = "white") mycol_pos <- gplots::colorpanel(n = length(breaks_pos) - 1, low = "white", high = "red") mycol <- unique(append(mycol_neg, mycol_pos)) color_breaks <- list(breaks_all, mycol) return(color_breaks) } color_breaks <- create_breaks(merged_scores) plot_height <- nrow(merged_scores) ^ 0.55 plot_width <- ncol(merged_scores) ^ 0.7 draw_heatmap <- function() gplots::heatmap.2( merged_scores, Colv = sample_cluster, scale = "none", cellnote = merged_asterisk, notecol = "white", cexCol = 0.9, cexRow = 0.9, srtCol = 45, notecex = 1.4, col = color_breaks[[2]], density.info = "none", trace = "none", key = F, breaks = color_breaks[[1]], lmat = rbind(c(0, 3), c(2, 1), c(0, 4)), lhei = c(0.4, 9.5, 0.6), lwid = c(0.5, 3), margins = c(2, 6) ) if (export == "TRUE") { png( "KSEA.Merged.Heatmap.png", width = plot_width * 300, height = plot_height * 300, res = 300, pointsize = 14 ) draw_heatmap() dev.off() } else { draw_heatmap() } } # Adapted from KSEAapp::KSEA.Barplot to allow: # - printing main and sub titles ksea_barplot <- function( # The Kinase-Substrate dataset uploaded from the file prefaced with # "PSP&networkin_" available from github.com/casecpb/KSEA/ ksdata = KSEAapp::KSData, # The experimental data file formatted as described in the # KSEAapp::KSEA.Complete() documentation px, # networkin = TRUE means inclusion of networkin predictions # networkin = FALSE means not to include and to ignore networkin_cutoff networkin, # A numeric value between 1 and infinity setting the minimum networkin score # (can be left out if networkin = FALSE) networkin_cutoff, # A numeric value between 0 and infinity indicating the min. # of substrates # a kinase must have to be included in the bar plot output m_cutoff, # A numeric value greater than zero indicating the p-value cutoff for # indicating significant kinases in the bar plot p_cutoff, # A numeric value between 0 and 1 indicating the minimum magnitude of # z-value cutoff for adding a kinase to the bar plot z_threshold = 1.0, # A binary input of TRUE or FALSE, indicating whether or not to export # the bar plot as a .tiff image into the working directory export = FALSE, # This remaing arguments are passed through to barplot main = "", sub = "" ) { if (length(grep(";", px$Residue.Both)) == 0) { new <- px colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) new <- new[complete.cases(new$log2_fc), ] } else { double <- px[grep(";", px$Residue.Both), ] residues <- as.character(double$Residue.Both) residues <- as.matrix(residues, ncol = 1) split <- strsplit(residues, split = ";") x <- sapply(split, length) single <- data.frame( Protein = rep(double$Protein, x), Gene = rep(double$Gene, x), Peptide = rep(double$Peptide, x), Residue.Both = unlist(split), p = rep(double$p, x), FC = rep(double$FC, x) ) new <- px[-grep(";", px$Residue.Both), ] new <- rbind(new, single) colnames(new)[c(2, 4)] <- c("SUB_GENE", "SUB_MOD_RSD") new$log2_fc <- log2(abs(as.numeric(as.character(new$FC)))) new <- new[complete.cases(new$log2_fc), ] } if (networkin == TRUE) { ksdata_filterd <- ksdata[grep("[a-z]", ksdata$Source), ] ksdata_filterd <- ksdata_filterd[ (ksdata_filterd$networkin_score >= networkin_cutoff), ] } else { ksdata_filterd <- ksdata[grep("PhosphoSitePlus", ksdata$Source), ] } ksdata_dataset <- merge(ksdata_filterd, new) ksdata_dataset <- ksdata_dataset[order(ksdata_dataset$GENE), ] ksdata_dataset$uniprot_no_isoform <- sapply( ksdata_dataset$KIN_ACC_ID, function(x) unlist(strsplit(as.character(x), split = "-"))[1]) ksdata_dataset_abbrev <- ksdata_dataset[, c(5, 1, 2, 16:19, 14)] colnames(ksdata_dataset_abbrev) <- c( "Kinase.Gene", "Substrate.Gene", "Substrate.Mod", "Peptide", "p", "FC", "log2FC", "Source" ) ksdata_dataset_abbrev <- ksdata_dataset_abbrev[ order( ksdata_dataset_abbrev$Kinase.Gene, ksdata_dataset_abbrev$Substrate.Gene, ksdata_dataset_abbrev$Substrate.Mod, ksdata_dataset_abbrev$p ), ] ksdata_dataset_abbrev <- aggregate( log2FC ~ Kinase.Gene + Substrate.Gene + Substrate.Mod + Source, data = ksdata_dataset_abbrev, FUN = mean ) ksdata_dataset_abbrev <- ksdata_dataset_abbrev[order(ksdata_dataset_abbrev$Kinase.Gene), ] kinase_list <- as.vector(ksdata_dataset_abbrev$Kinase.Gene) kinase_list <- as.matrix(table(kinase_list)) mean_fc <- aggregate( log2FC ~ Kinase.Gene, data = ksdata_dataset_abbrev, FUN = mean ) mean_fc <- mean_fc[order(mean_fc[, 1]), ] mean_fc$m_s <- mean_fc[, 2] mean_fc$enrichment <- mean_fc$m_s / abs(mean(new$log2_fc, na.rm = T)) mean_fc$m <- kinase_list mean_fc$z_score <- ((mean_fc$m_s - mean(new$log2_fc, na.rm = T)) * sqrt(mean_fc$m)) / sd(new$log2_fc, na.rm = T) mean_fc$p_value <- pnorm(-abs(mean_fc$z_score)) mean_fc$fdr <- p.adjust(mean_fc$p_value, method = "fdr") mean_fc_filtered <- mean_fc[(mean_fc$m >= m_cutoff), -2] mean_fc_filtered <- mean_fc_filtered[abs(mean_fc_filtered$z_score) > z_threshold, -2] mean_fc_filtered <- mean_fc_filtered[order(mean_fc_filtered$z_score), ] plot_height <- length(mean_fc_filtered$z_score) ^ 0.55 mean_fc_filtered$color <- "black" mean_fc_filtered[ (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score < 0), ncol(mean_fc_filtered) ] <- "blue" mean_fc_filtered[ (mean_fc_filtered$p_value < p_cutoff) & (mean_fc_filtered$z_score > 0), ncol(mean_fc_filtered) ] <- "red" draw_barplot <- function() { barplot( as.numeric(mean_fc_filtered$z_score), col = mean_fc_filtered$color, border = NA, xpd = F, cex.names = 0.6, cex.axis = 0.8, xlab = "Kinase z-score", names.arg = mean_fc_filtered$Kinase.Gene, horiz = T, las = 1, main = main, sub = sub ) } if (export == "TRUE") { tiff( "KSEA Bar Plot.tiff", width = 6 * 300, height = 300 * plot_height, res = 300, pointsize = 13 ) par(mai = c(1, 1, 0.4, 0.4)) draw_barplot() dev.off() } else { draw_barplot() } } ksea_app_prep_db <- c( "mqppep.UT_Phospho_ST_Sites.sqlite", "mqppep.pST_Sites_NancyDu.sqlite" )[1] db <- dbConnect(RSQLite::SQLite(), ksea_app_prep_db) contrast_metadata_df <- sqldf("select * from contrast_lvl_lvl_metadata", connection = db) rslt <- new_env() rslt$score_list <- list() rslt$name_list <- list() #ACE rslt$usable_contrast_list <- list() next_index <- 0 low_fdr_print <- function(i) { #ACE i <- rslt$usable_contrast_list[[i]] if (!is.null(rslt$score_list[[i]])) { display_metadata <- contrast_metadata_df[i, c(4, 5, 6), drop = FALSE] colnames(display_metadata) <- c(paste0(contrast_metadata_df[i, 2], "_trt"), paste0(contrast_metadata_df[i, 3], "_trt"), "fold-change" ) print(display_metadata, row.names = FALSE) #ACE print( #ACE contrast_metadata_df[i, c(2, 4, 3, 5, 6), drop = FALSE], #ACE row.names = FALSE) #ACE print(contrast_metadata_df[i, ]) if (sum(rslt$score_list[[i]]$fdr < 0.05) > 0) { print( rslt$score_list[[i]][rslt$score_list[[i]]$fdr < 0.05, ], row.names = FALSE) } else { cat("No kinases enriched with FDR < 0.05\n") } } } err_na_subscr_df_const <- "missing values are not allowed in subscripted assignments of data frames" for (i_cntrst in seq_len(nrow(contrast_metadata_df))) { contrast_label <- sprintf( "%s -> %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) kseaapp_input <- sqldf( x = sprintf(" SELECT `Protein`, `Gene`, `Peptide`, `Residue.Both`, `p`, `FC` FROM v_kseaapp_input WHERE contrast = %d ", i_cntrst ), connection = db ) #ACE K_S_links <- #ACE KSEA.KS_table( #ACE KSEAapp::KSData, #ACE kseaapp_input, #ACE networkin = TRUE, #ACE networkin_cutoff = 2 #ACE ) #ACE print(sprintf("contrast = %d", i_cntrst)) tryCatch( expr = { main_title <- ( sprintf( "Change from treatment %s to treatment %s", contrast_metadata_df[i_cntrst, "b_level"], contrast_metadata_df[i_cntrst, "a_level"] ) ) sub_title <- ( sprintf( "Trt %s = {%s}. Trt %s = {%s}.", contrast_metadata_df[i_cntrst, "b_level"], gsub( pattern = ";", replacement = ", ", x = contrast_metadata_df[i_cntrst, "b_samples"], fixed = TRUE ), contrast_metadata_df[i_cntrst, "a_level"], gsub( pattern = ";", replacement = "", x = contrast_metadata_df[i_cntrst, "a_samples"], fixed = TRUE ) ) ) ksea_barplot( KSEAapp::KSData, kseaapp_input, networkin = TRUE, networkin_cutoff = 2, m_cutoff = 2, p_cutoff = 0.05, export = FALSE, main = main_title, sub = sub_title ) }, error = function(e) { if (tolower(e$message) == err_na_subscr_df_const) { cat( sprintf("\nNo scores or plot for contrast '%s' because:\n '%s'\n", contrast_label, "There are too many missing or invalid values." ), file = stdout() ) } else { cat( sprintf("\nNo scores or plot for contrast '%s' because:\n '%s'\n", contrast_label, e$message ), file = stdout() ) } } ) tryCatch( expr = { ksea_scores <- KSEA.Scores( KSEAapp::KSData, kseaapp_input, NetworKIN = TRUE, NetworKIN.cutoff = 2 ) if (0 < sum(!is.nan(ksea_scores$FDR))) { next_index <- 1 + next_index rslt$score_list[[next_index]] <- ksea_scores #ACE rslt$usable_contrast_list[[next_index]] <- i_cntrst rslt$name_list[[next_index]] <- contrast_label cat("\n", rslt$name_list[[next_index]], "\n") low_fdr_print(next_index) } }, error = function(e) str(e) ) } if (length(rslt$score_list) > 0) { ksea_heatmap( # the data frame outputs from the KSEA.Scores() function, in list format score_list = rslt$score_list, # a character vector of all the sample names for heatmap annotation: # - the names must be in the same order as the data in score_list # - please avoid long names, as they may get cropped in the final image sample_labels = rslt$name_list, # character string of either "p.value" or "FDR" indicating the data column # to use for marking statistically significant scores stats = c("p.value", "FDR")[2], # a numeric value between 0 and infinity indicating the min. number of # substrates a kinase must have to be included in the heatmap m_cutoff = 1, # a numeric value between 0 and 1 indicating the p-value/FDR cutoff # for indicating significant kinases in the heatmap p_cutoff = 0.05, # a binary input of TRUE or FALSE, indicating whether or not to perform # hierarchical clustering of the sample columns sample_cluster = TRUE, # a binary input of TRUE or FALSE, indicating whether or not to export # the bar plot as a .tiff image into the working directory export = FALSE ) } dbDisconnect(db)