mqppep_anova: mqppep_anova_script.Rmd comparison

comparison mqppep_anova_script.Rmd @ 15:2c5f1a2fe16a draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 96659062ea07ac43d139746b0d119f1ee020f9cd"

author	eschen42
date	Sat, 26 Mar 2022 02:27:12 +0000
parents	6679616d0c18
children	61adb8801b73

comparison

equal deleted inserted replaced

-:6679616d0c18
+:2c5f1a2fe16a
 pdf_document:
 toc: true
 latex_document:
 toc: true
 params:
-inputFile: "test-data/test_input_for_anova.tabular"
 alphaFile: "test-data/alpha_levels.tabular"
-firstDataColumn: "Intensity"
+inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular"
+firstDataColumn: "^Intensity[^_]"
 imputationMethod: !r c("group-median", "median", "mean", "random")[4]
 meanPercentile: 1
 sdPercentile: 1.0
 regexSampleNames: "\\.\\d+[A-Z]$"
 regexSampleGrouping: "\\d+"
-imputedDataFilename: "test-data/imputedDataFilename.txt"
+imputedDataFilename: "test-data/limbo/imputedDataFilename.txt"
-imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt"
+imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt"
 show_toc: true
 ---
 <!--
+alphaFile: "test-data/alpha_levels.tabular"
+inputFile: "test-data/test_input_for_anova.tabular"
+inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular"
+inputFile: "test-data/density_failure.preproc_tab.tabular"
 latex_document: default
-inputFile: "test-data/test_input_for_anova.tabular"
-inputFile: "test-data/density_failure.preproc_tab.tabular"
-inputFile: "test-data/UT_Phospho_STY_Sites.preproc_tab"
-date: "May 28, 2018; Mar 16, 2022"
 -->
 ```{r setup, include = FALSE}
 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10))
 const_stripchart_cex <- 0.5
 const_stripsmall_cex <-
 sqrt(const_stripchart_cex * const_stripchart_cex / 2)
 const_stripchart_jitter <- 0.3
 const_write_debug_files <- FALSE
+const_table_anchor <- "tbp"
 ### FUNCTIONS
 #ANOVA filter function
 anova_func <- function(x, grouping_factor) {
 latex_collapsed_vector(" & ", v)
 cat(" \\\\\n")
 }
 # Use this like print.data.frame, from which it is adapted:
-print_data_frame_latex <-
+data_frame_latex <-
 function(
 x,
 ...,
 # digits to pass to format.data.frame
 digits = NULL,
 # maximumn number of rows to print
 max = NULL,
 # string with justification of each column
 justification = NULL,
 # TRUE to center on page
-centered = FALSE,
+centered = TRUE,
 # optional capttion
 caption = NULL,
 # h(inline); b(bottom); t (top) or p (separate page)
 anchor = "h"
 ) {
 invisible(x)
 }
 ```
-## Purpose:
+## Purpose
 Perform imputation of missing values, quantile normalization, and ANOVA.
-<!--
-## Variables to change for each input file
--->
 ```{r include = FALSE}
 # Input Filename
 input_file <- params$inputFile
 # First data column - ideally, this could be detected via regexSampleNames,
 #   but for now leave it as is.
 first_data_column <- params$firstDataColumn
-fdc_is_integer <- TRUE
+fdc_is_integer <- is.integer(first_data_column)
-first_data_column <- withCallingHandlers(
+if (fdc_is_integer) {
-as.integer(first_data_column)
+first_data_column <- as.integer(params$firstDataColumn)
-, warning = function(w) fdc_is_integer <<- FALSE
-)
-if (FALSE == fdc_is_integer) {
-first_data_column <- params$firstDataColumn
 }
 # False discovery rate adjustment for ANOVA
 #  Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05
 val_fdr <-
-read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1]
+read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")
+if (
+ncol(val_fdr) != 1 ||
+sum(!is.numeric(val_fdr[, 1])) ||
+sum(val_fdr[, 1] < 0) ||
+sum(val_fdr[, 1] > 1)
+) {
+stop("alphaFile should be one column of numbers within the range [0.0,1.0]")
+}
+val_fdr <- val_fdr[, 1]
 #Imputed Data filename
 imputed_data_filename <- params$imputedDataFilename
 imp_qn_lt_data_filenm <- params$imputedQNLTDataFile
 quote = "",
 check.names = FALSE
 )
 ```
-### Parse column names, sample names, and factor levels from input file
+## Extract Sample Names and Factor Levels
+Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2.
 ```{r echo = FALSE, results = 'asis'}
-# Write column naames as an enumerated list.
-column_name_df <- data.frame(
-column = seq_len(length(colnames(full_data))),
-name = colnames(full_data)
-)
-print_data_frame_latex(
-x = column_name_df,
-justification = "l l",
-centered = TRUE,
-caption = "Input data column name",
-anchor = "h"
-)
 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE)
-cat(
-sprintf(
+if (!fdc_is_integer) {
-"\n\nData columns: [%d,%d]\n\n",
-min(data_column_indices),
-max(data_column_indices)
-)
-)
-if (FALSE == fdc_is_integer) {
 if (length(data_column_indices) > 0) {
 first_data_column <- data_column_indices[1]
 } else {
 stop(paste("failed to convert firstDataColumn:", first_data_column))
 }
 }
+cat(
+sprintf(
+paste(
+"\n\nPeptide-intensity data for each sample is",
+"in one of columns %d through %d.\n\n"
+),
+min(data_column_indices),
+max(data_column_indices)
+)
+)
+# Write column names as a LaTeX enumerated list.
+column_name_df <- data.frame(
+column = seq_len(length(colnames(full_data))),
+name = colnames(full_data)
+)
+data_frame_latex(
+x = column_name_df,
+justification = "l l",
+centered = TRUE,
+caption = "Input data column name",
+anchor = const_table_anchor
+)
 ```
 ```{r echo = FALSE, results = 'asis'}
 #```{r echo = FALSE, results = 'asis'}
 number_of_samples <- length(sample_name_matches)
 sample_factor_df <- data.frame(
 sample = sample_name_matches,
 level = sample_factor_levels
 )
-print_data_frame_latex(
+data_frame_latex(
 x = sample_factor_df,
 justification = "c c",
 centered = TRUE,
 caption = "Factor level",
-anchor = "h"
+anchor = const_table_anchor
 )
 ```
 ```{r echo = FALSE, results = 'asis'}
 cat("\\newpage\n")
 ```
 # Vertical plot
 boxplot(
 quant_data_log
 , las = 1
 , col = const_boxplot_fill
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+, xlab = "Sample"
 )
-# Points
-stripchart(
-quant_data_log, # Data
-method = "jitter",          # Random noise
-jitter = const_stripchart_jitter,
-pch = 19,                   # Pch symbols
-cex = const_stripchart_cex, # Size of symbols reduced
-col = "goldenrod",          # Color of the symbol
-vertical = TRUE,            # Vertical mode
-add = TRUE                  # Add it over
-)
 par(old_par)
 cat("\n\n\n")
 if (nrow(quant_data_log) > 1) {
 quant_data_log_stack <- stack(quant_data_log)
 ggplot(
 quant_data_log_stack,
 aes(x = values)
-) +
+) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+ylab("Probability density") +
 geom_density(
 aes(group = ind, colour = ind),
 na.rm = TRUE
 )
 } else {
 ```
 ### Globally, are peptide intensities are approximately unimodal?
 <!--
-# ref for bquote below particularly and plotting math expressions generally:
+# bquote could be used as an alternative to latex2exp::TeX below particularly
+#   and when plotting math expressions generally, at the expense of mastering
+#   another syntax, which hardly seems worthwhile when I need to use TeX
+#   elsewhere; here's an introduction to bquote:
 #   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
 -->
 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'}
 # identify the location of missing values
 fin <- is.finite(as.numeric(as.matrix(quant_data_log)))
 logvalues <- as.numeric(as.matrix(quant_data_log))[fin]
+logvalues_density <- density(logvalues)
 plot(
-density(logvalues),
+x = logvalues_density,
-main = bquote(
+main = latex2exp::TeX(
-"Smoothed estimated probability density vs." ~ log[10](intensity)),
+"Smoothed estimated probability density vs. $log_{10}$(peptide intensity)"
-xlab = bquote(log[10](intensity))
+),
+xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
+ylab = "Probability density"
 )
 hist(
-x = as.numeric(as.matrix(quant_data_log))
+x = as.numeric(as.matrix(quant_data_log)),
-, breaks = 100
+xlim = c(min(logvalues_density$x), max(logvalues_density$x)),
-, main = bquote("Frequency vs." ~ log[10](intensity))
+breaks = 100,
-, xlab = bquote(log[10](intensity))
+main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"),
+xlab = latex2exp::TeX("$log_{10}$(peptide intensity)")
 )
 ```
 ### Distribution of standard deviations of $log_{10}(\text{intensity})$, ignoring missing values:
 if (sum(!is.na(sds)) > 2) {
 plot(
 density(sds, na.rm = T)
 , main = "Smoothed estimated probability density vs. std. deviation"
 , sub = "(probability estimation made with Gaussian smoothing)"
+, ylab = "Probability density"
 )
 } else {
 cat(
 "At least two non-missing values are required to plot",
 "probability density.\n\n"
 switch(
 imputation_method
 , "group-median" = {
 imputation_method_description <-
 paste("Substitute missing value with",
-"median peptide intensity for sample group\n"
+"median peptide intensity for sample group.\n"
 )
 sample_level_integers <- as.integer(sample_factor_levels)
 for (i in seq_len(length(levels(sample_factor_levels)))) {
 level_cols <- i == sample_level_integers
 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE)
 good_rows <- !is.na(rowMeans(quant_data_imp))
 }
 , "median" = {
 imputation_method_description <-
 paste("Substitute missing value with",
-"median peptide intensity across all sample classes\n"
+"median peptide intensity across all sample classes.\n"
 )
 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]]
 good_rows <- !is.na(rowMeans(quant_data_imp))
 }
 , "mean" = {
 imputation_method_description <-
 paste("Substitute missing value with",
-"mean peptide intensity across all sample classes\n"
+"mean peptide intensity across all sample classes.\n"
 )
 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]]
 good_rows <- !is.na(rowMeans(quant_data_imp))
 }
 , "random" = {
 # If you want results to be reproducible, you will want to call
 #   base::set.seed before calling stats::rnorm
 imputation_method_description <-
 paste("Substitute each missing value with random intensity",
 sprintf(
-"random intensity $N \\sim (%0.2f, %0.2f)$\n",
+"random intensity $N \\sim (%0.2f, %0.2f)$.\n",
 q1, m1
 )
 )
 cat(sprintf("mean_percentile (from input parameter) is %2.0f\n\n",
 100 * mean_percentile))
 cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n",
 sd_percentile))
-#ACE cat(sprintf("sd for rnorm is %0.4f\n\n", m1))
 quant_data_imp[ind] <-
 10 ^ rnorm(number_to_impute, mean = q1, sd = m1)
 good_rows <- !is.na(rowMeans(quant_data_imp))
 }
 )
 log10(quant_data_imp)
 write_debug_file(quant_data_imp_log10)
 red_dots <- quant_data_imp_log10 * x
-ylim <- c(
+count_red <- sum(!is.na(red_dots))
+count_blue <- sum(!is.na(blue_dots))
+ylim_save <- ylim <- c(
 min(red_dots, blue_dots, na.rm = TRUE),
 max(red_dots, blue_dots, na.rm = TRUE)
 )
-# ref: https://r-charts.com/distribution/add-points-boxplot/
+show_stripchart <-
+50 > (count_red + count_blue) / length(sample_name_matches)
+if (show_stripchart) {
+boxplot_sub <- "Light blue = data before imputation; Red = imputed data"
+} else {
+boxplot_sub <- ""
+}
 # Vertical plot
 colnames(blue_dots) <- sample_name_matches
 boxplot(
 blue_dots
 , las = 1 # "always horizontal"
 , col = const_boxplot_fill
 , ylim = ylim
 , main = "Peptide intensities before and after imputation"
-, sub = "Light blue = data before imputation; Red = imputed data"
+, sub = boxplot_sub
 , xlab = "Sample"
-, ylab = "log10(peptide intensity)"
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
 )
-# Points
-# NA values are not plotted
+if (show_stripchart) {
-stripchart(
+# Points
-blue_dots,                 # Data
+# ref: https://r-charts.com/distribution/add-points-boxplot/
-method = "jitter",          # Random noise
+# NA values are not plotted
-jitter = const_stripchart_jitter,
+stripchart(
-pch = 19,                   # Pch symbols
+blue_dots,                 # Data
-cex = const_stripsmall_cex, # Size of symbols reduced
+method = "jitter",          # Random noise
-col = "lightblue",          # Color of the symbol
+jitter = const_stripchart_jitter,
-vertical = TRUE,            # Vertical mode
+pch = 19,                   # Pch symbols
-add = TRUE                  # Add it over
+cex = const_stripsmall_cex, # Size of symbols reduced
-)
+col = "lightblue",          # Color of the symbol
-stripchart(
+vertical = TRUE,            # Vertical mode
-red_dots,                   # Data
+add = TRUE                  # Add it over
-method = "jitter",          # Random noise
+)
-jitter = const_stripchart_jitter,
+stripchart(
-pch = 19,                   # Pch symbols
+red_dots,                   # Data
-cex = const_stripsmall_cex, # Size of symbols reduced
+method = "jitter",          # Random noise
-col = "red",                # Color of the symbol
+jitter = const_stripchart_jitter,
-vertical = TRUE,            # Vertical mode
+pch = 19,                   # Pch symbols
-add = TRUE                  # Add it over
+cex = const_stripsmall_cex, # Size of symbols reduced
-)
+col = "red",                # Color of the symbol
+vertical = TRUE,            # Vertical mode
+add = TRUE                  # Add it over
+)
+} else {
+# violin plot
+cat("\\leavevmode\n\\quad\n\n\\quad\n\n")
+vioplot::vioplot(
+x = lapply(blue_dots, function(x) x[!is.na(x)]),
+col = "lightblue1",
+side = "left",
+plotCentre = "line",
+ylim = ylim_save,
+main = "Distributions of observed and imputed data",
+sub = "Light blue = observed data; Pink = imputed data",
+xlab = "Sample",
+ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
+)
+vioplot::vioplot(
+x = lapply(red_dots, function(x) x[!is.na(x)]),
+col = "lightpink1",
+side = "right",
+plotCentre = "line",
+add = T
+)
+}
 par(old_par)
 # density plot
 cat("\\leavevmode\n\n\n\n\n\n\n")
 if (can_plot_before_after_imp) {
 paste(
 "Blue = data before imputation; Red = imputed data;",
 "Black = combined"
 ),
 main = "Density of peptide intensity before and after imputation",
-xlab = "log10(peptide intensity)",
+xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"),
 ylab = "Probability density"
 )
 lines(d_original, col = "blue")
 lines(d_imputed, col = "red")
 } else {
 colnames(quant_data_log) <- sample_name_matches
 boxplot(
 quant_data_log
 , las = 1
 , col = const_boxplot_fill
-)
+, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
-# Points
+, xlab = "Sample"
-stripchart(
+)
-quant_data_log, # Data
-method = "jitter",          # Random noise
-jitter = const_stripchart_jitter,
-pch = 19,                   # Pch symbols
-cex = const_stripchart_cex, # Size of symbols reduced
-col = "goldenrod",          # Color of the symbol
-vertical = TRUE,            # Vertical mode
-add = TRUE                  # Add it over
-)
 par(old_par)
 } else {
 cat("There are no peptides to plot\n")
 }
 if (nrow(quant_data_log) > 1) {
 quant_data_log_stack <- stack(quant_data_log)
 ggplot(
 quant_data_log_stack,
 aes(x = values)
-) +
+) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) +
+ylab("Probability density") +
 geom_density(
 aes(group = ind, colour = ind),
 na.rm = TRUE
 )
 } else {
 ```
 ```{r echo = FALSE, results = 'asis'}
 cat("\\leavevmode\\newpage\n")
 ```
-## Perform ANOVA filters
+## Perform ANOVA Filters
 ```{r, echo = FALSE}
 # Make new data frame containing only Phosphopeptides
 #   to connect preANOVA to ANOVA (connect_df)
 connect_df <- data.frame(
 boxplot(
 filtered_data_filtered,
 main = "Imputed, normalized intensities", # no line plot
 las = 1,
 col = const_boxplot_fill,
-ylab = expression(log[10](intensity))
+ylab = latex2exp::TeX("$log_{10}$(peptide intensity)")
 )
-# Points
-stripchart(
-filtered_data_filtered,     # Data
-method = "jitter",          # Random noise
-jitter = const_stripchart_jitter,
-pch = 19,                   # Pch symbols
-cex = const_stripchart_cex, # Size of symbols reduced
-col = "goldenrod",          # Color of the symbol
-vertical = TRUE,            # Vertical mode
-add = TRUE                  # Add it over
-)
 par(old_par)
 } else {
 cat(sprintf(
 "%s < %0.2f\n\n\n\n\n",
-"No peptides were found to have cutoff adjusted p-value <",
+"No peptides were found to have cutoff adjusted p-value",
 cutoff
 ))
 }
 if (nrow(filtered_data_filtered) > 0) {
 "whose adjusted p-value < %0.2f\n",
 cutoff)
 )
 } else {
 if (nrow(m) == 1) {
+next
+} else {
 cat(
 sprintf("Heatmap for %d usable peptides whose", nrow(m)),
 sprintf("adjusted p-value < %0.2f\n", cutoff)
 )
-next
 }
 }
 cat("\n\n\n")
 cat("\n\n\n")
 try(
 }
 }
 }
 cat("\\leavevmode\n\n\n")
 ```
-<!--
-## Peptide IDs, etc.
-See output files.
--->

Mercurial > repos > eschen42 > mqppep_anova

comparison mqppep_anova_script.Rmd @ 15:2c5f1a2fe16a draft