Mercurial > repos > eschen42 > mqppep_anova
diff mqppep_anova_script.Rmd @ 15:2c5f1a2fe16a draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 96659062ea07ac43d139746b0d119f1ee020f9cd"
author | eschen42 |
---|---|
date | Sat, 26 Mar 2022 02:27:12 +0000 |
parents | 6679616d0c18 |
children | 61adb8801b73 |
line wrap: on
line diff
--- a/mqppep_anova_script.Rmd Tue Mar 22 23:12:18 2022 +0000 +++ b/mqppep_anova_script.Rmd Sat Mar 26 02:27:12 2022 +0000 @@ -8,24 +8,24 @@ latex_document: toc: true params: - inputFile: "test-data/test_input_for_anova.tabular" alphaFile: "test-data/alpha_levels.tabular" - firstDataColumn: "Intensity" + inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" + firstDataColumn: "^Intensity[^_]" imputationMethod: !r c("group-median", "median", "mean", "random")[4] meanPercentile: 1 sdPercentile: 1.0 regexSampleNames: "\\.\\d+[A-Z]$" regexSampleGrouping: "\\d+" - imputedDataFilename: "test-data/imputedDataFilename.txt" - imputedQNLTDataFile: "test-data/imputedQNLTDataFile.txt" + imputedDataFilename: "test-data/limbo/imputedDataFilename.txt" + imputedQNLTDataFile: "test-data/limbo/imputedQNLTDataFile.txt" show_toc: true --- <!-- - latex_document: default + alphaFile: "test-data/alpha_levels.tabular" inputFile: "test-data/test_input_for_anova.tabular" + inputFile: "test-data/UT_Phospho_ST_Sites.preproc.tabular" inputFile: "test-data/density_failure.preproc_tab.tabular" - inputFile: "test-data/UT_Phospho_STY_Sites.preproc_tab" -date: "May 28, 2018; Mar 16, 2022" + latex_document: default --> ```{r setup, include = FALSE} # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 @@ -44,6 +44,7 @@ sqrt(const_stripchart_cex * const_stripchart_cex / 2) const_stripchart_jitter <- 0.3 const_write_debug_files <- FALSE +const_table_anchor <- "tbp" ### FUNCTIONS @@ -103,7 +104,7 @@ } # Use this like print.data.frame, from which it is adapted: -print_data_frame_latex <- +data_frame_latex <- function( x, ..., @@ -116,7 +117,7 @@ # string with justification of each column justification = NULL, # TRUE to center on page - centered = FALSE, + centered = TRUE, # optional capttion caption = NULL, # h(inline); b(bottom); t (top) or p (separate page) @@ -200,13 +201,10 @@ ``` -## Purpose: +## Purpose Perform imputation of missing values, quantile normalization, and ANOVA. -<!-- -## Variables to change for each input file ---> ```{r include = FALSE} # Input Filename input_file <- params$inputFile @@ -214,19 +212,25 @@ # First data column - ideally, this could be detected via regexSampleNames, # but for now leave it as is. first_data_column <- params$firstDataColumn -fdc_is_integer <- TRUE -first_data_column <- withCallingHandlers( - as.integer(first_data_column) - , warning = function(w) fdc_is_integer <<- FALSE - ) -if (FALSE == fdc_is_integer) { - first_data_column <- params$firstDataColumn +fdc_is_integer <- is.integer(first_data_column) +if (fdc_is_integer) { + first_data_column <- as.integer(params$firstDataColumn) } # False discovery rate adjustment for ANOVA # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 val_fdr <- - read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] + read.table(file = params$alphaFile, sep = "\t", header = F, quote = "") + +if ( + ncol(val_fdr) != 1 || + sum(!is.numeric(val_fdr[, 1])) || + sum(val_fdr[, 1] < 0) || + sum(val_fdr[, 1] > 1) +) { + stop("alphaFile should be one column of numbers within the range [0.0,1.0]") +} +val_fdr <- val_fdr[, 1] #Imputed Data filename imputed_data_filename <- params$imputedDataFilename @@ -274,32 +278,15 @@ ) ``` -### Parse column names, sample names, and factor levels from input file +## Extract Sample Names and Factor Levels + +Column names parsed from input file are shown in Table 1; sample names and factor levels, in Table 2. ```{r echo = FALSE, results = 'asis'} -# Write column naames as an enumerated list. -column_name_df <- data.frame( - column = seq_len(length(colnames(full_data))), - name = colnames(full_data) - ) -print_data_frame_latex( - x = column_name_df, - justification = "l l", - centered = TRUE, - caption = "Input data column name", - anchor = "h" - ) data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) -cat( - sprintf( - "\n\nData columns: [%d,%d]\n\n", - min(data_column_indices), - max(data_column_indices) - ) - ) -if (FALSE == fdc_is_integer) { +if (!fdc_is_integer) { if (length(data_column_indices) > 0) { first_data_column <- data_column_indices[1] } else { @@ -307,6 +294,30 @@ } } +cat( + sprintf( + paste( + "\n\nPeptide-intensity data for each sample is", + "in one of columns %d through %d.\n\n" + ), + min(data_column_indices), + max(data_column_indices) + ) + ) + +# Write column names as a LaTeX enumerated list. +column_name_df <- data.frame( + column = seq_len(length(colnames(full_data))), + name = colnames(full_data) + ) +data_frame_latex( + x = column_name_df, + justification = "l l", + centered = TRUE, + caption = "Input data column name", + anchor = const_table_anchor + ) + ``` ```{r echo = FALSE, results = 'asis'} @@ -336,12 +347,12 @@ sample = sample_name_matches, level = sample_factor_levels ) -print_data_frame_latex( +data_frame_latex( x = sample_factor_df, justification = "c c", centered = TRUE, caption = "Factor level", - anchor = "h" + anchor = const_table_anchor ) ``` ```{r echo = FALSE, results = 'asis'} @@ -370,18 +381,9 @@ quant_data_log , las = 1 , col = const_boxplot_fill +, ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") +, xlab = "Sample" ) -# Points -stripchart( - quant_data_log, # Data - method = "jitter", # Random noise - jitter = const_stripchart_jitter, - pch = 19, # Pch symbols - cex = const_stripchart_cex, # Size of symbols reduced - col = "goldenrod", # Color of the symbol - vertical = TRUE, # Vertical mode - add = TRUE # Add it over - ) par(old_par) @@ -398,7 +400,8 @@ ggplot( quant_data_log_stack, aes(x = values) - ) + + ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ylab("Probability density") + geom_density( aes(group = ind, colour = ind), na.rm = TRUE @@ -411,7 +414,10 @@ ### Globally, are peptide intensities are approximately unimodal? <!-- -# ref for bquote below particularly and plotting math expressions generally: +# bquote could be used as an alternative to latex2exp::TeX below particularly +# and when plotting math expressions generally, at the expense of mastering +# another syntax, which hardly seems worthwhile when I need to use TeX +# elsewhere; here's an introduction to bquote: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ --> ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5), results = 'asis'} @@ -420,17 +426,21 @@ fin <- is.finite(as.numeric(as.matrix(quant_data_log))) logvalues <- as.numeric(as.matrix(quant_data_log))[fin] +logvalues_density <- density(logvalues) plot( - density(logvalues), - main = bquote( - "Smoothed estimated probability density vs." ~ log[10](intensity)), - xlab = bquote(log[10](intensity)) + x = logvalues_density, + main = latex2exp::TeX( + "Smoothed estimated probability density vs. $log_{10}$(peptide intensity)" + ), + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), + ylab = "Probability density" ) hist( - x = as.numeric(as.matrix(quant_data_log)) -, breaks = 100 -, main = bquote("Frequency vs." ~ log[10](intensity)) -, xlab = bquote(log[10](intensity)) + x = as.numeric(as.matrix(quant_data_log)), + xlim = c(min(logvalues_density$x), max(logvalues_density$x)), + breaks = 100, + main = latex2exp::TeX("Frequency vs. $log_{10}$(peptide intensity)"), + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) ``` @@ -452,6 +462,7 @@ density(sds, na.rm = T) , main = "Smoothed estimated probability density vs. std. deviation" , sub = "(probability estimation made with Gaussian smoothing)" + , ylab = "Probability density" ) } else { cat( @@ -510,7 +521,7 @@ , "group-median" = { imputation_method_description <- paste("Substitute missing value with", - "median peptide intensity for sample group\n" + "median peptide intensity for sample group.\n" ) sample_level_integers <- as.integer(sample_factor_levels) for (i in seq_len(length(levels(sample_factor_levels)))) { @@ -524,7 +535,7 @@ , "median" = { imputation_method_description <- paste("Substitute missing value with", - "median peptide intensity across all sample classes\n" + "median peptide intensity across all sample classes.\n" ) quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) @@ -532,7 +543,7 @@ , "mean" = { imputation_method_description <- paste("Substitute missing value with", - "mean peptide intensity across all sample classes\n" + "mean peptide intensity across all sample classes.\n" ) quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] good_rows <- !is.na(rowMeans(quant_data_imp)) @@ -544,7 +555,7 @@ imputation_method_description <- paste("Substitute each missing value with random intensity", sprintf( - "random intensity $N \\sim (%0.2f, %0.2f)$\n", + "random intensity $N \\sim (%0.2f, %0.2f)$.\n", q1, m1 ) ) @@ -552,7 +563,6 @@ 100 * mean_percentile)) cat(sprintf("sd_percentile (from input parameter) is %0.2f\n\n", sd_percentile)) - #ACE cat(sprintf("sd for rnorm is %0.4f\n\n", m1)) quant_data_imp[ind] <- 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) good_rows <- !is.na(rowMeans(quant_data_imp)) @@ -684,11 +694,20 @@ write_debug_file(quant_data_imp_log10) red_dots <- quant_data_imp_log10 * x - ylim <- c( + count_red <- sum(!is.na(red_dots)) + count_blue <- sum(!is.na(blue_dots)) + ylim_save <- ylim <- c( min(red_dots, blue_dots, na.rm = TRUE), max(red_dots, blue_dots, na.rm = TRUE) ) - # ref: https://r-charts.com/distribution/add-points-boxplot/ + show_stripchart <- + 50 > (count_red + count_blue) / length(sample_name_matches) + if (show_stripchart) { + boxplot_sub <- "Light blue = data before imputation; Red = imputed data" + } else { + boxplot_sub <- "" + } + # Vertical plot colnames(blue_dots) <- sample_name_matches boxplot( @@ -697,32 +716,59 @@ , col = const_boxplot_fill , ylim = ylim , main = "Peptide intensities before and after imputation" - , sub = "Light blue = data before imputation; Red = imputed data" + , sub = boxplot_sub , xlab = "Sample" - , ylab = "log10(peptide intensity)" + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) - # Points - # NA values are not plotted - stripchart( - blue_dots, # Data - method = "jitter", # Random noise - jitter = const_stripchart_jitter, - pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced - col = "lightblue", # Color of the symbol - vertical = TRUE, # Vertical mode - add = TRUE # Add it over - ) - stripchart( - red_dots, # Data - method = "jitter", # Random noise - jitter = const_stripchart_jitter, - pch = 19, # Pch symbols - cex = const_stripsmall_cex, # Size of symbols reduced - col = "red", # Color of the symbol - vertical = TRUE, # Vertical mode - add = TRUE # Add it over - ) + + if (show_stripchart) { + # Points + # ref: https://r-charts.com/distribution/add-points-boxplot/ + # NA values are not plotted + stripchart( + blue_dots, # Data + method = "jitter", # Random noise + jitter = const_stripchart_jitter, + pch = 19, # Pch symbols + cex = const_stripsmall_cex, # Size of symbols reduced + col = "lightblue", # Color of the symbol + vertical = TRUE, # Vertical mode + add = TRUE # Add it over + ) + stripchart( + red_dots, # Data + method = "jitter", # Random noise + jitter = const_stripchart_jitter, + pch = 19, # Pch symbols + cex = const_stripsmall_cex, # Size of symbols reduced + col = "red", # Color of the symbol + vertical = TRUE, # Vertical mode + add = TRUE # Add it over + ) + + } else { + # violin plot + cat("\\leavevmode\n\\quad\n\n\\quad\n\n") + vioplot::vioplot( + x = lapply(blue_dots, function(x) x[!is.na(x)]), + col = "lightblue1", + side = "left", + plotCentre = "line", + ylim = ylim_save, + main = "Distributions of observed and imputed data", + sub = "Light blue = observed data; Pink = imputed data", + xlab = "Sample", + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + ) + vioplot::vioplot( + x = lapply(red_dots, function(x) x[!is.na(x)]), + col = "lightpink1", + side = "right", + plotCentre = "line", + add = T + ) + } + par(old_par) # density plot @@ -738,7 +784,7 @@ "Black = combined" ), main = "Density of peptide intensity before and after imputation", - xlab = "log10(peptide intensity)", + xlab = latex2exp::TeX("$log_{10}$(peptide intensity)"), ylab = "Probability density" ) lines(d_original, col = "blue") @@ -909,18 +955,9 @@ quant_data_log , las = 1 , col = const_boxplot_fill + , ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") + , xlab = "Sample" ) - # Points - stripchart( - quant_data_log, # Data - method = "jitter", # Random noise - jitter = const_stripchart_jitter, - pch = 19, # Pch symbols - cex = const_stripchart_cex, # Size of symbols reduced - col = "goldenrod", # Color of the symbol - vertical = TRUE, # Vertical mode - add = TRUE # Add it over - ) par(old_par) } else { cat("There are no peptides to plot\n") @@ -936,7 +973,8 @@ ggplot( quant_data_log_stack, aes(x = values) - ) + + ) + xlab(latex2exp::TeX("$log_{10}$(peptide intensity)")) + + ylab("Probability density") + geom_density( aes(group = ind, colour = ind), na.rm = TRUE @@ -949,7 +987,7 @@ cat("\\leavevmode\\newpage\n") ``` -## Perform ANOVA filters +## Perform ANOVA Filters ```{r, echo = FALSE} # Make new data frame containing only Phosphopeptides @@ -1125,24 +1163,13 @@ main = "Imputed, normalized intensities", # no line plot las = 1, col = const_boxplot_fill, - ylab = expression(log[10](intensity)) + ylab = latex2exp::TeX("$log_{10}$(peptide intensity)") ) - # Points - stripchart( - filtered_data_filtered, # Data - method = "jitter", # Random noise - jitter = const_stripchart_jitter, - pch = 19, # Pch symbols - cex = const_stripchart_cex, # Size of symbols reduced - col = "goldenrod", # Color of the symbol - vertical = TRUE, # Vertical mode - add = TRUE # Add it over - ) par(old_par) } else { cat(sprintf( "%s < %0.2f\n\n\n\n\n", - "No peptides were found to have cutoff adjusted p-value <", + "No peptides were found to have cutoff adjusted p-value", cutoff )) } @@ -1229,11 +1256,12 @@ ) } else { if (nrow(m) == 1) { + next + } else { cat( sprintf("Heatmap for %d usable peptides whose", nrow(m)), sprintf("adjusted p-value < %0.2f\n", cutoff) ) - next } } cat("\n\n\n") @@ -1263,9 +1291,3 @@ } cat("\\leavevmode\n\n\n") ``` - -<!-- -## Peptide IDs, etc. - -See output files. --->