diff mqppep_anova_script.Rmd @ 7:d728198f1ba5 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9a0fa6d0f9aadc069a5551a54da6daf307885637"
| author | eschen42 |
|---|---|
| date | Tue, 15 Mar 2022 00:35:16 +0000 |
| parents | c1403d18c189 |
| children | 4deacfee76ef |
--- a/mqppep_anova_script.Rmd Fri Mar 11 20:04:05 2022 +0000 +++ b/mqppep_anova_script.Rmd Tue Mar 15 00:35:16 2022 +0000 @@ -1,24 +1,32 @@ --- -title: "Quant Data Processing Script" +title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" author: "Larry Cheng; Art Eschenlauer" date: "May 28, 2018; Nov 16, 2021" output: - html_document: default pdf_document: default params: - inputFile: "Upstream_Map_pST_outputfile_STEP4.txt" - alphaFile: "alpha_levels.txt" + inputFile: "test-data/test_input_for_anova.tabular" + alphaFile: "test-data/alpha_levels.tabular" firstDataColumn: "Intensity" - imputationMethod: !r c("group-median","median","mean","random")[4] + imputationMethod: !r c("group-median", "median", "mean", "random")[1] meanPercentile: 1 sdPercentile: 0.2 regexSampleNames: "\\.(\\d+)[A-Z]$" regexSampleGrouping: "(\\d+)" imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" --- -```{r setup, include=FALSE} +```{r setup, include = FALSE} # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 -knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10)) +knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) + +### FUNCTIONS + +#ANOVA filter function +anova_func <- function(x, grouping_factor) { + x_aov <- aov(as.numeric(x) ~ grouping_factor) + pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] + pvalue +} ``` ## Purpose: @@ -28,156 +36,175 @@ ## Variables to change for each input file --> ```{r include = FALSE} -#Input Filename -inputFile <- params$inputFile +# Input Filename +input_file <- params$inputFile -#First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is. -firstDataColumn <- params$firstDataColumn -FDC_is_integer <- TRUE -firstDataColumn <- withCallingHandlers( - as.integer(firstDataColumn) - , warning = function(w) FDC_is_integer <<- FALSE +# First data column - ideally, this could be detected via regexSampleNames, +# but for now leave it as is. 
+first_data_column <- params$firstDataColumn +fdc_is_integer <- TRUE +first_data_column <- withCallingHandlers( + as.integer(first_data_column) + , warning = function(w) fdc_is_integer <<- FALSE ) -if (FALSE == FDC_is_integer) { - firstDataColumn <- params$firstDataColumn +if (FALSE == fdc_is_integer) { + first_data_column <- params$firstDataColumn } -#False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05) -valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1] +# False discovery rate adjustment for ANOVA +# Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 +val_fdr <- + read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] #Imputed Data filename -imputedDataFilename <- params$imputedDataFilename +imputed_data_filename <- params$imputedDataFilename #ANOVA data filename ``` -```{r include = FALSE} -#Imputation method, should be one of c("random","group-median","median","mean") -imputationMethod <- params$imputationMethod +```{r echo = FALSE} +# Imputation method, should be one of +# "random", "group-median", "median", or "mean" +imputation_method <- params$imputationMethod -#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation -meanPercentile <- params$meanPercentile / 100.0 +# Selection of percentile of logvalue data to set the mean for random number +# generation when using random imputation +mean_percentile <- params$meanPercentile / 100.0 -#deviation adjustment-factor for random values; real number. -sdPercentile <- params$sdPercentile +# deviation adjustment-factor for random values; real number. +sd_percentile <- params$sdPercentile + +# Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" +regex_sample_names <- params$regexSampleNames -#Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" -regexSampleNames <- params$regexSampleNames - -#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up) -# e.g., "(\\d+)" -regexSampleGrouping <- params$regexSampleGrouping +# Regular expression to extract Sample Grouping from Sample Name; +# if error occurs, compare sample_factor_levels and temp_matches +# to see if groupings/pairs line up +# e.g., "(\\d+)" +regex_sample_grouping <- params$regexSampleGrouping ``` - -```{r include = FALSE} -### FUNCTIONS - -#ANOVA filter function -anovaFunc <- function(x, groupingFactor) { - x.aov = aov(as.numeric(x) ~ groupingFactor) - pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1] - pvalue -} -``` - - - -### Checking that log-transformed sample distributions are similar: -```{r echo=FALSE} +```{r echo = FALSE} +### READ DATA library(data.table) # read.table reads a file in table format and creates a data frame from it. -# - note that `quote=""` means that quotation marks are treated literally. -fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE) -print(colnames(fullData)) -#head(fullData) +# - note that `quote = ""` means that quotation marks are treated literally. 
+full_data <- read.table( + file = input_file, + sep = "\t", + header = T, + quote = "", + check.names = FALSE + ) +``` + +### Column names from input file -if (FALSE == FDC_is_integer) { - dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE) - str(dataColumnIndices) - if (length(dataColumnIndices) > 0) { - firstDataColumn <- dataColumnIndices[1] +```{r echo = FALSE, results = 'markup'} +print(colnames(full_data)) +data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) +cat(sprintf("First data column: %d\n", min(data_column_indices))) +cat(sprintf("Last data column: %d\n", max(data_column_indices))) +``` + +```{r echo = FALSE, results = 'asis'} +cat("\\newpage\n") +``` + +### Checking that log-transformed sample distributions are similar: + +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} + +if (FALSE == fdc_is_integer) { + + if (length(data_column_indices) > 0) { + first_data_column <- data_column_indices[1] } else { - stop(paste("failed to convert firstDataColumn:", firstDataColumn)) + stop(paste("failed to convert firstDataColumn:", first_data_column)) } } - -quantData0 <- fullData[firstDataColumn:length(fullData)] -quantData <- fullData[firstDataColumn:length(fullData)] -quantData[quantData==0] <- NA #replace 0 with NA -quantDataLog <- log10(quantData) + +quant_data0 <- full_data[first_data_column:length(full_data)] +quant_data <- full_data[first_data_column:length(full_data)] +quant_data[quant_data == 0] <- NA #replace 0 with NA +quant_data_log <- log10(quant_data) -rownames(quantDataLog) <- fullData$Phosphopeptide - -summary(quantDataLog) +rownames(quant_data_log) <- full_data$Phosphopeptide -#data visualization +# data visualization old_par <- par( - mai=par("mai") + c(0.5,0,0,0) + mai = par("mai") + c(0.5, 0, 0, 0) ) boxplot( - quantDataLog -, las=2 + quant_data_log +, las = 2 ) par(old_par) -quantDataLog_stack <- stack(quantDataLog) + + +cat("\\newline\n") +cat("\\newline\n") + ``` -```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} +quant_data_log_stack <- stack(quant_data_log) library(ggplot2) -ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) +ggplot( + quant_data_log_stack, + aes(x = values)) + geom_density(aes(group = ind, colour = ind)) ``` ### Globally, are phosphopeptide intensities are approximately unimodal? -```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)} -# ref for bquote particularly and plotting math expressions generally: +<!-- +# ref for bquote below particularly and plotting math expressions generally: # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ +--> +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} -#identify the location of missing values -fin <- is.finite(as.numeric(as.matrix(quantDataLog))) +# identify the location of missing values +fin <- is.finite(as.numeric(as.matrix(quant_data_log))) -logvalues <- as.numeric(as.matrix(quantDataLog))[fin] +logvalues <- as.numeric(as.matrix(quant_data_log))[fin] plot( - density(logvalues) -, main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)) -, xlab = bquote(log[10](intensity)) -) + density(logvalues), + main = bquote( + "Smoothed estimated probability density vs." ~ log[10](intensity)), + xlab = bquote(log[10](intensity)) + ) hist( - x = as.numeric(as.matrix(quantDataLog)) + x = as.numeric(as.matrix(quant_data_log)) , breaks = 100 , main = bquote("Frequency vs." 
~ log[10](intensity)) , xlab = bquote(log[10](intensity)) ) ``` -<!-- -## Impute missing values ---> - ### Distribution of standard deviations of phosphopeptides, ignoring missing values: -```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} -#determine quantile -q1 <- quantile(logvalues, probs = meanPercentile)[1] +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} +# determine quantile +q1 <- quantile(logvalues, probs = mean_percentile)[1] -#determine standard deviation of quantile to impute +# determine standard deviation of quantile to impute sd_finite <- function(x) { ok <- is.finite(x) - sd(x[ok]) * sdPercentile + sd(x[ok]) * sd_percentile } -sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide) +# 1 = row of matrix (ie, phosphopeptide) +sds <- apply(quant_data_log, 1, sd_finite) plot( - density(sds, na.rm=T) -, main="Smoothed estimated probability density vs. std. deviation" -, sub="(probability estimation made with Gaussian smoothing)" + density(sds, na.rm = T) +, main = "Smoothed estimated probability density vs. std. deviation" +, sub = "(probability estimation made with Gaussian smoothing)" ) -m1 <- median(sds, na.rm=T) #sd to be used is the median sd +m1 <- median(sds, na.rm = T) #sd to be used is the median sd ``` @@ -186,102 +213,116 @@ <!-- The number of missing values are: --> -```{r echo=FALSE} +```{r echo = FALSE} #Determine number of cells to impute -temp <- quantData[is.na(quantData)] +temp <- quant_data[is.na(quant_data)] #Determine number of values to impute -NoToImpute <- length(temp) +number_to_impute <- length(temp) ``` <!-- % of values that are missing: --> -```{r echo=FALSE} -pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100 +```{r echo = FALSE} +pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100 ``` <!-- First few rows of data before imputation: --> -## Impute missing values +```{r echo = FALSE, results = 'asis'} +cat("\\newpage\n") +``` + +## Parse sample names + +Parse the names of the samples to deduce the factor level for each sample: + ```{r echo = FALSE} -#ACE start segment: trt-median based imputation # prep for trt-median based imputation -# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$" -# get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names -# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) -m <- regexpr(regexSampleNames, names(quantData), perl=TRUE) -tempMatches <- regmatches(names(quantData), m) +# Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" +# get factors -> +# group runs (samples) by ignoring terminal [A-Z] in sample names + +m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE) +temp_matches <- regmatches(names(quant_data), m) print("Extracted sample names") -print(tempMatches) -m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) -sampleNumbers <- as.factor(regmatches(tempMatches, m2)) +print(temp_matches) +m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) +sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) print("Factor levels") -print(sampleNumbers) +print(sample_factor_levels) ``` +## Impute missing values + ```{r echo = FALSE} -#ACE hack begin #Determine number of cells to impute -cat( - sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)" - , sum(rep.int(TRUE, nrow(quantData))) - , sum(is.na(quantData)) - , pct_missing_values - , "%" - ) +cat("Before imputation,", + sprintf( + "there are:\n %d 
peptides\n %d missing values (%2.0f%s)", + sum(rep.int(TRUE, nrow(quant_data))), + sum(is.na(quant_data)), + pct_missing_values, + "%" + ) ) -#ACE hack end ``` ```{r echo = FALSE} #Impute data -quantDataImputed <- quantData +quant_data_imp <- quant_data # Identify which values are missing and need to be imputed -ind <- which(is.na(quantDataImputed), arr.ind=TRUE) +ind <- which(is.na(quant_data_imp), arr.ind = TRUE) ``` ```{r echo = FALSE} # Apply imputation switch( - imputationMethod -, "group-median"={ - cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n") - #goodRows <- rep.int(TRUE, nrow(quantDataImputed)) - sampleLevelIntegers <- as.integer(sampleNumbers) - for (i in 1:length(levels(sampleNumbers))) { - levelCols <- i == sampleLevelIntegers - ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE) - quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]] + imputation_method +, "group-median" = { + cat("Imputation method:\n substitute missing value", + "with median peptide-intensity for sample-group\n") + + sample_level_integers <- as.integer(sample_factor_levels) + for (i in seq_len(length(levels(sample_factor_levels)))) { + level_cols <- i == sample_level_integers + ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) + quant_data_imp[ind, level_cols] <- + apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] } - goodRows <- !is.na(rowMeans(quantDataImputed)) + good_rows <- !is.na(rowMeans(quant_data_imp)) } -, "median"={ - cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n") - quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]] - goodRows <- !is.na(rowMeans(quantDataImputed)) +, "median" = { + cat("Imputation method:\n substitute missing value with", + "median peptide-intensity across all sample classes\n") + quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] + good_rows <- !is.na(rowMeans(quant_data_imp)) } -, "mean"={ - cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n") - quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]] - goodRows <- !is.na(rowMeans(quantDataImputed)) +, "mean" = { + cat("Imputation method:\n substitute missing value with", + "mean peptide-intensity across all sample classes\n") + quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] + good_rows <- !is.na(rowMeans(quant_data_imp)) } -, "random"={ +, "random" = { cat( + "Imputation method:\n substitute missing value with\n ", sprintf( - "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n" + "random intensity N ~ (%0.2f, %0.2f)\n" , q1, m1 ) ) - quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1) - goodRows <- !is.na(rowMeans(quantDataImputed)) + quant_data_imp[is.na(quant_data_imp)] <- + 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) + good_rows <- !is.na(rowMeans(quant_data_imp)) } ) @@ -289,13 +330,16 @@ ```{r echo = FALSE} #Determine number of cells to impute -temp <- quantDataImputed[is.na(quantDataImputed)] -cat( +temp <- quant_data_imp[is.na(quant_data_imp)] +cat("After imputation, there are:", sprintf( - "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis" - , sum(is.na(quantDataImputed[goodRows,])) - , sum(goodRows) - , 
sum(!goodRows) + "\n %d missing values\n %d usable peptides analysis" + , sum(is.na(quant_data_imp[good_rows, ])) + , sum(good_rows) + ), + sprintf( + "\n %d peptides with too many missing values for further analysis" + , sum(!good_rows) ) ) ``` @@ -303,22 +347,29 @@ # Zap rows where imputation was ineffective -fullData <- fullData [goodRows, ] -quantData <- quantData [goodRows, ] -quantDataImputed <- quantDataImputed[goodRows, ] +full_data <- full_data [good_rows, ] +quant_data <- quant_data [good_rows, ] +quant_data_imp <- quant_data_imp[good_rows, ] ``` ```{r echo = FALSE} -d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed))))) -d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))) +d_combined <- (density(as.numeric(as.matrix( + log10(quant_data_imp) +)))) +d_original <- + density(as.numeric(as.matrix( + log10(quant_data_imp[!is.na(quant_data)])))) ``` ```{r echo = FALSE} -if (sum(is.na(quantData)) > 0) { +if (sum(is.na(quant_data)) > 0) { # There ARE missing values - d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)]))))) + d_imputed <- + (density(as.numeric(as.matrix( + log10(quant_data_imp[is.na(quant_data)]) + )))) } else { # There are NO missing values d_imputed <- d_combined @@ -326,29 +377,28 @@ ``` -<!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} --> -```{r echo = FALSE, fig.dim=c(9,5)} +```{r echo = FALSE, fig.dim = c(9, 5)} ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) plot( - d_combined -, ylim = ylim -, sub = "Blue = data before imputation; Red = imputed data" -, main = "Density vs. log10(intensity) before and after imputation" + d_combined, + ylim = ylim, + sub = "Blue = data before imputation; Red = imputed data", + main = "Density vs. log10(intensity) before and after imputation" ) -lines(d_original, col="blue") -lines(d_imputed, col="red") +lines(d_original, col = "blue") +lines(d_imputed, col = "red") ``` ## Perform Quantile Normalization -```{r echo=FALSE} -library(preprocessCore) + +<!-- # Apply quantile normalization using preprocessCore::normalize.quantiles # --- # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html # except this: https://support.bioconductor.org/p/122925/#9135989 # says to install it like this: # ``` -# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1]) +# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) # ``` # conda installation (necessary because of a bug in recent openblas): # conda install bioconductor-preprocesscore openblas=0.3.3 @@ -360,7 +410,7 @@ # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. # # Usage: -# normalize.quantiles(x,copy=TRUE, keep.names=FALSE) +# normalize.quantiles(x, copy = TRUE, keep.names = FALSE) # # Arguments: # @@ -397,261 +447,355 @@ # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 # http://bmbolstad.com/misc/normalize/normalize.html # ... 
+--> +```{r echo = FALSE} +library(preprocessCore) if (TRUE) { - quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) + quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) } else { - quantDataImputed.qn <- as.matrix(quantDataImputed) + quant_data_imp_qn <- as.matrix(quant_data_imp) } -quantDataImputed.qn = as.data.frame(quantDataImputed.qn) -names(quantDataImputed.qn) = names(quantDataImputed) -quantDataImputed_QN_log <- log10(quantDataImputed.qn) +quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) +names(quant_data_imp_qn) <- names(quant_data_imp) +quant_data_imp_qn_log <- log10(quant_data_imp_qn) -rownames(quantDataImputed_QN_log) <- fullData[,1] +rownames(quant_data_imp_qn_log) <- full_data[, 1] -quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn)))) -anyNaN <- function (x) { +quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) +any_nan <- function(x) { !any(x == "NaN") } -sel = apply(quantDataImputed.qn.LS, 1, anyNaN) -quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),] -quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2) +sel <- apply(quant_data_imp_qn_ls, 1, any_nan) +quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] +quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) #output quantile normalized data -dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log) -write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE) +data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) +write.table( + data_table_imp_qn_lt, + file = paste(paste( + strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_" + ), ".txt", sep = ""), + sep = "\t", + col.names = TRUE, + row.names = FALSE +) ``` <!-- ACE insertion begin --> ### Checking that normalized, imputed, log-transformed sample distributions are similar: -```{r echo=FALSE} -#library(data.table) +```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} + -#Save unimputed quantDataLog for plotting below -unimputedQuantDataLog <- quantDataLog +# Save unimputed quant_data_log for plotting below +unimputed_quant_data_log <- quant_data_log -#Log10 transform (after preparing for zero values, which should never happen...) -quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001 -quantDataLog <- log10(quantDataImputed.qn) +# log10 transform (after preparing for zero values, +# which should never happen...) 
+quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 +quant_data_log <- log10(quant_data_imp_qn) -summary(quantDataLog) +# Output quantile-normalized log-transformed dataset +# with imputed, normalized data -#Output quantile-normalized log-transformed dataset with imputed, normalized data - -dataTableImputed <- cbind(fullData[1:9], quantDataLog) +data_table_imputed <- cbind(full_data[1:9], quant_data_log) write.table( - dataTableImputed - , file=imputedDataFilename - , sep="\t" - , col.names=TRUE - , row.names=FALSE - , quote=FALSE + data_table_imputed + , file = imputed_data_filename + , sep = "\t" + , col.names = TRUE + , row.names = FALSE + , quote = FALSE ) -#data visualization +# data visualization old_par <- par( - mai=par("mai") + c(0.5,0,0,0) -, oma=par("oma") + c(0.5,0,0,0) + mai = par("mai") + c(0.5, 0, 0, 0) +, oma = par("oma") + c(0.5, 0, 0, 0) ) boxplot( - quantDataLog -, las=2 + quant_data_log +, las = 2 ) par(old_par) + + + +cat("\\newline\n") +cat("\\newline\n") + ``` -```{r echo=FALSE, fig.dim=c(9,5)} -quantDataLog_stack <- stack(quantDataLog) -ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) +```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} +quant_data_log_stack <- stack(quant_data_log) +ggplot( + quant_data_log_stack, + aes(x = values) + ) + geom_density(aes(group = ind, colour = ind)) ``` ## Perform ANOVA filters -```{r,echo=FALSE} -#Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df) +(see following pages) + +```{r, echo = FALSE} +# Make new data frame containing only Phosphopeptides +# to connect preANOVA to ANOVA (connect_df) connect_df <- data.frame( - dataTableImputed_QN_LT$Phosphopeptide - , dataTableImputed_QN_LT[,firstDataColumn] + data_table_imp_qn_lt$Phosphopeptide + , data_table_imp_qn_lt[, first_data_column] ) -colnames(connect_df) <- c("Phosphopeptide","Intensity") +colnames(connect_df) <- c("Phosphopeptide", "Intensity") ``` -```{r echo=FALSE, fig.dim=c(9,10)} -# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits -# For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. -m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE) -#ACE str(m) -tempMatches <- regmatches(names(quantDataImputed_QN_log), m) -#ACE str(tempMatches) -numSamples <- length(tempMatches) -#ACE str(numSamples) -m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) -#ACE str(m2) -#ACE str(regmatches(tempMatches, m2)) -sampleNumbers <- as.factor(regmatches(tempMatches, m2)) -#ACE str(sampleNumbers) +```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} +# Get factors -> group replicates (as indicated by terminal letter) +# by the preceding digits; +# e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. +m <- + regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE) + +temp_matches <- regmatches(names(quant_data_imp_qn_log), m) + +number_of_samples <- length(temp_matches) -if (length(levels(sampleNumbers))<2) { - cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n") +m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) + + +sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) + + +if (length(levels(sample_factor_levels)) < 2) { + cat( + "ERROR!!!! 
Cannot perform ANOVA analysis", + "because it requires two or more factor levels\n" + ) cat("Unparsed sample names are:\n") - print(names(quantDataImputed_QN_log)) - cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames)) + print(names(quant_data_imp_qn_log)) + cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names)) cat("Parsed names are:\n") - print(tempMatches) - cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping)) + print(temp_matches) + cat(sprintf( + "Parsing rule for SampleGrouping is '%s'\n", + regex_sample_grouping + )) cat("Sample group assignments are:\n") - print(regmatches(tempMatches, m2)) + print(regmatches(temp_matches, m2)) } else { - pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers) + p_value_data_anova_ps <- + apply( + quant_data_imp_qn_log, + 1, + anova_func, + grouping_factor = sample_factor_levels + ) - pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr") - pValueData <- data.frame( - phosphopeptide = fullData[,1] - , rawANOVAp = pValueData.anovaPs - , FDRadjustedANOVAp = pValueData.anovaPs.FDR + p_value_data_anova_ps_fdr <- + p.adjust(p_value_data_anova_ps, method = "fdr") + p_value_data <- data.frame( + phosphopeptide = full_data[, 1] + , + raw_anova_p = p_value_data_anova_ps + , + fdr_adjusted_anova_p = p_value_data_anova_ps_fdr ) - #ACE rownames(pValueData) <- fullData[,1] - # output ANOVA file to constructed filename, + + # output ANOVA file to constructed filename, # e.g. "Outputfile_pST_ANOVA_STEP5.txt" # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" - #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values + # Re-output quantile-normalized log-transformed dataset + # with imputed, normalized data to include p-values - dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog) + data_table_imputed <- + cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log) write.table( - dataTableImputed - , file=imputedDataFilename - , sep="\t" - , col.names=TRUE - , row.names=FALSE - , quote=FALSE + data_table_imputed, + file = imputed_data_filename, + sep = "\t", + col.names = TRUE, + row.names = FALSE, + quote = FALSE ) - pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),] + p_value_data <- + p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] + + cutoff <- val_fdr[1] + for (cutoff in val_fdr) { + #loop through FDR cutoffs - cutoff <- valFDR[1] - for (cutoff in valFDR){ #loop through FDR cutoffs - - filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE] - filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE] - filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE] + filtered_p <- + p_value_data[ + which(p_value_data$fdr_adjusted_anova_p < cutoff), + , + drop = FALSE + ] + filtered_data_filtered <- + quant_data_imp_qn_log[ + rownames(filtered_p), + , + drop = FALSE + ] + filtered_data_filtered <- + filtered_data_filtered[ + order(filtered_p$fdr_adjusted_anova_p), + , + drop = FALSE + ] # <!-- ACE insertion start --> old_oma <- par("oma") old_par <- par( - mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1) - , oma=old_oma * c(1,1,0.3,1) - , cex.main=0.9 - , cex.axis=0.7 - ) - - if (nrow(filteredData.filtered) > 0) { + mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), + oma = old_oma * c(1, 1, 0.3, 1), + cex.main = 0.9, + cex.axis = 0.7 + ) + + cat("\\newpage\n") + 
if (nrow(filtered_data_filtered) > 0) { + cat(sprintf( + "Intensities for peptides whose adjusted p-value < %0.2f\n", + cutoff + )) + cat("\\newline\n") + cat("\\newline\n") + boxplot( - filteredData.filtered - , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff) - # no line plot , main = "" - , las = 2 - # , ylim = c(5.5,10) - , ylab = expression(log[10](intensity)) + filtered_data_filtered, + main = "Imputed, normalized intensities", # no line plot + las = 2, + ylab = expression(log[10](intensity)) ) } else { - cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff)) + cat(sprintf( + "No peptides were found to have cutoff adjusted p-value < %0.2f\n", + cutoff + )) } par(old_par) - - #Add Phosphopeptide column to ANOVA filtered table - ANOVA.filtered_merge <- merge( + + if (nrow(filtered_data_filtered) > 0) { + #Add Phosphopeptide column to anova_filtered table + anova_filtered_merge <- merge( x = connect_df - , y = filteredData.filtered - , by.x="Intensity" - , by.y=1 + , + y = filtered_data_filtered + , + by.x = "Intensity" + , + by.y = 1 ) - ANOVA.filtered_merge.order <- rownames(filtered_p) - - ANOVA.filtered_merge.format <- sapply( - X = filtered_p$FDRadjustedANOVAp - , FUN = function(x) { - if (x > 0.0001) - paste0("(%0.",1+ceiling(-log10(x)),"f) %s") - else - paste0("(%0.4e) %s") + anova_filtered_merge_order <- rownames(filtered_p) + + anova_filtered_merge_format <- sapply( + X = filtered_p$fdr_adjusted_anova_p + , + FUN = function(x) { + if (x > 0.0001) + paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") + else + paste0("(%0.4e) %s") } - ) - - #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s") - - ANOVA.filtered <- data.table( - ANOVA.filtered_merge$Phosphopeptide - , ANOVA.filtered_merge$Intensity - , ANOVA.filtered_merge[, 2:numSamples+1] - ) - colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered)) - - # merge qualitative columns into the ANOVA data - output_table <- data.frame(ANOVA.filtered$Phosphopeptide) - output_table <- merge( - x = output_table - , y = dataTableImputed_QN_LT - , by.x = "ANOVA.filtered.Phosphopeptide" - , by.y="Phosphopeptide" ) - #Produce heatmap to visualize significance and the effect of imputation - m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,]) - if (nrow(m) > 0) { - rownames_m <- rownames(m) - rownames(m) <- sapply( - X = 1:nrow(m) - , FUN = function(i) { + + + anova_filtered <- data.table( + anova_filtered_merge$Phosphopeptide + , + anova_filtered_merge$Intensity + , + anova_filtered_merge[, 2:number_of_samples + 1] + ) + colnames(anova_filtered) <- + c("Phosphopeptide", colnames(filtered_data_filtered)) + + # merge qualitative columns into the ANOVA data + output_table <- data.frame(anova_filtered$Phosphopeptide) + output_table <- merge( + x = output_table + , + y = data_table_imp_qn_lt + , + by.x = "anova_filtered.Phosphopeptide" + , + by.y = "Phosphopeptide" + ) + + #Produce heatmap to visualize significance and the effect of imputation + m <- + as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) + if (nrow(m) > 0) { + rownames_m <- rownames(m) + rownames(m) <- sapply( + X = seq_len(nrow(m)) + , + FUN = function(i) { sprintf( - ANOVA.filtered_merge.format[i] - , filtered_p$FDRadjustedANOVAp[i] - , rownames_m[i] + anova_filtered_merge_format[i] + , + filtered_p$fdr_adjusted_anova_p[i] + , + rownames_m[i] ) } ) - margins <- c( - max(nchar(colnames(m))) * 10 / 16 # col - , 
max(nchar(rownames(m))) * 5 / 16 # row - ) - how_many_peptides <- min(50, nrow(m)) + margins <- c(max(nchar(colnames(m))) * 10 / 16 # col + , max(nchar(rownames(m))) * 5 / 16 # row + ) + how_many_peptides <- min(50, nrow(m)) - op <- par("cex.main") - try( - if (nrow(m) > 1) { - par(cex.main=0.6) - heatmap( - m[how_many_peptides:1,] - , Rowv = NA - , Colv = NA - , cexRow = 0.7 - , cexCol = 0.8 - , scale="row" - , margins = margins - , main = "Heatmap of unimputed, unnormalized intensities" - , xlab = "" - # , main = bquote( - # .( how_many_peptides ) - # ~ " peptides with adjusted p-value <" - # ~ .(sprintf("%0.2f", cutoff)) - # ) - ) - } - ) - #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim) - par(op) + cat("\\newpage\n") + if (nrow(m) > 50) { + cat("Heatmap for the 50 most-significant peptides", + sprintf( + "whose adjusted p-value < %0.2f\n", + cutoff) + ) + } else { + cat("Heatmap for peptides whose", + sprintf("adjusted p-value < %0.2f\n", + cutoff) + ) + } + cat("\\newline\n") + cat("\\newline\n") + op <- par("cex.main") + try( + if (nrow(m) > 1) { + par(cex.main = 0.6) + heatmap( + m[how_many_peptides:1, ], + Rowv = NA, + Colv = NA, + cexRow = 0.7, + cexCol = 0.8, + scale = "row", + margins = margins, + main = + "Heatmap of unimputed, unnormalized intensities", + xlab = "" + ) + } + ) + par(op) + } } - } } ``` +<!-- ## Peptide IDs, etc. See output files. +-->
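The script in this changeset derives the sample grouping from column names with `regexpr`/`regmatches`, driven by the parameters `regexSampleNames` (default `"\\.(\\d+)[A-Z]$"`) and `regexSampleGrouping` (default `"(\\d+)"`). A minimal sketch of that parsing step follows; the column names are hypothetical, not data from the pipeline.

```r
# Sketch of the sample-grouping step, using made-up column names.
sample_names <- c("Intensity.1A", "Intensity.1B", "Intensity.2A", "Intensity.2B")

regex_sample_names    <- "\\.(\\d+)[A-Z]$"  # matches ".1A", ".2B", ...
regex_sample_grouping <- "(\\d+)"           # the digits identify the group

m                    <- regexpr(regex_sample_names, sample_names, perl = TRUE)
temp_matches         <- regmatches(sample_names, m)   # ".1A" ".1B" ".2A" ".2B"
m2                   <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE)
sample_factor_levels <- as.factor(regmatches(temp_matches, m2))

print(sample_factor_levels)  # factor with levels "1" and "2"
```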
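The `group-median` imputation branch replaces each missing intensity with the median of the same peptide within that sample group; rows that still contain `NA` afterwards are dropped by the `good_rows` filter. A toy sketch of that logic, assuming a hypothetical 2-peptide, 3-sample matrix and an assumed grouping of the columns:

```r
# Toy sketch of group-median imputation (hypothetical data, not the pipeline's).
quant <- matrix(c(10, NA, 12,
                  20, 21, NA),
                nrow = 2, byrow = TRUE,
                dimnames = list(c("pep1", "pep2"), c("s.1A", "s.1B", "s.2A")))
groups <- factor(c("1", "1", "2"))  # assumed group of each sample column

for (g in levels(groups)) {
  cols <- which(groups == g)
  # median of each peptide within this group, ignoring missing values
  row_medians <- apply(quant[, cols, drop = FALSE], 1, median, na.rm = TRUE)
  for (j in cols) {
    missing <- is.na(quant[, j])
    quant[missing, j] <- row_medians[missing]
  }
}
quant  # pep2 stays NA in group 2 (no observed value) and would be dropped downstream
```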
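Downstream, the script quantile-normalizes the imputed intensities with `preprocessCore::normalize.quantiles`, runs a per-peptide one-way ANOVA against the sample grouping, and adjusts the p-values with `p.adjust(..., method = "fdr")` before applying the alpha cutoffs. A self-contained sketch of that chain on random toy data; it assumes the Bioconductor package `preprocessCore` is installed (for example via the `BiocManager::install` call quoted in the diff).

```r
# Sketch: quantile normalization, per-peptide ANOVA, FDR adjustment (toy data).
library(preprocessCore)

set.seed(42)
toy    <- matrix(10 ^ rnorm(60, mean = 6), nrow = 10)  # 10 peptides x 6 samples
groups <- factor(rep(c("1", "2"), each = 3))           # assumed two sample groups

qn     <- normalize.quantiles(toy)  # force all columns onto a common distribution
log_qn <- log10(qn)

anova_p <- apply(log_qn, 1, function(x) {
  summary(aov(as.numeric(x) ~ groups))[[1]][["Pr(>F)"]][1]
})
fdr_p <- p.adjust(anova_p, method = "fdr")

sum(fdr_p < 0.05)  # peptides passing the cutoff (likely 0 for random toy data)
```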