Mercurial > repos > eschen42 > mqppep_anova
diff mqppep_anova_script.Rmd @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:05:01 +0000 |
parents | |
children | d728198f1ba5 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mqppep_anova_script.Rmd Mon Mar 07 19:05:01 2022 +0000 @@ -0,0 +1,657 @@ +--- +title: "Quant Data Processing Script" +author: "Larry Cheng; Art Eschenlauer" +date: "May 28, 2018; Nov 16, 2021" +output: + html_document: default + pdf_document: default +params: + inputFile: "Upstream_Map_pST_outputfile_STEP4.txt" + alphaFile: "alpha_levels.txt" + firstDataColumn: "Intensity" + imputationMethod: !r c("group-median","median","mean","random")[4] + meanPercentile: 1 + sdPercentile: 0.2 + regexSampleNames: "\\.(\\d+)[A-Z]$" + regexSampleGrouping: "(\\d+)" + imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" +--- +```{r setup, include=FALSE} +# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 +knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10)) +``` + +## Purpose: +Perform imputation of missing values, quantile normalization, and ANOVA. + +<!-- +## Variables to change for each input file +--> +```{r include = FALSE} +#Input Filename +inputFile <- params$inputFile + +#First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is. +firstDataColumn <- params$firstDataColumn +FDC_is_integer <- TRUE +firstDataColumn <- withCallingHandlers( + as.integer(firstDataColumn) + , warning = function(w) FDC_is_integer <<- FALSE + ) +if (FALSE == FDC_is_integer) { + firstDataColumn <- params$firstDataColumn +} + +#False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05) +valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1] + +#Imputed Data filename +imputedDataFilename <- params$imputedDataFilename + +#ANOVA data filename +``` + +```{r include = FALSE} +#Imputation method, should be one of c("random","group-median","median","mean") +imputationMethod <- params$imputationMethod + +#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation +meanPercentile <- params$meanPercentile / 100.0 + +#deviation adjustment-factor for random values; real number. +sdPercentile <- params$sdPercentile + +#Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" +regexSampleNames <- params$regexSampleNames + +#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up) +# e.g., "(\\d+)" +regexSampleGrouping <- params$regexSampleGrouping + +``` + + +```{r include = FALSE} +### FUNCTIONS + +#ANOVA filter function +anovaFunc <- function(x, groupingFactor) { + x.aov = aov(as.numeric(x) ~ groupingFactor) + pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1] + pvalue +} +``` + + + +### Checking that log-transformed sample distributions are similar: +```{r echo=FALSE} + +library(data.table) + +# read.table reads a file in table format and creates a data frame from it. +# - note that `quote=""` means that quotation marks are treated literally. +fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE) +print(colnames(fullData)) +#head(fullData) + +if (FALSE == FDC_is_integer) { + dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE) + str(dataColumnIndices) + if (length(dataColumnIndices) > 0) { + firstDataColumn <- dataColumnIndices[1] + } else { + stop(paste("failed to convert firstDataColumn:", firstDataColumn)) + } +} + +quantData0 <- fullData[firstDataColumn:length(fullData)] +quantData <- fullData[firstDataColumn:length(fullData)] +quantData[quantData==0] <- NA #replace 0 with NA +quantDataLog <- log10(quantData) + +rownames(quantDataLog) <- fullData$Phosphopeptide + +summary(quantDataLog) + +#data visualization +old_par <- par( + mai=par("mai") + c(0.5,0,0,0) +) +boxplot( + quantDataLog +, las=2 +) +par(old_par) + +quantDataLog_stack <- stack(quantDataLog) +``` + +```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} +library(ggplot2) +ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) +``` + +### Globally, are phosphopeptide intensities are approximately unimodal? +```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)} + +# ref for bquote particularly and plotting math expressions generally: +# https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ + +#identify the location of missing values +fin <- is.finite(as.numeric(as.matrix(quantDataLog))) + +logvalues <- as.numeric(as.matrix(quantDataLog))[fin] +plot( + density(logvalues) +, main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)) +, xlab = bquote(log[10](intensity)) +) +hist( + x = as.numeric(as.matrix(quantDataLog)) +, breaks = 100 +, main = bquote("Frequency vs." ~ log[10](intensity)) +, xlab = bquote(log[10](intensity)) +) +``` + +<!-- +## Impute missing values +--> + +### Distribution of standard deviations of phosphopeptides, ignoring missing values: + +```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} +#determine quantile +q1 <- quantile(logvalues, probs = meanPercentile)[1] + +#determine standard deviation of quantile to impute +sd_finite <- function(x) { + ok <- is.finite(x) + sd(x[ok]) * sdPercentile +} +sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide) +plot( + density(sds, na.rm=T) +, main="Smoothed estimated probability density vs. std. deviation" +, sub="(probability estimation made with Gaussian smoothing)" +) + +m1 <- median(sds, na.rm=T) #sd to be used is the median sd + +``` + + + +<!-- +The number of missing values are: +--> +```{r echo=FALSE} +#Determine number of cells to impute +temp <- quantData[is.na(quantData)] + +#Determine number of values to impute +NoToImpute <- length(temp) +``` + +<!-- +% of values that are missing: +--> +```{r echo=FALSE} +pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100 +``` + +<!-- +First few rows of data before imputation: +--> +## Impute missing values +```{r echo = FALSE} + +#ACE start segment: trt-median based imputation +# prep for trt-median based imputation + +# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$" +# get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names +# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) +m <- regexpr(regexSampleNames, names(quantData), perl=TRUE) +tempMatches <- regmatches(names(quantData), m) +print("Extracted sample names") +print(tempMatches) +m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) +sampleNumbers <- as.factor(regmatches(tempMatches, m2)) +print("Factor levels") +print(sampleNumbers) + +``` +```{r echo = FALSE} + +#ACE hack begin +#Determine number of cells to impute +cat( + sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)" + , sum(rep.int(TRUE, nrow(quantData))) + , sum(is.na(quantData)) + , pct_missing_values + , "%" + ) +) +#ACE hack end + +``` +```{r echo = FALSE} + +#Impute data +quantDataImputed <- quantData + +# Identify which values are missing and need to be imputed +ind <- which(is.na(quantDataImputed), arr.ind=TRUE) + +``` +```{r echo = FALSE} + +# Apply imputation +switch( + imputationMethod +, "group-median"={ + cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n") + #goodRows <- rep.int(TRUE, nrow(quantDataImputed)) + sampleLevelIntegers <- as.integer(sampleNumbers) + for (i in 1:length(levels(sampleNumbers))) { + levelCols <- i == sampleLevelIntegers + ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE) + quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]] + } + goodRows <- !is.na(rowMeans(quantDataImputed)) + } +, "median"={ + cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n") + quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]] + goodRows <- !is.na(rowMeans(quantDataImputed)) + } +, "mean"={ + cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n") + quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]] + goodRows <- !is.na(rowMeans(quantDataImputed)) + } +, "random"={ + cat( + sprintf( + "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n" + , q1, m1 + ) + ) + quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1) + goodRows <- !is.na(rowMeans(quantDataImputed)) + } +) + +``` +```{r echo = FALSE} + +#Determine number of cells to impute +temp <- quantDataImputed[is.na(quantDataImputed)] +cat( + sprintf( + "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis" + , sum(is.na(quantDataImputed[goodRows,])) + , sum(goodRows) + , sum(!goodRows) + ) +) +``` +```{r echo = FALSE} + + +# Zap rows where imputation was ineffective +fullData <- fullData [goodRows, ] +quantData <- quantData [goodRows, ] +quantDataImputed <- quantDataImputed[goodRows, ] + +``` +```{r echo = FALSE} + +d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed))))) +d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))) + +``` +```{r echo = FALSE} + +if (sum(is.na(quantData)) > 0) { + # There ARE missing values + d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)]))))) +} else { + # There are NO missing values + d_imputed <- d_combined +} + +``` + +<!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} --> +```{r echo = FALSE, fig.dim=c(9,5)} +ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) +plot( + d_combined +, ylim = ylim +, sub = "Blue = data before imputation; Red = imputed data" +, main = "Density vs. log10(intensity) before and after imputation" +) +lines(d_original, col="blue") +lines(d_imputed, col="red") +``` + +## Perform Quantile Normalization +```{r echo=FALSE} +library(preprocessCore) +# Apply quantile normalization using preprocessCore::normalize.quantiles +# --- +# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html +# except this: https://support.bioconductor.org/p/122925/#9135989 +# says to install it like this: +# ``` +# BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1]) +# ``` +# conda installation (necessary because of a bug in recent openblas): +# conda install bioconductor-preprocesscore openblas=0.3.3 +# ... +# --- +# normalize.quantiles {preprocessCore} -- Quantile Normalization +# +# Description: +# Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. +# +# Usage: +# normalize.quantiles(x,copy=TRUE, keep.names=FALSE) +# +# Arguments: +# +# - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. +# +# - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, +# but in certain situations not making a copy of the matrix, but instead normalizing +# it in place will be more memory friendly. +# +# - keep.names: Boolean option to preserve matrix row and column names in output. +# +# Details: +# This method is based upon the concept of a quantile-quantile plot extended to n dimensions. +# No special allowances are made for outliers. If you make use of quantile normalization +# please cite Bolstad et al, Bioinformatics (2003). +# +# This functions will handle missing data (ie NA values), based on +# the assumption that the data is missing at random. +# +# Note that the current implementation optimizes for better memory usage +# at the cost of some additional run-time. +# +# Value: A normalized matrix. +# +# Author: Ben Bolstad, bmbolstad.com +# +# References +# +# - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide +# Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf +# +# - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of +# Normalization Methods for High Density Oligonucleotide Array Data Based on Bias +# and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 +# http://bmbolstad.com/misc/normalize/normalize.html +# ... + +if (TRUE) { + quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) +} else { + quantDataImputed.qn <- as.matrix(quantDataImputed) +} + +quantDataImputed.qn = as.data.frame(quantDataImputed.qn) +names(quantDataImputed.qn) = names(quantDataImputed) +quantDataImputed_QN_log <- log10(quantDataImputed.qn) + +rownames(quantDataImputed_QN_log) <- fullData[,1] + +quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn)))) +anyNaN <- function (x) { + !any(x == "NaN") +} +sel = apply(quantDataImputed.qn.LS, 1, anyNaN) +quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),] +quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2) + +#output quantile normalized data +dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log) +write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE) + +``` + +<!-- ACE insertion begin --> +### Checking that normalized, imputed, log-transformed sample distributions are similar: + +```{r echo=FALSE} +#library(data.table) + +#Save unimputed quantDataLog for plotting below +unimputedQuantDataLog <- quantDataLog + +#Log10 transform (after preparing for zero values, which should never happen...) +quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001 +quantDataLog <- log10(quantDataImputed.qn) + +summary(quantDataLog) + +#Output quantile-normalized log-transformed dataset with imputed, normalized data + +dataTableImputed <- cbind(fullData[1:9], quantDataLog) +write.table( + dataTableImputed + , file=imputedDataFilename + , sep="\t" + , col.names=TRUE + , row.names=FALSE + , quote=FALSE + ) + + + +#data visualization +old_par <- par( + mai=par("mai") + c(0.5,0,0,0) +, oma=par("oma") + c(0.5,0,0,0) +) +boxplot( + quantDataLog +, las=2 +) +par(old_par) +``` + +```{r echo=FALSE, fig.dim=c(9,5)} +quantDataLog_stack <- stack(quantDataLog) +ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) +``` + +## Perform ANOVA filters + +```{r,echo=FALSE} +#Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df) +connect_df <- data.frame( + dataTableImputed_QN_LT$Phosphopeptide + , dataTableImputed_QN_LT[,firstDataColumn] + ) +colnames(connect_df) <- c("Phosphopeptide","Intensity") +``` + +```{r echo=FALSE, fig.dim=c(9,10)} +# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits +# For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. +m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE) +#ACE str(m) +tempMatches <- regmatches(names(quantDataImputed_QN_log), m) +#ACE str(tempMatches) +numSamples <- length(tempMatches) +#ACE str(numSamples) +m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) +#ACE str(m2) +#ACE str(regmatches(tempMatches, m2)) +sampleNumbers <- as.factor(regmatches(tempMatches, m2)) +#ACE str(sampleNumbers) + +if (length(levels(sampleNumbers))<2) { + cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n") + cat("Unparsed sample names are:\n") + print(names(quantDataImputed_QN_log)) + cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames)) + cat("Parsed names are:\n") + print(tempMatches) + cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping)) + cat("Sample group assignments are:\n") + print(regmatches(tempMatches, m2)) +} else { + pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers) + + pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr") + pValueData <- data.frame( + phosphopeptide = fullData[,1] + , rawANOVAp = pValueData.anovaPs + , FDRadjustedANOVAp = pValueData.anovaPs.FDR + ) + #ACE rownames(pValueData) <- fullData[,1] + # output ANOVA file to constructed filename, + # e.g. "Outputfile_pST_ANOVA_STEP5.txt" + # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" + + #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values + + dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog) + write.table( + dataTableImputed + , file=imputedDataFilename + , sep="\t" + , col.names=TRUE + , row.names=FALSE + , quote=FALSE + ) + + + pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),] + + cutoff <- valFDR[1] + for (cutoff in valFDR){ #loop through FDR cutoffs + + filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE] + filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE] + filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE] + + # <!-- ACE insertion start --> + old_oma <- par("oma") + old_par <- par( + mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1) + , oma=old_oma * c(1,1,0.3,1) + , cex.main=0.9 + , cex.axis=0.7 + ) + + if (nrow(filteredData.filtered) > 0) { + boxplot( + filteredData.filtered + , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff) + # no line plot , main = "" + , las = 2 + # , ylim = c(5.5,10) + , ylab = expression(log[10](intensity)) + ) + } else { + cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff)) + } + par(old_par) + + #Add Phosphopeptide column to ANOVA filtered table + ANOVA.filtered_merge <- merge( + x = connect_df + , y = filteredData.filtered + , by.x="Intensity" + , by.y=1 + ) + ANOVA.filtered_merge.order <- rownames(filtered_p) + + ANOVA.filtered_merge.format <- sapply( + X = filtered_p$FDRadjustedANOVAp + , FUN = function(x) { + if (x > 0.0001) + paste0("(%0.",1+ceiling(-log10(x)),"f) %s") + else + paste0("(%0.4e) %s") + } + ) + + #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s") + + ANOVA.filtered <- data.table( + ANOVA.filtered_merge$Phosphopeptide + , ANOVA.filtered_merge$Intensity + , ANOVA.filtered_merge[, 2:numSamples+1] + ) + colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered)) + + # merge qualitative columns into the ANOVA data + output_table <- data.frame(ANOVA.filtered$Phosphopeptide) + output_table <- merge( + x = output_table + , y = dataTableImputed_QN_LT + , by.x = "ANOVA.filtered.Phosphopeptide" + , by.y="Phosphopeptide" + ) + + #Produce heatmap to visualize significance and the effect of imputation + m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,]) + if (nrow(m) > 0) { + rownames_m <- rownames(m) + rownames(m) <- sapply( + X = 1:nrow(m) + , FUN = function(i) { + sprintf( + ANOVA.filtered_merge.format[i] + , filtered_p$FDRadjustedANOVAp[i] + , rownames_m[i] + ) + } + ) + margins <- c( + max(nchar(colnames(m))) * 10 / 16 # col + , max(nchar(rownames(m))) * 5 / 16 # row + ) + how_many_peptides <- min(50, nrow(m)) + + op <- par("cex.main") + try( + if (nrow(m) > 1) { + par(cex.main=0.6) + heatmap( + m[how_many_peptides:1,] + , Rowv = NA + , Colv = NA + , cexRow = 0.7 + , cexCol = 0.8 + , scale="row" + , margins = margins + , main = "Heatmap of unimputed, unnormalized intensities" + , xlab = "" + # , main = bquote( + # .( how_many_peptides ) + # ~ " peptides with adjusted p-value <" + # ~ .(sprintf("%0.2f", cutoff)) + # ) + ) + } + ) + #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim) + par(op) + } + + } +} +``` + +## Peptide IDs, etc. + +See output files.