mqppep_anova: mqppep_anova_script.Rmd comparison

comparison mqppep_anova_script.Rmd @ 0:c1403d18c189 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"

author	eschen42
date	Mon, 07 Mar 2022 19:05:01 +0000
parents
children	d728198f1ba5

comparison

equal deleted inserted replaced

--1:000000000000
+:c1403d18c189
+---
+title: "Quant Data Processing Script"
+author: "Larry Cheng; Art Eschenlauer"
+date: "May 28, 2018; Nov 16, 2021"
+output:
+html_document: default
+pdf_document: default
+params:
+inputFile: "Upstream_Map_pST_outputfile_STEP4.txt"
+alphaFile: "alpha_levels.txt"
+firstDataColumn: "Intensity"
+imputationMethod: !r c("group-median","median","mean","random")[4]
+meanPercentile: 1
+sdPercentile: 0.2
+regexSampleNames: "\\.(\\d+)[A-Z]$"
+regexSampleGrouping: "(\\d+)"
+imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt"
+---
+```{r setup, include=FALSE}
+# ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285
+knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10))
+```
+## Purpose:
+Perform imputation of missing values, quantile normalization, and ANOVA.
+<!--
+## Variables to change for each input file
+-->
+```{r include = FALSE}
+#Input Filename
+inputFile <- params$inputFile
+#First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is.
+firstDataColumn <- params$firstDataColumn
+FDC_is_integer <- TRUE
+firstDataColumn <- withCallingHandlers(
+as.integer(firstDataColumn)
+, warning = function(w) FDC_is_integer <<- FALSE
+)
+if (FALSE == FDC_is_integer) {
+firstDataColumn <- params$firstDataColumn
+}
+#False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05)
+valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1]
+#Imputed Data filename
+imputedDataFilename <- params$imputedDataFilename
+#ANOVA data filename
+```
+```{r include = FALSE}
+#Imputation method, should be one of c("random","group-median","median","mean")
+imputationMethod <- params$imputationMethod
+#Selection of percentile of logvalue data to set the mean for random number generation when using random imputation
+meanPercentile <- params$meanPercentile / 100.0
+#deviation adjustment-factor for random values; real number.
+sdPercentile <- params$sdPercentile
+#Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$"
+regexSampleNames <- params$regexSampleNames
+#Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up)
+# e.g., "(\\d+)"
+regexSampleGrouping <- params$regexSampleGrouping
+```
+```{r include = FALSE}
+### FUNCTIONS
+#ANOVA filter function
+anovaFunc <- function(x, groupingFactor) {
+x.aov = aov(as.numeric(x) ~ groupingFactor)
+pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1]
+pvalue
+}
+```
+### Checking that log-transformed sample distributions are similar:
+```{r echo=FALSE}
+library(data.table)
+# read.table reads a file in table format and creates a data frame from it.
+#   - note that `quote=""` means that quotation marks are treated literally.
+fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE)
+print(colnames(fullData))
+#head(fullData)
+if (FALSE == FDC_is_integer) {
+dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE)
+str(dataColumnIndices)
+if (length(dataColumnIndices) > 0) {
+firstDataColumn <- dataColumnIndices[1]
+} else {
+stop(paste("failed to convert firstDataColumn:", firstDataColumn))
+}
+}
+quantData0 <- fullData[firstDataColumn:length(fullData)]
+quantData <- fullData[firstDataColumn:length(fullData)]
+quantData[quantData==0] <- NA  #replace 0 with NA
+quantDataLog <- log10(quantData)
+rownames(quantDataLog) <- fullData$Phosphopeptide
+summary(quantDataLog)
+#data visualization
+old_par <- par(
+mai=par("mai") + c(0.5,0,0,0)
+)
+boxplot(
+quantDataLog
+, las=2
+)
+par(old_par)
+quantDataLog_stack <- stack(quantDataLog)
+```
+```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
+library(ggplot2)
+ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
+```
+### Globally, are phosphopeptide intensities are approximately unimodal?
+```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)}
+# ref for bquote particularly and plotting math expressions generally:
+#   https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/
+#identify the location of missing values
+fin <- is.finite(as.numeric(as.matrix(quantDataLog)))
+logvalues <- as.numeric(as.matrix(quantDataLog))[fin]
+plot(
+density(logvalues)
+, main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity))
+, xlab = bquote(log[10](intensity))
+)
+hist(
+x = as.numeric(as.matrix(quantDataLog))
+, breaks = 100
+, main = bquote("Frequency vs." ~ log[10](intensity))
+, xlab = bquote(log[10](intensity))
+)
+```
+<!--
+## Impute missing values
+-->
+### Distribution of standard deviations of phosphopeptides, ignoring missing values:
+```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)}
+#determine quantile
+q1 <- quantile(logvalues, probs = meanPercentile)[1]
+#determine standard deviation of quantile to impute
+sd_finite <- function(x) {
+ok <- is.finite(x)
+sd(x[ok]) * sdPercentile
+}
+sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide)
+plot(
+density(sds, na.rm=T)
+, main="Smoothed estimated probability density vs. std. deviation"
+, sub="(probability estimation made with Gaussian smoothing)"
+)
+m1 <- median(sds, na.rm=T) #sd to be used is the median sd
+```
+<!--
+The number of missing values are:
+-->
+```{r echo=FALSE}
+#Determine number of cells to impute
+temp <- quantData[is.na(quantData)]
+#Determine number of values to impute
+NoToImpute <- length(temp)
+```
+<!--
+% of values that are missing:
+-->
+```{r echo=FALSE}
+pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100
+```
+<!--
+First few rows of data before imputation:
+-->
+## Impute missing values
+```{r echo = FALSE}
+#ACE start segment: trt-median based imputation
+# prep for trt-median based imputation
+# Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$"
+#   get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names
+# regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
+m <- regexpr(regexSampleNames, names(quantData), perl=TRUE)
+tempMatches <- regmatches(names(quantData), m)
+print("Extracted sample names")
+print(tempMatches)
+m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
+sampleNumbers <- as.factor(regmatches(tempMatches, m2))
+print("Factor levels")
+print(sampleNumbers)
+```
+```{r echo = FALSE}
+#ACE hack begin
+#Determine number of cells to impute
+cat(
+sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)"
+, sum(rep.int(TRUE, nrow(quantData)))
+, sum(is.na(quantData))
+, pct_missing_values
+, "%"
+)
+)
+#ACE hack end
+```
+```{r echo = FALSE}
+#Impute data
+quantDataImputed <- quantData
+# Identify which values are missing and need to be imputed
+ind <- which(is.na(quantDataImputed), arr.ind=TRUE)
+```
+```{r echo = FALSE}
+# Apply imputation
+switch(
+imputationMethod
+, "group-median"={
+cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n")
+#goodRows <- rep.int(TRUE, nrow(quantDataImputed))
+sampleLevelIntegers <- as.integer(sampleNumbers)
+for (i in 1:length(levels(sampleNumbers))) {
+levelCols <- i == sampleLevelIntegers
+ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE)
+quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]]
+}
+goodRows <- !is.na(rowMeans(quantDataImputed))
+}
+, "median"={
+cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n")
+quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]]
+goodRows <- !is.na(rowMeans(quantDataImputed))
+}
+, "mean"={
+cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n")
+quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]]
+goodRows <- !is.na(rowMeans(quantDataImputed))
+}
+, "random"={
+cat(
+sprintf(
+"Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n"
+, q1, m1
+)
+)
+quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1)
+goodRows <- !is.na(rowMeans(quantDataImputed))
+}
+)
+```
+```{r echo = FALSE}
+#Determine number of cells to impute
+temp <- quantDataImputed[is.na(quantDataImputed)]
+cat(
+sprintf(
+"After imputation, there are:\n  %d missing values\n  %d usable peptides\n  %d peptides with too many missing values for further analysis"
+, sum(is.na(quantDataImputed[goodRows,]))
+, sum(goodRows)
+, sum(!goodRows)
+)
+)
+```
+```{r echo = FALSE}
+# Zap rows where imputation was ineffective
+fullData         <- fullData        [goodRows, ]
+quantData        <- quantData       [goodRows, ]
+quantDataImputed <- quantDataImputed[goodRows, ]
+```
+```{r echo = FALSE}
+d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed)))))
+d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)]))))
+```
+```{r echo = FALSE}
+if (sum(is.na(quantData)) > 0) {
+# There ARE missing values
+d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)])))))
+} else {
+# There are NO missing values
+d_imputed <- d_combined
+}
+```
+<!-- ```{r echo = FALSE, fig.cap = "Blue =  Data before imputation; Red = Imputed data"} -->
+```{r echo = FALSE, fig.dim=c(9,5)}
+ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y))
+plot(
+d_combined
+, ylim = ylim
+, sub = "Blue = data before imputation; Red = imputed data"
+, main = "Density vs. log10(intensity) before and after imputation"
+)
+lines(d_original, col="blue")
+lines(d_imputed, col="red")
+```
+## Perform Quantile Normalization
+```{r echo=FALSE}
+library(preprocessCore)
+# Apply quantile normalization using preprocessCore::normalize.quantiles
+# ---
+# tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html
+#   except this: https://support.bioconductor.org/p/122925/#9135989
+#   says to install it like this:
+#     ```
+#     BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1])
+#     ```
+# conda installation (necessary because of a bug in recent openblas):
+#   conda install bioconductor-preprocesscore openblas=0.3.3
+# ...
+# ---
+# normalize.quantiles {preprocessCore}	--  Quantile Normalization
+#
+# Description:
+#   Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities.
+#
+# Usage:
+#   normalize.quantiles(x,copy=TRUE, keep.names=FALSE)
+#
+# Arguments:
+#
+#   - x: A matrix of intensities where each column corresponds to a chip and each row is a probe.
+#
+#   - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy,
+#       but in certain situations not making a copy of the matrix, but instead normalizing
+#       it in place will be more memory friendly.
+#
+#   - keep.names: Boolean option to preserve matrix row and column names in output.
+#
+# Details:
+#   This method is based upon the concept of a quantile-quantile plot extended to n dimensions.
+#     No special allowances are made for outliers. If you make use of quantile normalization
+#     please cite Bolstad et al, Bioinformatics (2003).
+#
+#   This functions will handle missing data (ie NA values), based on
+#     the assumption that the data is missing at random.
+#
+#   Note that the current implementation optimizes for better memory usage
+#     at the cost of some additional run-time.
+#
+# Value: A normalized matrix.
+#
+# Author: Ben Bolstad, bmbolstad.com
+#
+# References
+#
+#   - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide
+#       Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf
+#
+#   - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of
+#       Normalization Methods for High Density Oligonucleotide Array Data Based on Bias
+#       and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185
+#       http://bmbolstad.com/misc/normalize/normalize.html
+# ...
+if (TRUE) {
+quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed))
+} else {
+quantDataImputed.qn <- as.matrix(quantDataImputed)
+}
+quantDataImputed.qn = as.data.frame(quantDataImputed.qn)
+names(quantDataImputed.qn) = names(quantDataImputed)
+quantDataImputed_QN_log <- log10(quantDataImputed.qn)
+rownames(quantDataImputed_QN_log) <- fullData[,1]
+quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn))))
+anyNaN <- function (x) {
+!any(x == "NaN")
+}
+sel = apply(quantDataImputed.qn.LS, 1, anyNaN)
+quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),]
+quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2)
+#output quantile normalized data
+dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log)
+write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE)
+```
+<!-- ACE insertion begin -->
+### Checking that normalized, imputed, log-transformed sample distributions are similar:
+```{r echo=FALSE}
+#library(data.table)
+#Save unimputed quantDataLog for plotting below
+unimputedQuantDataLog <- quantDataLog
+#Log10 transform (after preparing for zero values, which should never happen...)
+quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001
+quantDataLog <- log10(quantDataImputed.qn)
+summary(quantDataLog)
+#Output quantile-normalized log-transformed dataset with imputed, normalized data
+dataTableImputed <- cbind(fullData[1:9], quantDataLog)
+write.table(
+dataTableImputed
+, file=imputedDataFilename
+, sep="\t"
+, col.names=TRUE
+, row.names=FALSE
+, quote=FALSE
+)
+#data visualization
+old_par <- par(
+mai=par("mai") + c(0.5,0,0,0)
+, oma=par("oma") + c(0.5,0,0,0)
+)
+boxplot(
+quantDataLog
+, las=2
+)
+par(old_par)
+```
+```{r echo=FALSE, fig.dim=c(9,5)}
+quantDataLog_stack <- stack(quantDataLog)
+ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind))
+```
+## Perform ANOVA filters
+```{r,echo=FALSE}
+#Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df)
+connect_df <- data.frame(
+dataTableImputed_QN_LT$Phosphopeptide
+, dataTableImputed_QN_LT[,firstDataColumn]
+)
+colnames(connect_df) <- c("Phosphopeptide","Intensity")
+```
+```{r echo=FALSE, fig.dim=c(9,10)}
+# Get factors -> group replicates (as indicated by terminal letter) by the preceding digits
+#   For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc..
+m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE)
+#ACE str(m)
+tempMatches <- regmatches(names(quantDataImputed_QN_log), m)
+#ACE str(tempMatches)
+numSamples <- length(tempMatches)
+#ACE str(numSamples)
+m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE)
+#ACE str(m2)
+#ACE str(regmatches(tempMatches, m2))
+sampleNumbers <- as.factor(regmatches(tempMatches, m2))
+#ACE str(sampleNumbers)
+if (length(levels(sampleNumbers))<2) {
+cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n")
+cat("Unparsed sample names are:\n")
+print(names(quantDataImputed_QN_log))
+cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames))
+cat("Parsed names are:\n")
+print(tempMatches)
+cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping))
+cat("Sample group assignments are:\n")
+print(regmatches(tempMatches, m2))
+} else {
+pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers)
+pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr")
+pValueData <- data.frame(
+phosphopeptide = fullData[,1]
+, rawANOVAp = pValueData.anovaPs
+, FDRadjustedANOVAp = pValueData.anovaPs.FDR
+)
+#ACE rownames(pValueData) <- fullData[,1]
+# output ANOVA file to constructed filename,
+#   e.g.    "Outputfile_pST_ANOVA_STEP5.txt"
+#   becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt"
+#Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values
+dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog)
+write.table(
+dataTableImputed
+, file=imputedDataFilename
+, sep="\t"
+, col.names=TRUE
+, row.names=FALSE
+, quote=FALSE
+)
+pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),]
+cutoff <- valFDR[1]
+for (cutoff in valFDR){ #loop through FDR cutoffs
+filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE]
+filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE]
+filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE]
+# <!-- ACE insertion start -->
+old_oma <- par("oma")
+old_par <- par(
+mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1)
+, oma=old_oma * c(1,1,0.3,1)
+, cex.main=0.9
+, cex.axis=0.7
+)
+if (nrow(filteredData.filtered) > 0) {
+boxplot(
+filteredData.filtered
+, main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff)
+# no line plot , main = ""
+, las = 2
+# , ylim = c(5.5,10)
+, ylab = expression(log[10](intensity))
+)
+} else {
+cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff))
+}
+par(old_par)
+#Add Phosphopeptide column to ANOVA filtered table
+ANOVA.filtered_merge <- merge(
+x = connect_df
+, y = filteredData.filtered
+, by.x="Intensity"
+, by.y=1
+)
+ANOVA.filtered_merge.order <- rownames(filtered_p)
+ANOVA.filtered_merge.format <- sapply(
+X = filtered_p$FDRadjustedANOVAp
+, FUN = function(x) {
+if (x > 0.0001)
+paste0("(%0.",1+ceiling(-log10(x)),"f) %s")
+else
+paste0("(%0.4e) %s")
+}
+)
+#ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s")
+ANOVA.filtered <- data.table(
+ANOVA.filtered_merge$Phosphopeptide
+, ANOVA.filtered_merge$Intensity
+, ANOVA.filtered_merge[, 2:numSamples+1]
+)
+colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered))
+# merge qualitative columns into the ANOVA data
+output_table <- data.frame(ANOVA.filtered$Phosphopeptide)
+output_table <- merge(
+x = output_table
+, y = dataTableImputed_QN_LT
+, by.x = "ANOVA.filtered.Phosphopeptide"
+, by.y="Phosphopeptide"
+)
+#Produce heatmap to visualize significance and the effect of imputation
+m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,])
+if (nrow(m) > 0) {
+rownames_m <- rownames(m)
+rownames(m) <- sapply(
+X = 1:nrow(m)
+, FUN = function(i) {
+sprintf(
+ANOVA.filtered_merge.format[i]
+, filtered_p$FDRadjustedANOVAp[i]
+, rownames_m[i]
+)
+}
+)
+margins <- c(
+max(nchar(colnames(m))) * 10 / 16 # col
+, max(nchar(rownames(m))) * 5 / 16 # row
+)
+how_many_peptides <- min(50, nrow(m))
+op <- par("cex.main")
+try(
+if (nrow(m) > 1) {
+par(cex.main=0.6)
+heatmap(
+m[how_many_peptides:1,]
+, Rowv = NA
+, Colv = NA
+, cexRow = 0.7
+, cexCol = 0.8
+, scale="row"
+, margins = margins
+, main = "Heatmap of unimputed, unnormalized intensities"
+, xlab = ""
+# , main = bquote(
+#     .( how_many_peptides )
+#       ~ " peptides with adjusted p-value <"
+#       ~ .(sprintf("%0.2f", cutoff))
+#     )
+)
+}
+)
+#ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim)
+par(op)
+}
+}
+}
+```
+## Peptide IDs, etc.
+See output files.

Mercurial > repos > eschen42 > mqppep_anova

comparison mqppep_anova_script.Rmd @ 0:c1403d18c189 draft