Mercurial > repos > eschen42 > mqppep_preproc
comparison mqppep_anova_script.Rmd @ 7:36f183e5e4ed draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit 9a0fa6d0f9aadc069a5551a54da6daf307885637"
| author | eschen42 |
|---|---|
| date | Tue, 15 Mar 2022 00:35:55 +0000 |
| parents | 2c7e1b167736 |
| children |
comparison
equal
deleted
inserted
replaced
| 6:42daf70d4ed4 | 7:36f183e5e4ed |
|---|---|
| 1 --- | 1 --- |
| 2 title: "Quant Data Processing Script" | 2 title: "MaxQuant Phospho-Proteomic Enrichment Pipeline ANOVA" |
| 3 author: "Larry Cheng; Art Eschenlauer" | 3 author: "Larry Cheng; Art Eschenlauer" |
| 4 date: "May 28, 2018; Nov 16, 2021" | 4 date: "May 28, 2018; Nov 16, 2021" |
| 5 output: | 5 output: |
| 6 html_document: default | |
| 7 pdf_document: default | 6 pdf_document: default |
| 8 params: | 7 params: |
| 9 inputFile: "Upstream_Map_pST_outputfile_STEP4.txt" | 8 inputFile: "test-data/test_input_for_anova.tabular" |
| 10 alphaFile: "alpha_levels.txt" | 9 alphaFile: "test-data/alpha_levels.tabular" |
| 11 firstDataColumn: "Intensity" | 10 firstDataColumn: "Intensity" |
| 12 imputationMethod: !r c("group-median","median","mean","random")[4] | 11 imputationMethod: !r c("group-median", "median", "mean", "random")[1] |
| 13 meanPercentile: 1 | 12 meanPercentile: 1 |
| 14 sdPercentile: 0.2 | 13 sdPercentile: 0.2 |
| 15 regexSampleNames: "\\.(\\d+)[A-Z]$" | 14 regexSampleNames: "\\.(\\d+)[A-Z]$" |
| 16 regexSampleGrouping: "(\\d+)" | 15 regexSampleGrouping: "(\\d+)" |
| 17 imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" | 16 imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" |
| 18 --- | 17 --- |
| 19 ```{r setup, include=FALSE} | 18 ```{r setup, include = FALSE} |
| 20 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | 19 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 |
| 21 knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10)) | 20 knitr::opts_chunk$set(echo = FALSE, fig.dim = c(9, 10)) |
| 21 | |
| 22 ### FUNCTIONS | |
| 23 | |
| 24 #ANOVA filter function | |
| 25 anova_func <- function(x, grouping_factor) { | |
| 26 x_aov <- aov(as.numeric(x) ~ grouping_factor) | |
| 27 pvalue <- summary(x_aov)[[1]][["Pr(>F)"]][1] | |
| 28 pvalue | |
| 29 } | |
| 22 ``` | 30 ``` |
| 23 | 31 |
| 24 ## Purpose: | 32 ## Purpose: |
| 25 Perform imputation of missing values, quantile normalization, and ANOVA. | 33 Perform imputation of missing values, quantile normalization, and ANOVA. |
| 26 | 34 |
| 27 <!-- | 35 <!-- |
| 28 ## Variables to change for each input file | 36 ## Variables to change for each input file |
| 29 --> | 37 --> |
| 30 ```{r include = FALSE} | 38 ```{r include = FALSE} |
| 31 #Input Filename | 39 # Input Filename |
| 32 inputFile <- params$inputFile | 40 input_file <- params$inputFile |
| 33 | 41 |
| 34 #First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is. | 42 # First data column - ideally, this could be detected via regexSampleNames, |
| 35 firstDataColumn <- params$firstDataColumn | 43 # but for now leave it as is. |
| 36 FDC_is_integer <- TRUE | 44 first_data_column <- params$firstDataColumn |
| 37 firstDataColumn <- withCallingHandlers( | 45 fdc_is_integer <- TRUE |
| 38 as.integer(firstDataColumn) | 46 first_data_column <- withCallingHandlers( |
| 39 , warning = function(w) FDC_is_integer <<- FALSE | 47 as.integer(first_data_column) |
| 48 , warning = function(w) fdc_is_integer <<- FALSE | |
| 40 ) | 49 ) |
| 41 if (FALSE == FDC_is_integer) { | 50 if (FALSE == fdc_is_integer) { |
| 42 firstDataColumn <- params$firstDataColumn | 51 first_data_column <- params$firstDataColumn |
| 43 } | 52 } |
| 44 | 53 |
| 45 #False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05) | 54 # False discovery rate adjustment for ANOVA |
| 46 valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1] | 55 # Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05 |
| 56 val_fdr <- | |
| 57 read.table(file = params$alphaFile, sep = "\t", header = F, quote = "")[, 1] | |
| 47 | 58 |
| 48 #Imputed Data filename | 59 #Imputed Data filename |
| 49 imputedDataFilename <- params$imputedDataFilename | 60 imputed_data_filename <- params$imputedDataFilename |
| 50 | 61 |
| 51 #ANOVA data filename | 62 #ANOVA data filename |
| 52 ``` | 63 ``` |
| 53 | 64 |
| 54 ```{r include = FALSE} | 65 ```{r echo = FALSE} |
| 55 #Imputation method, should be one of c("random","group-median","median","mean") | 66 # Imputation method, should be one of |
| 56 imputationMethod <- params$imputationMethod | 67 # "random", "group-median", "median", or "mean" |
| 57 | 68 imputation_method <- params$imputationMethod |
| 58 #Selection of percentile of logvalue data to set the mean for random number generation when using random imputation | 69 |
| 59 meanPercentile <- params$meanPercentile / 100.0 | 70 # Selection of percentile of logvalue data to set the mean for random number |
| 60 | 71 # generation when using random imputation |
| 61 #deviation adjustment-factor for random values; real number. | 72 mean_percentile <- params$meanPercentile / 100.0 |
| 62 sdPercentile <- params$sdPercentile | 73 |
| 63 | 74 # deviation adjustment-factor for random values; real number. |
| 64 #Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | 75 sd_percentile <- params$sdPercentile |
| 65 regexSampleNames <- params$regexSampleNames | 76 |
| 66 | 77 # Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" |
| 67 #Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up) | 78 regex_sample_names <- params$regexSampleNames |
| 68 # e.g., "(\\d+)" | 79 |
| 69 regexSampleGrouping <- params$regexSampleGrouping | 80 # Regular expression to extract Sample Grouping from Sample Name; |
| 70 | 81 # if error occurs, compare sample_factor_levels and temp_matches |
| 71 ``` | 82 # to see if groupings/pairs line up |
| 72 | 83 # e.g., "(\\d+)" |
| 73 | 84 regex_sample_grouping <- params$regexSampleGrouping |
| 74 ```{r include = FALSE} | 85 |
| 75 ### FUNCTIONS | 86 ``` |
| 76 | 87 |
| 77 #ANOVA filter function | 88 ```{r echo = FALSE} |
| 78 anovaFunc <- function(x, groupingFactor) { | 89 ### READ DATA |
| 79 x.aov = aov(as.numeric(x) ~ groupingFactor) | 90 |
| 80 pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1] | 91 library(data.table) |
| 81 pvalue | 92 |
| 82 } | 93 # read.table reads a file in table format and creates a data frame from it. |
| 83 ``` | 94 # - note that `quote = ""` means that quotation marks are treated literally. |
| 84 | 95 full_data <- read.table( |
| 85 | 96 file = input_file, |
| 97 sep = "\t", | |
| 98 header = T, | |
| 99 quote = "", | |
| 100 check.names = FALSE | |
| 101 ) | |
| 102 ``` | |
| 103 | |
| 104 ### Column names from input file | |
| 105 | |
| 106 ```{r echo = FALSE, results = 'markup'} | |
| 107 print(colnames(full_data)) | |
| 108 data_column_indices <- grep(first_data_column, names(full_data), perl = TRUE) | |
| 109 cat(sprintf("First data column: %d\n", min(data_column_indices))) | |
| 110 cat(sprintf("Last data column: %d\n", max(data_column_indices))) | |
| 111 ``` | |
| 112 | |
| 113 ```{r echo = FALSE, results = 'asis'} | |
| 114 cat("\\newpage\n") | |
| 115 ``` | |
| 86 | 116 |
| 87 ### Checking that log-transformed sample distributions are similar: | 117 ### Checking that log-transformed sample distributions are similar: |
| 88 ```{r echo=FALSE} | 118 |
| 89 | 119 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
| 90 library(data.table) | 120 |
| 91 | 121 if (FALSE == fdc_is_integer) { |
| 92 # read.table reads a file in table format and creates a data frame from it. | 122 |
| 93 # - note that `quote=""` means that quotation marks are treated literally. | 123 if (length(data_column_indices) > 0) { |
| 94 fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE) | 124 first_data_column <- data_column_indices[1] |
| 95 print(colnames(fullData)) | |
| 96 #head(fullData) | |
| 97 | |
| 98 if (FALSE == FDC_is_integer) { | |
| 99 dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE) | |
| 100 str(dataColumnIndices) | |
| 101 if (length(dataColumnIndices) > 0) { | |
| 102 firstDataColumn <- dataColumnIndices[1] | |
| 103 } else { | 125 } else { |
| 104 stop(paste("failed to convert firstDataColumn:", firstDataColumn)) | 126 stop(paste("failed to convert firstDataColumn:", first_data_column)) |
| 105 } | 127 } |
| 106 } | 128 } |
| 107 | 129 |
| 108 quantData0 <- fullData[firstDataColumn:length(fullData)] | 130 quant_data0 <- full_data[first_data_column:length(full_data)] |
| 109 quantData <- fullData[firstDataColumn:length(fullData)] | 131 quant_data <- full_data[first_data_column:length(full_data)] |
| 110 quantData[quantData==0] <- NA #replace 0 with NA | 132 quant_data[quant_data == 0] <- NA #replace 0 with NA |
| 111 quantDataLog <- log10(quantData) | 133 quant_data_log <- log10(quant_data) |
| 112 | 134 |
| 113 rownames(quantDataLog) <- fullData$Phosphopeptide | 135 rownames(quant_data_log) <- full_data$Phosphopeptide |
| 114 | 136 |
| 115 summary(quantDataLog) | 137 # data visualization |
| 116 | |
| 117 #data visualization | |
| 118 old_par <- par( | 138 old_par <- par( |
| 119 mai=par("mai") + c(0.5,0,0,0) | 139 mai = par("mai") + c(0.5, 0, 0, 0) |
| 120 ) | 140 ) |
| 121 boxplot( | 141 boxplot( |
| 122 quantDataLog | 142 quant_data_log |
| 123 , las=2 | 143 , las = 2 |
| 124 ) | 144 ) |
| 125 par(old_par) | 145 par(old_par) |
| 126 | 146 |
| 127 quantDataLog_stack <- stack(quantDataLog) | 147 |
| 128 ``` | 148 |
| 129 | 149 cat("\\newline\n") |
| 130 ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} | 150 cat("\\newline\n") |
| 151 | |
| 152 ``` | |
| 153 | |
| 154 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4), warning = FALSE} | |
| 155 quant_data_log_stack <- stack(quant_data_log) | |
| 131 library(ggplot2) | 156 library(ggplot2) |
| 132 ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) | 157 ggplot( |
| 158 quant_data_log_stack, | |
| 159 aes(x = values)) + geom_density(aes(group = ind, colour = ind)) | |
| 133 ``` | 160 ``` |
| 134 | 161 |
| 135 ### Globally, are phosphopeptide intensities are approximately unimodal? | 162 ### Globally, are phosphopeptide intensities are approximately unimodal? |
| 136 ```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)} | 163 |
| 137 | 164 <!-- |
| 138 # ref for bquote particularly and plotting math expressions generally: | 165 # ref for bquote below particularly and plotting math expressions generally: |
| 139 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ | 166 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ |
| 140 | 167 --> |
| 141 #identify the location of missing values | 168 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} |
| 142 fin <- is.finite(as.numeric(as.matrix(quantDataLog))) | 169 |
| 143 | 170 # identify the location of missing values |
| 144 logvalues <- as.numeric(as.matrix(quantDataLog))[fin] | 171 fin <- is.finite(as.numeric(as.matrix(quant_data_log))) |
| 172 | |
| 173 logvalues <- as.numeric(as.matrix(quant_data_log))[fin] | |
| 145 plot( | 174 plot( |
| 146 density(logvalues) | 175 density(logvalues), |
| 147 , main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)) | 176 main = bquote( |
| 148 , xlab = bquote(log[10](intensity)) | 177 "Smoothed estimated probability density vs." ~ log[10](intensity)), |
| 149 ) | 178 xlab = bquote(log[10](intensity)) |
| 179 ) | |
| 150 hist( | 180 hist( |
| 151 x = as.numeric(as.matrix(quantDataLog)) | 181 x = as.numeric(as.matrix(quant_data_log)) |
| 152 , breaks = 100 | 182 , breaks = 100 |
| 153 , main = bquote("Frequency vs." ~ log[10](intensity)) | 183 , main = bquote("Frequency vs." ~ log[10](intensity)) |
| 154 , xlab = bquote(log[10](intensity)) | 184 , xlab = bquote(log[10](intensity)) |
| 155 ) | 185 ) |
| 156 ``` | 186 ``` |
| 157 | 187 |
| 158 <!-- | |
| 159 ## Impute missing values | |
| 160 --> | |
| 161 | |
| 162 ### Distribution of standard deviations of phosphopeptides, ignoring missing values: | 188 ### Distribution of standard deviations of phosphopeptides, ignoring missing values: |
| 163 | 189 |
| 164 ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} | 190 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 5)} |
| 165 #determine quantile | 191 # determine quantile |
| 166 q1 <- quantile(logvalues, probs = meanPercentile)[1] | 192 q1 <- quantile(logvalues, probs = mean_percentile)[1] |
| 167 | 193 |
| 168 #determine standard deviation of quantile to impute | 194 # determine standard deviation of quantile to impute |
| 169 sd_finite <- function(x) { | 195 sd_finite <- function(x) { |
| 170 ok <- is.finite(x) | 196 ok <- is.finite(x) |
| 171 sd(x[ok]) * sdPercentile | 197 sd(x[ok]) * sd_percentile |
| 172 } | 198 } |
| 173 sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide) | 199 # 1 = row of matrix (ie, phosphopeptide) |
| 200 sds <- apply(quant_data_log, 1, sd_finite) | |
| 174 plot( | 201 plot( |
| 175 density(sds, na.rm=T) | 202 density(sds, na.rm = T) |
| 176 , main="Smoothed estimated probability density vs. std. deviation" | 203 , main = "Smoothed estimated probability density vs. std. deviation" |
| 177 , sub="(probability estimation made with Gaussian smoothing)" | 204 , sub = "(probability estimation made with Gaussian smoothing)" |
| 178 ) | 205 ) |
| 179 | 206 |
| 180 m1 <- median(sds, na.rm=T) #sd to be used is the median sd | 207 m1 <- median(sds, na.rm = T) #sd to be used is the median sd |
| 181 | 208 |
| 182 ``` | 209 ``` |
| 183 | 210 |
| 184 | 211 |
| 185 | 212 |
| 186 <!-- | 213 <!-- |
| 187 The number of missing values are: | 214 The number of missing values are: |
| 188 --> | 215 --> |
| 189 ```{r echo=FALSE} | 216 ```{r echo = FALSE} |
| 190 #Determine number of cells to impute | 217 #Determine number of cells to impute |
| 191 temp <- quantData[is.na(quantData)] | 218 temp <- quant_data[is.na(quant_data)] |
| 192 | 219 |
| 193 #Determine number of values to impute | 220 #Determine number of values to impute |
| 194 NoToImpute <- length(temp) | 221 number_to_impute <- length(temp) |
| 195 ``` | 222 ``` |
| 196 | 223 |
| 197 <!-- | 224 <!-- |
| 198 % of values that are missing: | 225 % of values that are missing: |
| 199 --> | 226 --> |
| 200 ```{r echo=FALSE} | 227 ```{r echo = FALSE} |
| 201 pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100 | 228 pct_missing_values <- length(temp) / (length(logvalues) + length(temp)) * 100 |
| 202 ``` | 229 ``` |
| 203 | 230 |
| 204 <!-- | 231 <!-- |
| 205 First few rows of data before imputation: | 232 First few rows of data before imputation: |
| 206 --> | 233 --> |
| 234 ```{r echo = FALSE, results = 'asis'} | |
| 235 cat("\\newpage\n") | |
| 236 ``` | |
| 237 | |
| 238 ## Parse sample names | |
| 239 | |
| 240 Parse the names of the samples to deduce the factor level for each sample: | |
| 241 | |
| 242 ```{r echo = FALSE} | |
| 243 | |
| 244 # prep for trt-median based imputation | |
| 245 | |
| 246 # Assuming that regex_sample_names <- "\\.(\\d+)[A-Z]$" | |
| 247 # get factors -> | |
| 248 # group runs (samples) by ignoring terminal [A-Z] in sample names | |
| 249 | |
| 250 m <- regexpr(regex_sample_names, names(quant_data), perl = TRUE) | |
| 251 temp_matches <- regmatches(names(quant_data), m) | |
| 252 print("Extracted sample names") | |
| 253 print(temp_matches) | |
| 254 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) | |
| 255 sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) | |
| 256 print("Factor levels") | |
| 257 print(sample_factor_levels) | |
| 258 | |
| 259 ``` | |
| 207 ## Impute missing values | 260 ## Impute missing values |
| 208 ```{r echo = FALSE} | 261 |
| 209 | 262 ```{r echo = FALSE} |
| 210 #ACE start segment: trt-median based imputation | 263 |
| 211 # prep for trt-median based imputation | |
| 212 | |
| 213 # Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$" | |
| 214 # get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names | |
| 215 # regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) | |
| 216 m <- regexpr(regexSampleNames, names(quantData), perl=TRUE) | |
| 217 tempMatches <- regmatches(names(quantData), m) | |
| 218 print("Extracted sample names") | |
| 219 print(tempMatches) | |
| 220 m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) | |
| 221 sampleNumbers <- as.factor(regmatches(tempMatches, m2)) | |
| 222 print("Factor levels") | |
| 223 print(sampleNumbers) | |
| 224 | |
| 225 ``` | |
| 226 ```{r echo = FALSE} | |
| 227 | |
| 228 #ACE hack begin | |
| 229 #Determine number of cells to impute | 264 #Determine number of cells to impute |
| 230 cat( | 265 cat("Before imputation,", |
| 231 sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)" | 266 sprintf( |
| 232 , sum(rep.int(TRUE, nrow(quantData))) | 267 "there are:\n %d peptides\n %d missing values (%2.0f%s)", |
| 233 , sum(is.na(quantData)) | 268 sum(rep.int(TRUE, nrow(quant_data))), |
| 234 , pct_missing_values | 269 sum(is.na(quant_data)), |
| 235 , "%" | 270 pct_missing_values, |
| 236 ) | 271 "%" |
| 237 ) | 272 ) |
| 238 #ACE hack end | 273 ) |
| 239 | 274 |
| 240 ``` | 275 ``` |
| 241 ```{r echo = FALSE} | 276 ```{r echo = FALSE} |
| 242 | 277 |
| 243 #Impute data | 278 #Impute data |
| 244 quantDataImputed <- quantData | 279 quant_data_imp <- quant_data |
| 245 | 280 |
| 246 # Identify which values are missing and need to be imputed | 281 # Identify which values are missing and need to be imputed |
| 247 ind <- which(is.na(quantDataImputed), arr.ind=TRUE) | 282 ind <- which(is.na(quant_data_imp), arr.ind = TRUE) |
| 248 | 283 |
| 249 ``` | 284 ``` |
| 250 ```{r echo = FALSE} | 285 ```{r echo = FALSE} |
| 251 | 286 |
| 252 # Apply imputation | 287 # Apply imputation |
| 253 switch( | 288 switch( |
| 254 imputationMethod | 289 imputation_method |
| 255 , "group-median"={ | 290 , "group-median" = { |
| 256 cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n") | 291 cat("Imputation method:\n substitute missing value", |
| 257 #goodRows <- rep.int(TRUE, nrow(quantDataImputed)) | 292 "with median peptide-intensity for sample-group\n") |
| 258 sampleLevelIntegers <- as.integer(sampleNumbers) | 293 |
| 259 for (i in 1:length(levels(sampleNumbers))) { | 294 sample_level_integers <- as.integer(sample_factor_levels) |
| 260 levelCols <- i == sampleLevelIntegers | 295 for (i in seq_len(length(levels(sample_factor_levels)))) { |
| 261 ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE) | 296 level_cols <- i == sample_level_integers |
| 262 quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]] | 297 ind <- which(is.na(quant_data_imp[, level_cols]), arr.ind = TRUE) |
| 298 quant_data_imp[ind, level_cols] <- | |
| 299 apply(quant_data_imp[, level_cols], 1, median, na.rm = T)[ind[, 1]] | |
| 263 } | 300 } |
| 264 goodRows <- !is.na(rowMeans(quantDataImputed)) | 301 good_rows <- !is.na(rowMeans(quant_data_imp)) |
| 265 } | 302 } |
| 266 , "median"={ | 303 , "median" = { |
| 267 cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n") | 304 cat("Imputation method:\n substitute missing value with", |
| 268 quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]] | 305 "median peptide-intensity across all sample classes\n") |
| 269 goodRows <- !is.na(rowMeans(quantDataImputed)) | 306 quant_data_imp[ind] <- apply(quant_data_imp, 1, median, na.rm = T)[ind[, 1]] |
| 307 good_rows <- !is.na(rowMeans(quant_data_imp)) | |
| 270 } | 308 } |
| 271 , "mean"={ | 309 , "mean" = { |
| 272 cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n") | 310 cat("Imputation method:\n substitute missing value with", |
| 273 quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]] | 311 "mean peptide-intensity across all sample classes\n") |
| 274 goodRows <- !is.na(rowMeans(quantDataImputed)) | 312 quant_data_imp[ind] <- apply(quant_data_imp, 1, mean, na.rm = T)[ind[, 1]] |
| 313 good_rows <- !is.na(rowMeans(quant_data_imp)) | |
| 275 } | 314 } |
| 276 , "random"={ | 315 , "random" = { |
| 277 cat( | 316 cat( |
| 317 "Imputation method:\n substitute missing value with\n ", | |
| 278 sprintf( | 318 sprintf( |
| 279 "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n" | 319 "random intensity N ~ (%0.2f, %0.2f)\n" |
| 280 , q1, m1 | 320 , q1, m1 |
| 281 ) | 321 ) |
| 282 ) | 322 ) |
| 283 quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1) | 323 quant_data_imp[is.na(quant_data_imp)] <- |
| 284 goodRows <- !is.na(rowMeans(quantDataImputed)) | 324 10 ^ rnorm(number_to_impute, mean = q1, sd = m1) |
| 325 good_rows <- !is.na(rowMeans(quant_data_imp)) | |
| 285 } | 326 } |
| 286 ) | 327 ) |
| 287 | 328 |
| 288 ``` | 329 ``` |
| 289 ```{r echo = FALSE} | 330 ```{r echo = FALSE} |
| 290 | 331 |
| 291 #Determine number of cells to impute | 332 #Determine number of cells to impute |
| 292 temp <- quantDataImputed[is.na(quantDataImputed)] | 333 temp <- quant_data_imp[is.na(quant_data_imp)] |
| 293 cat( | 334 cat("After imputation, there are:", |
| 294 sprintf( | 335 sprintf( |
| 295 "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis" | 336 "\n %d missing values\n %d usable peptides analysis" |
| 296 , sum(is.na(quantDataImputed[goodRows,])) | 337 , sum(is.na(quant_data_imp[good_rows, ])) |
| 297 , sum(goodRows) | 338 , sum(good_rows) |
| 298 , sum(!goodRows) | 339 ), |
| 340 sprintf( | |
| 341 "\n %d peptides with too many missing values for further analysis" | |
| 342 , sum(!good_rows) | |
| 299 ) | 343 ) |
| 300 ) | 344 ) |
| 301 ``` | 345 ``` |
| 302 ```{r echo = FALSE} | 346 ```{r echo = FALSE} |
| 303 | 347 |
| 304 | 348 |
| 305 # Zap rows where imputation was ineffective | 349 # Zap rows where imputation was ineffective |
| 306 fullData <- fullData [goodRows, ] | 350 full_data <- full_data [good_rows, ] |
| 307 quantData <- quantData [goodRows, ] | 351 quant_data <- quant_data [good_rows, ] |
| 308 quantDataImputed <- quantDataImputed[goodRows, ] | 352 quant_data_imp <- quant_data_imp[good_rows, ] |
| 309 | 353 |
| 310 ``` | 354 ``` |
| 311 ```{r echo = FALSE} | 355 ```{r echo = FALSE} |
| 312 | 356 |
| 313 d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed))))) | 357 d_combined <- (density(as.numeric(as.matrix( |
| 314 d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))) | 358 log10(quant_data_imp) |
| 315 | 359 )))) |
| 316 ``` | 360 d_original <- |
| 317 ```{r echo = FALSE} | 361 density(as.numeric(as.matrix( |
| 318 | 362 log10(quant_data_imp[!is.na(quant_data)])))) |
| 319 if (sum(is.na(quantData)) > 0) { | 363 |
| 364 ``` | |
| 365 ```{r echo = FALSE} | |
| 366 | |
| 367 if (sum(is.na(quant_data)) > 0) { | |
| 320 # There ARE missing values | 368 # There ARE missing values |
| 321 d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)]))))) | 369 d_imputed <- |
| 370 (density(as.numeric(as.matrix( | |
| 371 log10(quant_data_imp[is.na(quant_data)]) | |
| 372 )))) | |
| 322 } else { | 373 } else { |
| 323 # There are NO missing values | 374 # There are NO missing values |
| 324 d_imputed <- d_combined | 375 d_imputed <- d_combined |
| 325 } | 376 } |
| 326 | 377 |
| 327 ``` | 378 ``` |
| 328 | 379 |
| 329 <!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} --> | 380 ```{r echo = FALSE, fig.dim = c(9, 5)} |
| 330 ```{r echo = FALSE, fig.dim=c(9,5)} | |
| 331 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) | 381 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) |
| 332 plot( | 382 plot( |
| 333 d_combined | 383 d_combined, |
| 334 , ylim = ylim | 384 ylim = ylim, |
| 335 , sub = "Blue = data before imputation; Red = imputed data" | 385 sub = "Blue = data before imputation; Red = imputed data", |
| 336 , main = "Density vs. log10(intensity) before and after imputation" | 386 main = "Density vs. log10(intensity) before and after imputation" |
| 337 ) | 387 ) |
| 338 lines(d_original, col="blue") | 388 lines(d_original, col = "blue") |
| 339 lines(d_imputed, col="red") | 389 lines(d_imputed, col = "red") |
| 340 ``` | 390 ``` |
| 341 | 391 |
| 342 ## Perform Quantile Normalization | 392 ## Perform Quantile Normalization |
| 343 ```{r echo=FALSE} | 393 |
| 344 library(preprocessCore) | 394 <!-- |
| 345 # Apply quantile normalization using preprocessCore::normalize.quantiles | 395 # Apply quantile normalization using preprocessCore::normalize.quantiles |
| 346 # --- | 396 # --- |
| 347 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | 397 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html |
| 348 # except this: https://support.bioconductor.org/p/122925/#9135989 | 398 # except this: https://support.bioconductor.org/p/122925/#9135989 |
| 349 # says to install it like this: | 399 # says to install it like this: |
| 350 # ``` | 400 # ``` |
| 351 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1]) | 401 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE, lib=.libPaths()[1]) |
| 352 # ``` | 402 # ``` |
| 353 # conda installation (necessary because of a bug in recent openblas): | 403 # conda installation (necessary because of a bug in recent openblas): |
| 354 # conda install bioconductor-preprocesscore openblas=0.3.3 | 404 # conda install bioconductor-preprocesscore openblas=0.3.3 |
| 355 # ... | 405 # ... |
| 356 # --- | 406 # --- |
| 358 # | 408 # |
| 359 # Description: | 409 # Description: |
| 360 # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. | 410 # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. |
| 361 # | 411 # |
| 362 # Usage: | 412 # Usage: |
| 363 # normalize.quantiles(x,copy=TRUE, keep.names=FALSE) | 413 # normalize.quantiles(x, copy = TRUE, keep.names = FALSE) |
| 364 # | 414 # |
| 365 # Arguments: | 415 # Arguments: |
| 366 # | 416 # |
| 367 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | 417 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. |
| 368 # | 418 # |
| 395 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | 445 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of |
| 396 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | 446 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias |
| 397 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | 447 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 |
| 398 # http://bmbolstad.com/misc/normalize/normalize.html | 448 # http://bmbolstad.com/misc/normalize/normalize.html |
| 399 # ... | 449 # ... |
| 450 --> | |
| 451 ```{r echo = FALSE} | |
| 452 library(preprocessCore) | |
| 400 | 453 |
| 401 if (TRUE) { | 454 if (TRUE) { |
| 402 quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) | 455 quant_data_imp_qn <- normalize.quantiles(as.matrix(quant_data_imp)) |
| 403 } else { | 456 } else { |
| 404 quantDataImputed.qn <- as.matrix(quantDataImputed) | 457 quant_data_imp_qn <- as.matrix(quant_data_imp) |
| 405 } | 458 } |
| 406 | 459 |
| 407 quantDataImputed.qn = as.data.frame(quantDataImputed.qn) | 460 quant_data_imp_qn <- as.data.frame(quant_data_imp_qn) |
| 408 names(quantDataImputed.qn) = names(quantDataImputed) | 461 names(quant_data_imp_qn) <- names(quant_data_imp) |
| 409 quantDataImputed_QN_log <- log10(quantDataImputed.qn) | 462 quant_data_imp_qn_log <- log10(quant_data_imp_qn) |
| 410 | 463 |
| 411 rownames(quantDataImputed_QN_log) <- fullData[,1] | 464 rownames(quant_data_imp_qn_log) <- full_data[, 1] |
| 412 | 465 |
| 413 quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn)))) | 466 quant_data_imp_qn_ls <- t(scale(t(log10(quant_data_imp_qn)))) |
| 414 anyNaN <- function (x) { | 467 any_nan <- function(x) { |
| 415 !any(x == "NaN") | 468 !any(x == "NaN") |
| 416 } | 469 } |
| 417 sel = apply(quantDataImputed.qn.LS, 1, anyNaN) | 470 sel <- apply(quant_data_imp_qn_ls, 1, any_nan) |
| 418 quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),] | 471 quant_data_imp_qn_ls2 <- quant_data_imp_qn_ls[which(sel), ] |
| 419 quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2) | 472 quant_data_imp_qn_ls2 <- as.data.frame(quant_data_imp_qn_ls2) |
| 420 | 473 |
| 421 #output quantile normalized data | 474 #output quantile normalized data |
| 422 dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log) | 475 data_table_imp_qn_lt <- cbind(full_data[1:9], quant_data_imp_qn_log) |
| 423 write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE) | 476 write.table( |
| 477 data_table_imp_qn_lt, | |
| 478 file = paste(paste( | |
| 479 strsplit(imputed_data_filename, ".txt"), "QN_LT", sep = "_" | |
| 480 ), ".txt", sep = ""), | |
| 481 sep = "\t", | |
| 482 col.names = TRUE, | |
| 483 row.names = FALSE | |
| 484 ) | |
| 424 | 485 |
| 425 ``` | 486 ``` |
| 426 | 487 |
| 427 <!-- ACE insertion begin --> | 488 <!-- ACE insertion begin --> |
| 428 ### Checking that normalized, imputed, log-transformed sample distributions are similar: | 489 ### Checking that normalized, imputed, log-transformed sample distributions are similar: |
| 429 | 490 |
| 430 ```{r echo=FALSE} | 491 ```{r echo = FALSE, fig.dim = c(9, 5.5), results = 'asis'} |
| 431 #library(data.table) | 492 |
| 432 | 493 |
| 433 #Save unimputed quantDataLog for plotting below | 494 # Save unimputed quant_data_log for plotting below |
| 434 unimputedQuantDataLog <- quantDataLog | 495 unimputed_quant_data_log <- quant_data_log |
| 435 | 496 |
| 436 #Log10 transform (after preparing for zero values, which should never happen...) | 497 # log10 transform (after preparing for zero values, |
| 437 quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001 | 498 # which should never happen...) |
| 438 quantDataLog <- log10(quantDataImputed.qn) | 499 quant_data_imp_qn[quant_data_imp_qn == 0] <- .000000001 |
| 439 | 500 quant_data_log <- log10(quant_data_imp_qn) |
| 440 summary(quantDataLog) | 501 |
| 441 | 502 # Output quantile-normalized log-transformed dataset |
| 442 #Output quantile-normalized log-transformed dataset with imputed, normalized data | 503 # with imputed, normalized data |
| 443 | 504 |
| 444 dataTableImputed <- cbind(fullData[1:9], quantDataLog) | 505 data_table_imputed <- cbind(full_data[1:9], quant_data_log) |
| 445 write.table( | 506 write.table( |
| 446 dataTableImputed | 507 data_table_imputed |
| 447 , file=imputedDataFilename | 508 , file = imputed_data_filename |
| 448 , sep="\t" | 509 , sep = "\t" |
| 449 , col.names=TRUE | 510 , col.names = TRUE |
| 450 , row.names=FALSE | 511 , row.names = FALSE |
| 451 , quote=FALSE | 512 , quote = FALSE |
| 452 ) | 513 ) |
| 453 | 514 |
| 454 | 515 |
| 455 | 516 |
| 456 #data visualization | 517 # data visualization |
| 457 old_par <- par( | 518 old_par <- par( |
| 458 mai=par("mai") + c(0.5,0,0,0) | 519 mai = par("mai") + c(0.5, 0, 0, 0) |
| 459 , oma=par("oma") + c(0.5,0,0,0) | 520 , oma = par("oma") + c(0.5, 0, 0, 0) |
| 460 ) | 521 ) |
| 461 boxplot( | 522 boxplot( |
| 462 quantDataLog | 523 quant_data_log |
| 463 , las=2 | 524 , las = 2 |
| 464 ) | 525 ) |
| 465 par(old_par) | 526 par(old_par) |
| 466 ``` | 527 |
| 467 | 528 |
| 468 ```{r echo=FALSE, fig.dim=c(9,5)} | 529 |
| 469 quantDataLog_stack <- stack(quantDataLog) | 530 cat("\\newline\n") |
| 470 ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) | 531 cat("\\newline\n") |
| 532 | |
| 533 ``` | |
| 534 | |
| 535 ```{r echo = FALSE, fig.align = "left", fig.dim = c(9, 4)} | |
| 536 quant_data_log_stack <- stack(quant_data_log) | |
| 537 ggplot( | |
| 538 quant_data_log_stack, | |
| 539 aes(x = values) | |
| 540 ) + geom_density(aes(group = ind, colour = ind)) | |
| 471 ``` | 541 ``` |
| 472 | 542 |
| 473 ## Perform ANOVA filters | 543 ## Perform ANOVA filters |
| 474 | 544 |
| 475 ```{r,echo=FALSE} | 545 (see following pages) |
| 476 #Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df) | 546 |
| 547 ```{r, echo = FALSE} | |
| 548 # Make new data frame containing only Phosphopeptides | |
| 549 # to connect preANOVA to ANOVA (connect_df) | |
| 477 connect_df <- data.frame( | 550 connect_df <- data.frame( |
| 478 dataTableImputed_QN_LT$Phosphopeptide | 551 data_table_imp_qn_lt$Phosphopeptide |
| 479 , dataTableImputed_QN_LT[,firstDataColumn] | 552 , data_table_imp_qn_lt[, first_data_column] |
| 480 ) | 553 ) |
| 481 colnames(connect_df) <- c("Phosphopeptide","Intensity") | 554 colnames(connect_df) <- c("Phosphopeptide", "Intensity") |
| 482 ``` | 555 ``` |
| 483 | 556 |
| 484 ```{r echo=FALSE, fig.dim=c(9,10)} | 557 ```{r echo = FALSE, fig.dim = c(9, 10), results = 'asis'} |
| 485 # Get factors -> group replicates (as indicated by terminal letter) by the preceding digits | 558 # Get factors -> group replicates (as indicated by terminal letter) |
| 486 # For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. | 559 # by the preceding digits; |
| 487 m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE) | 560 # e.g., group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. |
| 488 #ACE str(m) | 561 m <- |
| 489 tempMatches <- regmatches(names(quantDataImputed_QN_log), m) | 562 regexpr(regex_sample_names, names(quant_data_imp_qn_log), perl = TRUE) |
| 490 #ACE str(tempMatches) | 563 |
| 491 numSamples <- length(tempMatches) | 564 temp_matches <- regmatches(names(quant_data_imp_qn_log), m) |
| 492 #ACE str(numSamples) | 565 |
| 493 m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) | 566 number_of_samples <- length(temp_matches) |
| 494 #ACE str(m2) | 567 |
| 495 #ACE str(regmatches(tempMatches, m2)) | 568 m2 <- regexpr(regex_sample_grouping, temp_matches, perl = TRUE) |
| 496 sampleNumbers <- as.factor(regmatches(tempMatches, m2)) | 569 |
| 497 #ACE str(sampleNumbers) | 570 |
| 498 | 571 sample_factor_levels <- as.factor(regmatches(temp_matches, m2)) |
| 499 if (length(levels(sampleNumbers))<2) { | 572 |
| 500 cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n") | 573 |
| 574 if (length(levels(sample_factor_levels)) < 2) { | |
| 575 cat( | |
| 576 "ERROR!!!! Cannot perform ANOVA analysis", | |
| 577 "because it requires two or more factor levels\n" | |
| 578 ) | |
| 501 cat("Unparsed sample names are:\n") | 579 cat("Unparsed sample names are:\n") |
| 502 print(names(quantDataImputed_QN_log)) | 580 print(names(quant_data_imp_qn_log)) |
| 503 cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames)) | 581 cat(sprintf("Parsing rule for SampleNames is '%s'\n", regex_sample_names)) |
| 504 cat("Parsed names are:\n") | 582 cat("Parsed names are:\n") |
| 505 print(tempMatches) | 583 print(temp_matches) |
| 506 cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping)) | 584 cat(sprintf( |
| 585 "Parsing rule for SampleGrouping is '%s'\n", | |
| 586 regex_sample_grouping | |
| 587 )) | |
| 507 cat("Sample group assignments are:\n") | 588 cat("Sample group assignments are:\n") |
| 508 print(regmatches(tempMatches, m2)) | 589 print(regmatches(temp_matches, m2)) |
| 509 } else { | 590 } else { |
| 510 pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers) | 591 p_value_data_anova_ps <- |
| 511 | 592 apply( |
| 512 pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr") | 593 quant_data_imp_qn_log, |
| 513 pValueData <- data.frame( | 594 1, |
| 514 phosphopeptide = fullData[,1] | 595 anova_func, |
| 515 , rawANOVAp = pValueData.anovaPs | 596 grouping_factor = sample_factor_levels |
| 516 , FDRadjustedANOVAp = pValueData.anovaPs.FDR | 597 ) |
| 598 | |
| 599 p_value_data_anova_ps_fdr <- | |
| 600 p.adjust(p_value_data_anova_ps, method = "fdr") | |
| 601 p_value_data <- data.frame( | |
| 602 phosphopeptide = full_data[, 1] | |
| 603 , | |
| 604 raw_anova_p = p_value_data_anova_ps | |
| 605 , | |
| 606 fdr_adjusted_anova_p = p_value_data_anova_ps_fdr | |
| 517 ) | 607 ) |
| 518 #ACE rownames(pValueData) <- fullData[,1] | 608 |
| 519 # output ANOVA file to constructed filename, | 609 # output ANOVA file to constructed filename, |
| 520 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | 610 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" |
| 521 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | 611 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" |
| 522 | 612 |
| 523 #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values | 613 # Re-output quantile-normalized log-transformed dataset |
| 524 | 614 # with imputed, normalized data to include p-values |
| 525 dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog) | 615 |
| 616 data_table_imputed <- | |
| 617 cbind(full_data[1:9], p_value_data[, 2:3], quant_data_log) | |
| 526 write.table( | 618 write.table( |
| 527 dataTableImputed | 619 data_table_imputed, |
| 528 , file=imputedDataFilename | 620 file = imputed_data_filename, |
| 529 , sep="\t" | 621 sep = "\t", |
| 530 , col.names=TRUE | 622 col.names = TRUE, |
| 531 , row.names=FALSE | 623 row.names = FALSE, |
| 532 , quote=FALSE | 624 quote = FALSE |
| 533 ) | 625 ) |
| 534 | 626 |
| 535 | 627 |
| 536 pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),] | 628 p_value_data <- |
| 537 | 629 p_value_data[order(p_value_data$fdr_adjusted_anova_p), ] |
| 538 cutoff <- valFDR[1] | 630 |
| 539 for (cutoff in valFDR){ #loop through FDR cutoffs | 631 cutoff <- val_fdr[1] |
| 540 | 632 for (cutoff in val_fdr) { |
| 541 filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE] | 633 #loop through FDR cutoffs |
| 542 filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE] | 634 |
| 543 filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE] | 635 filtered_p <- |
| 636 p_value_data[ | |
| 637 which(p_value_data$fdr_adjusted_anova_p < cutoff), | |
| 638 , | |
| 639 drop = FALSE | |
| 640 ] | |
| 641 filtered_data_filtered <- | |
| 642 quant_data_imp_qn_log[ | |
| 643 rownames(filtered_p), | |
| 644 , | |
| 645 drop = FALSE | |
| 646 ] | |
| 647 filtered_data_filtered <- | |
| 648 filtered_data_filtered[ | |
| 649 order(filtered_p$fdr_adjusted_anova_p), | |
| 650 , | |
| 651 drop = FALSE | |
| 652 ] | |
| 544 | 653 |
| 545 # <!-- ACE insertion start --> | 654 # <!-- ACE insertion start --> |
| 546 old_oma <- par("oma") | 655 old_oma <- par("oma") |
| 547 old_par <- par( | 656 old_par <- par( |
| 548 mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1) | 657 mai = (par("mai") + c(0.7, 0, 0, 0)) * c(1, 1, 0.3, 1), |
| 549 , oma=old_oma * c(1,1,0.3,1) | 658 oma = old_oma * c(1, 1, 0.3, 1), |
| 550 , cex.main=0.9 | 659 cex.main = 0.9, |
| 551 , cex.axis=0.7 | 660 cex.axis = 0.7 |
| 552 ) | 661 ) |
| 553 | 662 |
| 554 if (nrow(filteredData.filtered) > 0) { | 663 cat("\\newpage\n") |
| 664 if (nrow(filtered_data_filtered) > 0) { | |
| 665 cat(sprintf( | |
| 666 "Intensities for peptides whose adjusted p-value < %0.2f\n", | |
| 667 cutoff | |
| 668 )) | |
| 669 cat("\\newline\n") | |
| 670 cat("\\newline\n") | |
| 671 | |
| 555 boxplot( | 672 boxplot( |
| 556 filteredData.filtered | 673 filtered_data_filtered, |
| 557 , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff) | 674 main = "Imputed, normalized intensities", # no line plot |
| 558 # no line plot , main = "" | 675 las = 2, |
| 559 , las = 2 | 676 ylab = expression(log[10](intensity)) |
| 560 # , ylim = c(5.5,10) | |
| 561 , ylab = expression(log[10](intensity)) | |
| 562 ) | 677 ) |
| 563 } else { | 678 } else { |
| 564 cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff)) | 679 cat(sprintf( |
| 680 "No peptides were found to have cutoff adjusted p-value < %0.2f\n", | |
| 681 cutoff | |
| 682 )) | |
| 565 } | 683 } |
| 566 par(old_par) | 684 par(old_par) |
| 567 | 685 |
| 568 #Add Phosphopeptide column to ANOVA filtered table | 686 if (nrow(filtered_data_filtered) > 0) { |
| 569 ANOVA.filtered_merge <- merge( | 687 #Add Phosphopeptide column to anova_filtered table |
| 688 anova_filtered_merge <- merge( | |
| 570 x = connect_df | 689 x = connect_df |
| 571 , y = filteredData.filtered | 690 , |
| 572 , by.x="Intensity" | 691 y = filtered_data_filtered |
| 573 , by.y=1 | 692 , |
| 693 by.x = "Intensity" | |
| 694 , | |
| 695 by.y = 1 | |
| 574 ) | 696 ) |
| 575 ANOVA.filtered_merge.order <- rownames(filtered_p) | 697 anova_filtered_merge_order <- rownames(filtered_p) |
| 576 | 698 |
| 577 ANOVA.filtered_merge.format <- sapply( | 699 anova_filtered_merge_format <- sapply( |
| 578 X = filtered_p$FDRadjustedANOVAp | 700 X = filtered_p$fdr_adjusted_anova_p |
| 579 , FUN = function(x) { | 701 , |
| 580 if (x > 0.0001) | 702 FUN = function(x) { |
| 581 paste0("(%0.",1+ceiling(-log10(x)),"f) %s") | 703 if (x > 0.0001) |
| 582 else | 704 paste0("(%0.", 1 + ceiling(-log10(x)), "f) %s") |
| 583 paste0("(%0.4e) %s") | 705 else |
| 706 paste0("(%0.4e) %s") | |
| 584 } | 707 } |
| 585 ) | |
| 586 | |
| 587 #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s") | |
| 588 | |
| 589 ANOVA.filtered <- data.table( | |
| 590 ANOVA.filtered_merge$Phosphopeptide | |
| 591 , ANOVA.filtered_merge$Intensity | |
| 592 , ANOVA.filtered_merge[, 2:numSamples+1] | |
| 593 ) | 708 ) |
| 594 colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered)) | 709 |
| 595 | 710 |
| 596 # merge qualitative columns into the ANOVA data | 711 |
| 597 output_table <- data.frame(ANOVA.filtered$Phosphopeptide) | 712 anova_filtered <- data.table( |
| 598 output_table <- merge( | 713 anova_filtered_merge$Phosphopeptide |
| 714 , | |
| 715 anova_filtered_merge$Intensity | |
| 716 , | |
| 717 anova_filtered_merge[, 2:number_of_samples + 1] | |
| 718 ) | |
| 719 colnames(anova_filtered) <- | |
| 720 c("Phosphopeptide", colnames(filtered_data_filtered)) | |
| 721 | |
| 722 # merge qualitative columns into the ANOVA data | |
| 723 output_table <- data.frame(anova_filtered$Phosphopeptide) | |
| 724 output_table <- merge( | |
| 599 x = output_table | 725 x = output_table |
| 600 , y = dataTableImputed_QN_LT | 726 , |
| 601 , by.x = "ANOVA.filtered.Phosphopeptide" | 727 y = data_table_imp_qn_lt |
| 602 , by.y="Phosphopeptide" | 728 , |
| 729 by.x = "anova_filtered.Phosphopeptide" | |
| 730 , | |
| 731 by.y = "Phosphopeptide" | |
| 603 ) | 732 ) |
| 604 | 733 |
| 605 #Produce heatmap to visualize significance and the effect of imputation | 734 #Produce heatmap to visualize significance and the effect of imputation |
| 606 m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,]) | 735 m <- |
| 607 if (nrow(m) > 0) { | 736 as.matrix(unimputed_quant_data_log[anova_filtered_merge_order, ]) |
| 608 rownames_m <- rownames(m) | 737 if (nrow(m) > 0) { |
| 609 rownames(m) <- sapply( | 738 rownames_m <- rownames(m) |
| 610 X = 1:nrow(m) | 739 rownames(m) <- sapply( |
| 611 , FUN = function(i) { | 740 X = seq_len(nrow(m)) |
| 741 , | |
| 742 FUN = function(i) { | |
| 612 sprintf( | 743 sprintf( |
| 613 ANOVA.filtered_merge.format[i] | 744 anova_filtered_merge_format[i] |
| 614 , filtered_p$FDRadjustedANOVAp[i] | 745 , |
| 615 , rownames_m[i] | 746 filtered_p$fdr_adjusted_anova_p[i] |
| 747 , | |
| 748 rownames_m[i] | |
| 616 ) | 749 ) |
| 617 } | 750 } |
| 618 ) | 751 ) |
| 619 margins <- c( | 752 margins <- c(max(nchar(colnames(m))) * 10 / 16 # col |
| 620 max(nchar(colnames(m))) * 10 / 16 # col | 753 , max(nchar(rownames(m))) * 5 / 16 # row |
| 621 , max(nchar(rownames(m))) * 5 / 16 # row | 754 ) |
| 622 ) | 755 how_many_peptides <- min(50, nrow(m)) |
| 623 how_many_peptides <- min(50, nrow(m)) | 756 |
| 624 | 757 cat("\\newpage\n") |
| 625 op <- par("cex.main") | 758 if (nrow(m) > 50) { |
| 626 try( | 759 cat("Heatmap for the 50 most-significant peptides", |
| 627 if (nrow(m) > 1) { | 760 sprintf( |
| 628 par(cex.main=0.6) | 761 "whose adjusted p-value < %0.2f\n", |
| 629 heatmap( | 762 cutoff) |
| 630 m[how_many_peptides:1,] | 763 ) |
| 631 , Rowv = NA | 764 } else { |
| 632 , Colv = NA | 765 cat("Heatmap for peptides whose", |
| 633 , cexRow = 0.7 | 766 sprintf("adjusted p-value < %0.2f\n", |
| 634 , cexCol = 0.8 | 767 cutoff) |
| 635 , scale="row" | 768 ) |
| 636 , margins = margins | 769 } |
| 637 , main = "Heatmap of unimputed, unnormalized intensities" | 770 cat("\\newline\n") |
| 638 , xlab = "" | 771 cat("\\newline\n") |
| 639 # , main = bquote( | 772 op <- par("cex.main") |
| 640 # .( how_many_peptides ) | 773 try( |
| 641 # ~ " peptides with adjusted p-value <" | 774 if (nrow(m) > 1) { |
| 642 # ~ .(sprintf("%0.2f", cutoff)) | 775 par(cex.main = 0.6) |
| 643 # ) | 776 heatmap( |
| 644 ) | 777 m[how_many_peptides:1, ], |
| 645 } | 778 Rowv = NA, |
| 646 ) | 779 Colv = NA, |
| 647 #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim) | 780 cexRow = 0.7, |
| 648 par(op) | 781 cexCol = 0.8, |
| 782 scale = "row", | |
| 783 margins = margins, | |
| 784 main = | |
| 785 "Heatmap of unimputed, unnormalized intensities", | |
| 786 xlab = "" | |
| 787 ) | |
| 788 } | |
| 789 ) | |
| 790 par(op) | |
| 791 } | |
| 649 } | 792 } |
| 650 | |
| 651 } | 793 } |
| 652 } | 794 } |
| 653 ``` | 795 ``` |
| 654 | 796 |
| 797 <!-- | |
| 655 ## Peptide IDs, etc. | 798 ## Peptide IDs, etc. |
| 656 | 799 |
| 657 See output files. | 800 See output files. |
| 801 --> |
