Mercurial > repos > eschen42 > mqppep_anova
comparison mqppep_anova_script.Rmd @ 0:c1403d18c189 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/mqppep commit bb6c941be50db4c0719efdeaa904d7cb7aa1d182"
author | eschen42 |
---|---|
date | Mon, 07 Mar 2022 19:05:01 +0000 |
parents | |
children | d728198f1ba5 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c1403d18c189 |
---|---|
1 --- | |
2 title: "Quant Data Processing Script" | |
3 author: "Larry Cheng; Art Eschenlauer" | |
4 date: "May 28, 2018; Nov 16, 2021" | |
5 output: | |
6 html_document: default | |
7 pdf_document: default | |
8 params: | |
9 inputFile: "Upstream_Map_pST_outputfile_STEP4.txt" | |
10 alphaFile: "alpha_levels.txt" | |
11 firstDataColumn: "Intensity" | |
12 imputationMethod: !r c("group-median","median","mean","random")[4] | |
13 meanPercentile: 1 | |
14 sdPercentile: 0.2 | |
15 regexSampleNames: "\\.(\\d+)[A-Z]$" | |
16 regexSampleGrouping: "(\\d+)" | |
17 imputedDataFilename: "Upstream_Map_pST_outputfile_STEP4_QN_LT.txt" | |
18 --- | |
19 ```{r setup, include=FALSE} | |
20 # ref for parameterizing Rmd document: https://stackoverflow.com/a/37940285 | |
21 knitr::opts_chunk$set(echo = FALSE, fig.dim=c(9,10)) | |
22 ``` | |
23 | |
24 ## Purpose: | |
25 Perform imputation of missing values, quantile normalization, and ANOVA. | |
26 | |
27 <!-- | |
28 ## Variables to change for each input file | |
29 --> | |
30 ```{r include = FALSE} | |
31 #Input Filename | |
32 inputFile <- params$inputFile | |
33 | |
34 #First data column - ideally, this could be detected via regexSampleNames, but for now leave it as is. | |
35 firstDataColumn <- params$firstDataColumn | |
36 FDC_is_integer <- TRUE | |
37 firstDataColumn <- withCallingHandlers( | |
38 as.integer(firstDataColumn) | |
39 , warning = function(w) FDC_is_integer <<- FALSE | |
40 ) | |
41 if (FALSE == FDC_is_integer) { | |
42 firstDataColumn <- params$firstDataColumn | |
43 } | |
44 | |
45 #False discovery rate adjustment for ANOVA (Since pY abundance is low, set to 0.10 and 0.20 in addition to 0.05) | |
46 valFDR <- read.table(file = params$alphaFile, sep = "\t", header=F, quote="")[,1] | |
47 | |
48 #Imputed Data filename | |
49 imputedDataFilename <- params$imputedDataFilename | |
50 | |
51 #ANOVA data filename | |
52 ``` | |
53 | |
54 ```{r include = FALSE} | |
55 #Imputation method, should be one of c("random","group-median","median","mean") | |
56 imputationMethod <- params$imputationMethod | |
57 | |
58 #Selection of percentile of logvalue data to set the mean for random number generation when using random imputation | |
59 meanPercentile <- params$meanPercentile / 100.0 | |
60 | |
61 #deviation adjustment-factor for random values; real number. | |
62 sdPercentile <- params$sdPercentile | |
63 | |
64 #Regular expression of Sample Names, e.g., "\\.(\\d+)[A-Z]$" | |
65 regexSampleNames <- params$regexSampleNames | |
66 | |
67 #Regular expression to extract Sample Grouping from Sample Name (if error occurs, compare sampleNumbers and tempMatches to see if groupings/pairs line up) | |
68 # e.g., "(\\d+)" | |
69 regexSampleGrouping <- params$regexSampleGrouping | |
70 | |
71 ``` | |
72 | |
73 | |
74 ```{r include = FALSE} | |
75 ### FUNCTIONS | |
76 | |
77 #ANOVA filter function | |
78 anovaFunc <- function(x, groupingFactor) { | |
79 x.aov = aov(as.numeric(x) ~ groupingFactor) | |
80 pvalue = summary(x.aov)[[1]][["Pr(>F)"]][1] | |
81 pvalue | |
82 } | |
83 ``` | |
84 | |
85 | |
86 | |
87 ### Checking that log-transformed sample distributions are similar: | |
88 ```{r echo=FALSE} | |
89 | |
90 library(data.table) | |
91 | |
92 # read.table reads a file in table format and creates a data frame from it. | |
93 # - note that `quote=""` means that quotation marks are treated literally. | |
94 fullData <- read.table(file = inputFile, sep = "\t", header=T, quote="", check.names=FALSE) | |
95 print(colnames(fullData)) | |
96 #head(fullData) | |
97 | |
98 if (FALSE == FDC_is_integer) { | |
99 dataColumnIndices <- grep(firstDataColumn, names(fullData), perl=TRUE) | |
100 str(dataColumnIndices) | |
101 if (length(dataColumnIndices) > 0) { | |
102 firstDataColumn <- dataColumnIndices[1] | |
103 } else { | |
104 stop(paste("failed to convert firstDataColumn:", firstDataColumn)) | |
105 } | |
106 } | |
107 | |
108 quantData0 <- fullData[firstDataColumn:length(fullData)] | |
109 quantData <- fullData[firstDataColumn:length(fullData)] | |
110 quantData[quantData==0] <- NA #replace 0 with NA | |
111 quantDataLog <- log10(quantData) | |
112 | |
113 rownames(quantDataLog) <- fullData$Phosphopeptide | |
114 | |
115 summary(quantDataLog) | |
116 | |
117 #data visualization | |
118 old_par <- par( | |
119 mai=par("mai") + c(0.5,0,0,0) | |
120 ) | |
121 boxplot( | |
122 quantDataLog | |
123 , las=2 | |
124 ) | |
125 par(old_par) | |
126 | |
127 quantDataLog_stack <- stack(quantDataLog) | |
128 ``` | |
129 | |
130 ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} | |
131 library(ggplot2) | |
132 ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) | |
133 ``` | |
134 | |
135 ### Globally, are phosphopeptide intensities are approximately unimodal? | |
136 ```{r echo = FALSE,fig.align="left", fig.dim=c(9,5)} | |
137 | |
138 # ref for bquote particularly and plotting math expressions generally: | |
139 # https://www.r-bloggers.com/2018/03/math-notation-for-r-plot-titles-expression-and-bquote/ | |
140 | |
141 #identify the location of missing values | |
142 fin <- is.finite(as.numeric(as.matrix(quantDataLog))) | |
143 | |
144 logvalues <- as.numeric(as.matrix(quantDataLog))[fin] | |
145 plot( | |
146 density(logvalues) | |
147 , main = bquote("Smoothed estimated probability density vs." ~ log[10](intensity)) | |
148 , xlab = bquote(log[10](intensity)) | |
149 ) | |
150 hist( | |
151 x = as.numeric(as.matrix(quantDataLog)) | |
152 , breaks = 100 | |
153 , main = bquote("Frequency vs." ~ log[10](intensity)) | |
154 , xlab = bquote(log[10](intensity)) | |
155 ) | |
156 ``` | |
157 | |
158 <!-- | |
159 ## Impute missing values | |
160 --> | |
161 | |
162 ### Distribution of standard deviations of phosphopeptides, ignoring missing values: | |
163 | |
164 ```{r echo = FALSE, fig.align="left", fig.dim=c(9,5)} | |
165 #determine quantile | |
166 q1 <- quantile(logvalues, probs = meanPercentile)[1] | |
167 | |
168 #determine standard deviation of quantile to impute | |
169 sd_finite <- function(x) { | |
170 ok <- is.finite(x) | |
171 sd(x[ok]) * sdPercentile | |
172 } | |
173 sds <- apply(quantDataLog, 1, sd_finite) # 1 = row of matrix (ie, phosphopeptide) | |
174 plot( | |
175 density(sds, na.rm=T) | |
176 , main="Smoothed estimated probability density vs. std. deviation" | |
177 , sub="(probability estimation made with Gaussian smoothing)" | |
178 ) | |
179 | |
180 m1 <- median(sds, na.rm=T) #sd to be used is the median sd | |
181 | |
182 ``` | |
183 | |
184 | |
185 | |
186 <!-- | |
187 The number of missing values are: | |
188 --> | |
189 ```{r echo=FALSE} | |
190 #Determine number of cells to impute | |
191 temp <- quantData[is.na(quantData)] | |
192 | |
193 #Determine number of values to impute | |
194 NoToImpute <- length(temp) | |
195 ``` | |
196 | |
197 <!-- | |
198 % of values that are missing: | |
199 --> | |
200 ```{r echo=FALSE} | |
201 pct_missing_values <- length(temp)/(length(logvalues)+length(temp)) * 100 | |
202 ``` | |
203 | |
204 <!-- | |
205 First few rows of data before imputation: | |
206 --> | |
207 ## Impute missing values | |
208 ```{r echo = FALSE} | |
209 | |
210 #ACE start segment: trt-median based imputation | |
211 # prep for trt-median based imputation | |
212 | |
213 # Assuming that regexSampleNames <- "\\.(\\d+)[A-Z]$" | |
214 # get factors -> group runs (samples) by ignoring terminal [A-Z] in sample names | |
215 # regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) | |
216 m <- regexpr(regexSampleNames, names(quantData), perl=TRUE) | |
217 tempMatches <- regmatches(names(quantData), m) | |
218 print("Extracted sample names") | |
219 print(tempMatches) | |
220 m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) | |
221 sampleNumbers <- as.factor(regmatches(tempMatches, m2)) | |
222 print("Factor levels") | |
223 print(sampleNumbers) | |
224 | |
225 ``` | |
226 ```{r echo = FALSE} | |
227 | |
228 #ACE hack begin | |
229 #Determine number of cells to impute | |
230 cat( | |
231 sprintf("Before imputation, there are:\n %d peptides\n %d missing values (%2.0f%s)" | |
232 , sum(rep.int(TRUE, nrow(quantData))) | |
233 , sum(is.na(quantData)) | |
234 , pct_missing_values | |
235 , "%" | |
236 ) | |
237 ) | |
238 #ACE hack end | |
239 | |
240 ``` | |
241 ```{r echo = FALSE} | |
242 | |
243 #Impute data | |
244 quantDataImputed <- quantData | |
245 | |
246 # Identify which values are missing and need to be imputed | |
247 ind <- which(is.na(quantDataImputed), arr.ind=TRUE) | |
248 | |
249 ``` | |
250 ```{r echo = FALSE} | |
251 | |
252 # Apply imputation | |
253 switch( | |
254 imputationMethod | |
255 , "group-median"={ | |
256 cat("Imputation method: substitute missing value with median peptide-intensity for sample-group\n") | |
257 #goodRows <- rep.int(TRUE, nrow(quantDataImputed)) | |
258 sampleLevelIntegers <- as.integer(sampleNumbers) | |
259 for (i in 1:length(levels(sampleNumbers))) { | |
260 levelCols <- i == sampleLevelIntegers | |
261 ind <- which(is.na(quantDataImputed[,levelCols]), arr.ind=TRUE) | |
262 quantDataImputed[ind,levelCols] <- apply(quantDataImputed[,levelCols], 1, median, na.rm=T)[ind[,1]] | |
263 } | |
264 goodRows <- !is.na(rowMeans(quantDataImputed)) | |
265 } | |
266 , "median"={ | |
267 cat("Imputation method: substitute missing value with median peptide-intensity across all sample classes\n") | |
268 quantDataImputed[ind] <- apply(quantDataImputed, 1, median, na.rm=T)[ind[,1]] | |
269 goodRows <- !is.na(rowMeans(quantDataImputed)) | |
270 } | |
271 , "mean"={ | |
272 cat("Imputation method: substitute missing value with mean peptide-intensity across all sample classes\n") | |
273 quantDataImputed[ind] <- apply(quantDataImputed, 1, mean, na.rm=T)[ind[,1]] | |
274 goodRows <- !is.na(rowMeans(quantDataImputed)) | |
275 } | |
276 , "random"={ | |
277 cat( | |
278 sprintf( | |
279 "Imputation method: substitute missing value with random intensity N ~ (%0.2f, %0.2f)\n" | |
280 , q1, m1 | |
281 ) | |
282 ) | |
283 quantDataImputed[is.na(quantDataImputed)] <- 10^rnorm(NoToImpute, mean= q1, sd = m1) | |
284 goodRows <- !is.na(rowMeans(quantDataImputed)) | |
285 } | |
286 ) | |
287 | |
288 ``` | |
289 ```{r echo = FALSE} | |
290 | |
291 #Determine number of cells to impute | |
292 temp <- quantDataImputed[is.na(quantDataImputed)] | |
293 cat( | |
294 sprintf( | |
295 "After imputation, there are:\n %d missing values\n %d usable peptides\n %d peptides with too many missing values for further analysis" | |
296 , sum(is.na(quantDataImputed[goodRows,])) | |
297 , sum(goodRows) | |
298 , sum(!goodRows) | |
299 ) | |
300 ) | |
301 ``` | |
302 ```{r echo = FALSE} | |
303 | |
304 | |
305 # Zap rows where imputation was ineffective | |
306 fullData <- fullData [goodRows, ] | |
307 quantData <- quantData [goodRows, ] | |
308 quantDataImputed <- quantDataImputed[goodRows, ] | |
309 | |
310 ``` | |
311 ```{r echo = FALSE} | |
312 | |
313 d_combined <- (density(as.numeric(as.matrix(log10(quantDataImputed))))) | |
314 d_original <- density(as.numeric(as.matrix(log10(quantDataImputed[!is.na(quantData)])))) | |
315 | |
316 ``` | |
317 ```{r echo = FALSE} | |
318 | |
319 if (sum(is.na(quantData)) > 0) { | |
320 # There ARE missing values | |
321 d_imputed <- (density(as.numeric(as.matrix(log10(quantDataImputed[is.na(quantData)]))))) | |
322 } else { | |
323 # There are NO missing values | |
324 d_imputed <- d_combined | |
325 } | |
326 | |
327 ``` | |
328 | |
329 <!-- ```{r echo = FALSE, fig.cap = "Blue = Data before imputation; Red = Imputed data"} --> | |
330 ```{r echo = FALSE, fig.dim=c(9,5)} | |
331 ylim <- c(0, max(d_combined$y, d_original$y, d_imputed$y)) | |
332 plot( | |
333 d_combined | |
334 , ylim = ylim | |
335 , sub = "Blue = data before imputation; Red = imputed data" | |
336 , main = "Density vs. log10(intensity) before and after imputation" | |
337 ) | |
338 lines(d_original, col="blue") | |
339 lines(d_imputed, col="red") | |
340 ``` | |
341 | |
342 ## Perform Quantile Normalization | |
343 ```{r echo=FALSE} | |
344 library(preprocessCore) | |
345 # Apply quantile normalization using preprocessCore::normalize.quantiles | |
346 # --- | |
347 # tool repository: http://bioconductor.org/packages/release/bioc/html/preprocessCore.html | |
348 # except this: https://support.bioconductor.org/p/122925/#9135989 | |
349 # says to install it like this: | |
350 # ``` | |
351 # BiocManager::install("preprocessCore", configure.args="--disable-threading", force = TRUE,lib=.libPaths()[1]) | |
352 # ``` | |
353 # conda installation (necessary because of a bug in recent openblas): | |
354 # conda install bioconductor-preprocesscore openblas=0.3.3 | |
355 # ... | |
356 # --- | |
357 # normalize.quantiles {preprocessCore} -- Quantile Normalization | |
358 # | |
359 # Description: | |
360 # Using a normalization based upon quantiles, this function normalizes a matrix of probe level intensities. | |
361 # | |
362 # Usage: | |
363 # normalize.quantiles(x,copy=TRUE, keep.names=FALSE) | |
364 # | |
365 # Arguments: | |
366 # | |
367 # - x: A matrix of intensities where each column corresponds to a chip and each row is a probe. | |
368 # | |
369 # - copy: Make a copy of matrix before normalizing. Usually safer to work with a copy, | |
370 # but in certain situations not making a copy of the matrix, but instead normalizing | |
371 # it in place will be more memory friendly. | |
372 # | |
373 # - keep.names: Boolean option to preserve matrix row and column names in output. | |
374 # | |
375 # Details: | |
376 # This method is based upon the concept of a quantile-quantile plot extended to n dimensions. | |
377 # No special allowances are made for outliers. If you make use of quantile normalization | |
378 # please cite Bolstad et al, Bioinformatics (2003). | |
379 # | |
380 # This functions will handle missing data (ie NA values), based on | |
381 # the assumption that the data is missing at random. | |
382 # | |
383 # Note that the current implementation optimizes for better memory usage | |
384 # at the cost of some additional run-time. | |
385 # | |
386 # Value: A normalized matrix. | |
387 # | |
388 # Author: Ben Bolstad, bmbolstad.com | |
389 # | |
390 # References | |
391 # | |
392 # - Bolstad, B (2001) Probe Level Quantile Normalization of High Density Oligonucleotide | |
393 # Array Data. Unpublished manuscript http://bmbolstad.com/stuff/qnorm.pdf | |
394 # | |
395 # - Bolstad, B. M., Irizarry R. A., Astrand, M, and Speed, T. P. (2003) A Comparison of | |
396 # Normalization Methods for High Density Oligonucleotide Array Data Based on Bias | |
397 # and Variance. Bioinformatics 19(2), pp 185-193. DOI 10.1093/bioinformatics/19.2.185 | |
398 # http://bmbolstad.com/misc/normalize/normalize.html | |
399 # ... | |
400 | |
401 if (TRUE) { | |
402 quantDataImputed.qn <- normalize.quantiles(as.matrix(quantDataImputed)) | |
403 } else { | |
404 quantDataImputed.qn <- as.matrix(quantDataImputed) | |
405 } | |
406 | |
407 quantDataImputed.qn = as.data.frame(quantDataImputed.qn) | |
408 names(quantDataImputed.qn) = names(quantDataImputed) | |
409 quantDataImputed_QN_log <- log10(quantDataImputed.qn) | |
410 | |
411 rownames(quantDataImputed_QN_log) <- fullData[,1] | |
412 | |
413 quantDataImputed.qn.LS = t(scale(t(log10(quantDataImputed.qn)))) | |
414 anyNaN <- function (x) { | |
415 !any(x == "NaN") | |
416 } | |
417 sel = apply(quantDataImputed.qn.LS, 1, anyNaN) | |
418 quantDataImputed.qn.LS2 <- quantDataImputed.qn.LS[which(sel),] | |
419 quantDataImputed.qn.LS2 = as.data.frame(quantDataImputed.qn.LS2) | |
420 | |
421 #output quantile normalized data | |
422 dataTableImputed_QN_LT <- cbind(fullData[1:9], quantDataImputed_QN_log) | |
423 write.table(dataTableImputed_QN_LT, file = paste(paste(strsplit(imputedDataFilename, ".txt"),"QN_LT",sep="_"),".txt",sep=""), sep = "\t", col.names=TRUE, row.names=FALSE) | |
424 | |
425 ``` | |
426 | |
427 <!-- ACE insertion begin --> | |
428 ### Checking that normalized, imputed, log-transformed sample distributions are similar: | |
429 | |
430 ```{r echo=FALSE} | |
431 #library(data.table) | |
432 | |
433 #Save unimputed quantDataLog for plotting below | |
434 unimputedQuantDataLog <- quantDataLog | |
435 | |
436 #Log10 transform (after preparing for zero values, which should never happen...) | |
437 quantDataImputed.qn[quantDataImputed.qn == 0] <- .000000001 | |
438 quantDataLog <- log10(quantDataImputed.qn) | |
439 | |
440 summary(quantDataLog) | |
441 | |
442 #Output quantile-normalized log-transformed dataset with imputed, normalized data | |
443 | |
444 dataTableImputed <- cbind(fullData[1:9], quantDataLog) | |
445 write.table( | |
446 dataTableImputed | |
447 , file=imputedDataFilename | |
448 , sep="\t" | |
449 , col.names=TRUE | |
450 , row.names=FALSE | |
451 , quote=FALSE | |
452 ) | |
453 | |
454 | |
455 | |
456 #data visualization | |
457 old_par <- par( | |
458 mai=par("mai") + c(0.5,0,0,0) | |
459 , oma=par("oma") + c(0.5,0,0,0) | |
460 ) | |
461 boxplot( | |
462 quantDataLog | |
463 , las=2 | |
464 ) | |
465 par(old_par) | |
466 ``` | |
467 | |
468 ```{r echo=FALSE, fig.dim=c(9,5)} | |
469 quantDataLog_stack <- stack(quantDataLog) | |
470 ggplot(quantDataLog_stack, aes(x=values)) + geom_density(aes(group=ind, colour=ind)) | |
471 ``` | |
472 | |
473 ## Perform ANOVA filters | |
474 | |
475 ```{r,echo=FALSE} | |
476 #Make new data frame containing only Phosphopeptides to connect preANOVA to ANOVA (connect_df) | |
477 connect_df <- data.frame( | |
478 dataTableImputed_QN_LT$Phosphopeptide | |
479 , dataTableImputed_QN_LT[,firstDataColumn] | |
480 ) | |
481 colnames(connect_df) <- c("Phosphopeptide","Intensity") | |
482 ``` | |
483 | |
484 ```{r echo=FALSE, fig.dim=c(9,10)} | |
485 # Get factors -> group replicates (as indicated by terminal letter) by the preceding digits | |
486 # For example, group .1A .1B .1C into group 1; .2A .2B .2C, into group 2; etc.. | |
487 m <- regexpr(regexSampleNames, names(quantDataImputed_QN_log), perl=TRUE) | |
488 #ACE str(m) | |
489 tempMatches <- regmatches(names(quantDataImputed_QN_log), m) | |
490 #ACE str(tempMatches) | |
491 numSamples <- length(tempMatches) | |
492 #ACE str(numSamples) | |
493 m2 <- regexpr(regexSampleGrouping, tempMatches, perl=TRUE) | |
494 #ACE str(m2) | |
495 #ACE str(regmatches(tempMatches, m2)) | |
496 sampleNumbers <- as.factor(regmatches(tempMatches, m2)) | |
497 #ACE str(sampleNumbers) | |
498 | |
499 if (length(levels(sampleNumbers))<2) { | |
500 cat("ERROR!!!! Cannot perform ANOVA analysis because it requires two or more factor levels\n") | |
501 cat("Unparsed sample names are:\n") | |
502 print(names(quantDataImputed_QN_log)) | |
503 cat(sprintf("Parsing rule for SampleNames is '%s'\n", regexSampleNames)) | |
504 cat("Parsed names are:\n") | |
505 print(tempMatches) | |
506 cat(sprintf("Parsing rule for SampleGrouping is '%s'\n", regexSampleGrouping)) | |
507 cat("Sample group assignments are:\n") | |
508 print(regmatches(tempMatches, m2)) | |
509 } else { | |
510 pValueData.anovaPs <- apply(quantDataImputed_QN_log, 1, anovaFunc, groupingFactor=sampleNumbers) | |
511 | |
512 pValueData.anovaPs.FDR <- p.adjust(pValueData.anovaPs, method="fdr") | |
513 pValueData <- data.frame( | |
514 phosphopeptide = fullData[,1] | |
515 , rawANOVAp = pValueData.anovaPs | |
516 , FDRadjustedANOVAp = pValueData.anovaPs.FDR | |
517 ) | |
518 #ACE rownames(pValueData) <- fullData[,1] | |
519 # output ANOVA file to constructed filename, | |
520 # e.g. "Outputfile_pST_ANOVA_STEP5.txt" | |
521 # becomes "Outpufile_pST_ANOVA_STEP5_FDR0.05.txt" | |
522 | |
523 #Re-output quantile-normalized log-transformed dataset with imputed, normalized data to include p-values | |
524 | |
525 dataTableImputed <- cbind(fullData[1:9], pValueData[,2:3], quantDataLog) | |
526 write.table( | |
527 dataTableImputed | |
528 , file=imputedDataFilename | |
529 , sep="\t" | |
530 , col.names=TRUE | |
531 , row.names=FALSE | |
532 , quote=FALSE | |
533 ) | |
534 | |
535 | |
536 pValueData <- pValueData[order(pValueData$FDRadjustedANOVAp),] | |
537 | |
538 cutoff <- valFDR[1] | |
539 for (cutoff in valFDR){ #loop through FDR cutoffs | |
540 | |
541 filtered_p <- pValueData[which(pValueData$FDRadjustedANOVAp < cutoff),, drop = FALSE] | |
542 filteredData.filtered <- quantDataImputed_QN_log[rownames(filtered_p),, drop = FALSE] | |
543 filteredData.filtered <- filteredData.filtered[order(filtered_p$FDRadjustedANOVAp),, drop = FALSE] | |
544 | |
545 # <!-- ACE insertion start --> | |
546 old_oma <- par("oma") | |
547 old_par <- par( | |
548 mai=(par("mai") + c(0.7,0,0,0)) * c(1,1,0.3,1) | |
549 , oma=old_oma * c(1,1,0.3,1) | |
550 , cex.main=0.9 | |
551 , cex.axis=0.7 | |
552 ) | |
553 | |
554 if (nrow(filteredData.filtered) > 0) { | |
555 boxplot( | |
556 filteredData.filtered | |
557 , main = sprintf("Imputed, normalized intensities where adjusted p-value < %0.2f", cutoff) | |
558 # no line plot , main = "" | |
559 , las = 2 | |
560 # , ylim = c(5.5,10) | |
561 , ylab = expression(log[10](intensity)) | |
562 ) | |
563 } else { | |
564 cat(sprintf("No peptides were found to have cutoff adjusted p-value < %0.2f\n", cutoff)) | |
565 } | |
566 par(old_par) | |
567 | |
568 #Add Phosphopeptide column to ANOVA filtered table | |
569 ANOVA.filtered_merge <- merge( | |
570 x = connect_df | |
571 , y = filteredData.filtered | |
572 , by.x="Intensity" | |
573 , by.y=1 | |
574 ) | |
575 ANOVA.filtered_merge.order <- rownames(filtered_p) | |
576 | |
577 ANOVA.filtered_merge.format <- sapply( | |
578 X = filtered_p$FDRadjustedANOVAp | |
579 , FUN = function(x) { | |
580 if (x > 0.0001) | |
581 paste0("(%0.",1+ceiling(-log10(x)),"f) %s") | |
582 else | |
583 paste0("(%0.4e) %s") | |
584 } | |
585 ) | |
586 | |
587 #ANOVA.filtered_merge.format <- paste0("(%0.",1+ceiling(-log10(filtered_p$FDRadjustedANOVAp)),"f) %s") | |
588 | |
589 ANOVA.filtered <- data.table( | |
590 ANOVA.filtered_merge$Phosphopeptide | |
591 , ANOVA.filtered_merge$Intensity | |
592 , ANOVA.filtered_merge[, 2:numSamples+1] | |
593 ) | |
594 colnames(ANOVA.filtered) <- c("Phosphopeptide", colnames(filteredData.filtered)) | |
595 | |
596 # merge qualitative columns into the ANOVA data | |
597 output_table <- data.frame(ANOVA.filtered$Phosphopeptide) | |
598 output_table <- merge( | |
599 x = output_table | |
600 , y = dataTableImputed_QN_LT | |
601 , by.x = "ANOVA.filtered.Phosphopeptide" | |
602 , by.y="Phosphopeptide" | |
603 ) | |
604 | |
605 #Produce heatmap to visualize significance and the effect of imputation | |
606 m <- as.matrix(unimputedQuantDataLog[ANOVA.filtered_merge.order,]) | |
607 if (nrow(m) > 0) { | |
608 rownames_m <- rownames(m) | |
609 rownames(m) <- sapply( | |
610 X = 1:nrow(m) | |
611 , FUN = function(i) { | |
612 sprintf( | |
613 ANOVA.filtered_merge.format[i] | |
614 , filtered_p$FDRadjustedANOVAp[i] | |
615 , rownames_m[i] | |
616 ) | |
617 } | |
618 ) | |
619 margins <- c( | |
620 max(nchar(colnames(m))) * 10 / 16 # col | |
621 , max(nchar(rownames(m))) * 5 / 16 # row | |
622 ) | |
623 how_many_peptides <- min(50, nrow(m)) | |
624 | |
625 op <- par("cex.main") | |
626 try( | |
627 if (nrow(m) > 1) { | |
628 par(cex.main=0.6) | |
629 heatmap( | |
630 m[how_many_peptides:1,] | |
631 , Rowv = NA | |
632 , Colv = NA | |
633 , cexRow = 0.7 | |
634 , cexCol = 0.8 | |
635 , scale="row" | |
636 , margins = margins | |
637 , main = "Heatmap of unimputed, unnormalized intensities" | |
638 , xlab = "" | |
639 # , main = bquote( | |
640 # .( how_many_peptides ) | |
641 # ~ " peptides with adjusted p-value <" | |
642 # ~ .(sprintf("%0.2f", cutoff)) | |
643 # ) | |
644 ) | |
645 } | |
646 ) | |
647 #ACE fig_dim knitr::opts_chunk$set(fig.dim = fig_dim) | |
648 par(op) | |
649 } | |
650 | |
651 } | |
652 } | |
653 ``` | |
654 | |
655 ## Peptide IDs, etc. | |
656 | |
657 See output files. |