comparison w4mcorcov_calc.R @ 0:50a07adddfbd draft

planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 52e588e19fe93d83d221710bb75559c5700ba637
author eschen42
date Mon, 09 Oct 2017 15:46:13 -0400
parents
children e25fd8a13665
comparison
equal deleted inserted replaced
-1:000000000000 0:50a07adddfbd
1 # center with 'colMeans()' - ref: http://gastonsanchez.com/visually-enforced/how-to/2014/01/15/Center-data-in-R/
2 center_colmeans <- function(x) {
3 xcenter = colMeans(x)
4 x - rep(xcenter, rep.int(nrow(x), ncol(x)))
5 }
6
7 #### OPLS-DA
8 algoC <- "nipals"
9
10 do_detail_plot <- function(x_dataMatrix, x_predictor, x_is_match, x_algorithm, x_prefix, x_show_labels, x_progress = print, x_env) {
11 off <- function(x) if (x_show_labels) x else 0
12 salience_lookup <- x_env$salience_lookup
13 salient_rcv_lookup <- x_env$salient_rcv_lookup
14 # x_progress("head(salience_df): ", head(salience_df))
15 # x_progress("head(salience): ", head(salience))
16 if (x_is_match && ncol(x_dataMatrix) > 0 && length(unique(x_predictor))> 1) {
17 my_oplsda <- opls(
18 x = x_dataMatrix
19 , y = x_predictor
20 , algoC = x_algorithm
21 , predI = 1
22 , orthoI = if (ncol(x_dataMatrix) > 1) 1 else 0
23 , printL = FALSE
24 , plotL = FALSE
25 )
26 my_oplsda_suppLs_y_levels <- levels(as.factor(my_oplsda@suppLs$y))
27 fctr_lvl_1 <- my_oplsda_suppLs_y_levels[1]
28 fctr_lvl_2 <- my_oplsda_suppLs_y_levels[2]
29 my_cor_vs_cov <- cor_vs_cov(
30 matrix_x = x_dataMatrix
31 , ropls_x = my_oplsda
32 )
33 with(
34 my_cor_vs_cov
35 , {
36 min_x <- min(covariance)
37 max_x <- max(covariance)
38 lim_x <- max(sapply(X=c(min_x, max_x), FUN=abs))
39 covariance <- covariance / lim_x
40 lim_x <- 1.2
41 main_label <- sprintf("%s for levels %s versus %s", x_prefix, fctr_lvl_1, fctr_lvl_2)
42 # print("main_label")
43 # print(main_label)
44 main_cex <- min(1.0, 46.0/nchar(main_label))
45 # " It is generally accepted that a variable should be selected if vj>1, [27–29], but a proper threshold between 0.83 and 1.21 can yield more relevant variables according to [28]." (Mehmood 2012 doi:10.1016/j.chemolab.2004.12.011)
46 vipco <- pmax(0, pmin(1,(vip4p-0.83)/(1.21-0.83)))
47 alpha <- 0.1 + 0.4 * vipco
48 red <- as.numeric(correlation < 0) * vipco
49 blue <- as.numeric(correlation > 0) * vipco
50 minus_cor <- -correlation
51 minus_cov <- -covariance
52 # x_progress("head(names(minus_cor)): ", head(names(minus_cor)))
53 cex <- salience_lookup(feature = names(minus_cor))
54 # x_progress("head(cex.1): ", head(cex))
55 # cex <- 0.25 + (0.75 * cex / max(cex))
56 cex <- 0.25 + (1.25 * cex / max(cex))
57 # x_progress("head(cex.2): ", head(cex))
58 plot(
59 y = minus_cor
60 , x = minus_cov
61 , type="p"
62 , xlim=c(-lim_x, lim_x + off(0.1))
63 , ylim=c(-1.0 - off(0.1), 1.0)
64 , xlab = sprintf("relative covariance(feature,t1)")
65 , ylab = sprintf("correlation(feature,t1)")
66 , main = main_label
67 , cex.main = main_cex
68 , cex = cex
69 , pch = 16
70 , col = rgb(blue = blue, red = red, green = 0, alpha = alpha)
71 )
72 low_x <- -0.7 * lim_x
73 high_x <- 0.7 * lim_x
74 text(x = low_x, y = -0.15, labels = fctr_lvl_1)
75 text(x = high_x, y = 0.15, labels = fctr_lvl_2)
76 if (x_show_labels) {
77 text(
78 y = minus_cor - 0.013
79 , x = minus_cov + 0.020
80 , cex = 0.3
81 , labels = tsv1$featureID
82 , col = rgb(blue = blue, red = red, green = 0, alpha = 0.2 + 0.8 * alpha)
83 , srt = -30 # slant 30 degrees downward
84 , adj = 0 # left-justified
85 )
86 }
87 }
88 )
89 typeVc <- c("correlation", # 1
90 "outlier", # 2
91 "overview", # 3
92 "permutation", # 4
93 "predict-train", # 5
94 "predict-test", # 6
95 "summary", # 7 = c(2,3,4,9)
96 "x-loading", # 8
97 "x-score", # 9
98 "x-variance", # 10
99 "xy-score", # 11
100 "xy-weight" # 12
101 ) # [c(3,8,9)] # [c(4,3,8,9)]
102 if ( length(my_oplsda@orthoVipVn) > 0 ) {
103 my_typevc <- typeVc[c(9,3,8)]
104 } else {
105 my_typevc <- c("(dummy)","overview","(dummy)")
106 }
107 for (my_type in my_typevc) {
108 if (my_type %in% typeVc) {
109 # print(sprintf("plotting type %s", my_type))
110 plot(
111 x = my_oplsda
112 , typeVc = my_type
113 , parCexN = 0.4
114 , parDevNewL = FALSE
115 , parLayL = TRUE
116 , parEllipsesL = TRUE
117 )
118 } else {
119 # print("plotting dummy graph")
120 plot(x=1, y=1, xaxt="n", yaxt="n", xlab="", ylab="", type="n")
121 text(x=1, y=1, labels="no orthogonal projection is possible")
122 }
123 }
124 return (my_cor_vs_cov)
125 } else {
126 # x_progress(sprintf("x_is_match = %s, ncol(x_dataMatrix) = %d, length(unique(x_predictor)) = %d",x_is_match, ncol(x_dataMatrix), length(unique(x_predictor))))
127 return (NULL)
128 }
129 }
130
131 # S-PLOT and OPLS reference: Wiklund_2008 doi:10.1021/ac0713510
132 corcov_calc <- function(calc_env, failure_action = stop, progress_action = function(x){}, corcov_tsv_action = NULL) {
133 if ( ! is.environment(calc_env) ) {
134 failure_action("corcov_calc: fatal error - 'calc_env' is not an environment")
135 return ( FALSE )
136 }
137 if ( is.null(corcov_tsv_action) || ! is.function(corcov_tsv_action) ) {
138 failure_action("corcov_calc: fatal error - 'corcov_tsv_action' is not a function")
139 return ( FALSE )
140 }
141
142 # extract parameters from the environment
143 vrbl_metadata <- calc_env$vrbl_metadata
144 vrbl_metadata_names <- vrbl_metadata[,1]
145 smpl_metadata <- calc_env$smpl_metadata
146 data_matrix <- calc_env$data_matrix
147 pairSigFeatOnly <- calc_env$pairSigFeatOnly
148 facC <- calc_env$facC
149 tesC <- calc_env$tesC
150 # extract the levels from the environment
151 originalLevCSV <- levCSV <- calc_env$levCSV
152 # matchingC is one of { "none", "wildcard", "regex" }
153 matchingC <- calc_env$matchingC
154 labelFeatures <- calc_env$labelFeatures
155
156 # arg/env checking
157 if (!(facC %in% names(smpl_metadata))) {
158 failure_action(sprintf("bad parameter! Factor name '%s' not found in sampleMetadata", facC))
159 return ( FALSE )
160 }
161
162 # calculate salience_df as data.frame(feature, max_level, max_median, max_rcv, mean_median, salience, salient_rcv)
163 salience_df <- calc_env$salience_df <- w4msalience(
164 data_matrix = data_matrix
165 , sample_class = smpl_metadata[,facC]
166 , failure_action = failure_action
167 )
168 salience <- salience_df$salience
169 names(salience) <- salience_df$feature
170 salience_lookup <- calc_env$salience_lookup <- function(feature) unname(salience[feature])
171 salient_rcv <- salience_df$salient_rcv
172 names(salient_rcv) <- salience_df$feature
173 salient_rcv_lookup <- calc_env$salient_rcv_lookup <- function(feature) unname(salient_rcv[feature])
174 salient_level <- salience_df$max_level
175 names(salient_level) <- salience_df$feature
176 salient_level_lookup <- calc_env$salient_level_lookup <- function(feature) unname(salient_level[feature])
177
178 # transform wildcards to regexen
179 if (matchingC == "wildcard") {
180 # strsplit(x = "hello,wild,world", split = ",")
181 levCSV <- gsub("[.]", "[.]", levCSV)
182 levCSV <- strsplit(x = levCSV, split = ",")
183 levCSV <- sapply(levCSV, utils::glob2rx, trim.tail = FALSE)
184 levCSV <- paste(levCSV, collapse = ",")
185 }
186 # function to determine whether level is a member of levCSV
187 isLevelSelected <- function(lvl) {
188 matchFun <- if (matchingC != "none") grepl else `==`
189 return(
190 Reduce(
191 f = "||"
192 , x = sapply(
193 X = strsplit(
194 x = levCSV
195 , split = ","
196 , fixed = TRUE
197 )[[1]]
198 , FUN = matchFun
199 , lvl
200 )
201 )
202 )
203 }
204
205 # transpose matrix because ropls matrix is the transpose of XCMS matrix
206 # Wiklund_2008 centers and pareto-scales data before OPLS-DA S-plot
207 # center
208 cdm <- center_colmeans(t(data_matrix))
209 # pareto-scale
210 my_scale <- sqrt(apply(cdm, 2, sd, na.rm=TRUE))
211 scdm <- sweep(cdm, 2, my_scale, "/")
212
213 # pattern to match column names like k10_kruskal_k4.k3_sig
214 col_pattern <- sprintf('^%s_%s_(.*)[.](.*)_sig$', facC, tesC)
215 # column name like k10_kruskal_sig
216 intersample_sig_col <- sprintf('%s_%s_sig', facC, tesC)
217 # get the facC column from sampleMetadata, dropping to one dimension
218 smpl_metadata_facC <- smpl_metadata[,facC]
219
220 # allocate a slot in the environment for the contrast_list, each element of which will be a data.frame with this structure:
221 # - feature ID
222 # - value1
223 # - value2
224 # - Wiklund_2008 correlation
225 # - Wiklund_2008 covariance
226 # - Wiklund_2008 VIP
227 calc_env$contrast_list <- list()
228
229
230 did_plot <- FALSE
231 if (tesC != "none") {
232 # for each column name, extract the parts of the name matched by 'col_pattern', if any
233 the_colnames <- colnames(vrbl_metadata)
234 if (!Reduce(f = "||", x = grepl(tesC, the_colnames))) {
235 failure_action(sprintf("bad parameter! variableMetadata must contain results of W4M Univariate test '%s'.", tesC))
236 return ( FALSE )
237 }
238 col_matches <- lapply(
239 X = the_colnames,
240 FUN = function(x) {
241 regmatches( x, regexec(col_pattern, x) )[[1]]
242 }
243 )
244 # process columns matching the pattern
245 for (i in 1:length(col_matches)) {
246 # for each potential match of the pattern
247 col_match <- col_matches[[i]]
248 if (length(col_match) > 0) {
249 # it's an actual match; extract the pieces, e.g., k10_kruskal_k4.k3_sig
250 vrbl_metadata_col <- col_match[1] # ^^^^^^^^^^^^^^^^^^^^^ # Column name
251 fctr_lvl_1 <- col_match[2] # ^^ # Factor-level 1
252 fctr_lvl_2 <- col_match[3] # ^^ # Factor-level 2
253 # only process this column if both factors are members of lvlCSV
254 is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2)
255 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
256 # TODO delete next line displaying currently-processed column
257 # cat(sprintf("%s %s %s %s\n", vrbl_metadata_col, fctr_lvl_1, fctr_lvl_2, is_match))
258 # choose only samples with one of the two factors for this column
259 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
260 predictor <- smpl_metadata_facC[chosen_samples]
261 # extract only the significantly-varying features and the chosen samples
262 fully_significant <- 1 == vrbl_metadata[,vrbl_metadata_col] * vrbl_metadata[,intersample_sig_col]
263 overall_significant <- 1 == vrbl_metadata[,intersample_sig_col]
264 col_selector <- vrbl_metadata_names[ if ( pairSigFeatOnly ) fully_significant else overall_significant ]
265 my_matrix <- scdm[ chosen_samples, col_selector, drop = FALSE ]
266 my_cor_cov <- do_detail_plot(
267 x_dataMatrix = my_matrix
268 , x_predictor = predictor
269 , x_is_match = is_match
270 , x_algorithm = algoC
271 , x_prefix = if (pairSigFeatOnly) "Significantly contrasting features" else "Significant features"
272 , x_show_labels = labelFeatures
273 , x_progress = progress_action
274 , x_env = calc_env
275 )
276 if ( is.null(my_cor_cov) ) {
277 progress_action("NOTHING TO PLOT.")
278 } else {
279 tsv <- my_cor_cov$tsv1
280 tsv$salientLevel <- salient_level_lookup(tsv$featureID)
281 tsv$salientRCV <- salient_rcv_lookup(tsv$featureID)
282 tsv$salience <- salience_lookup(tsv$featureID)
283 tsv["level1Level2Sig"] <- vrbl_metadata[ match(tsv$featureID, vrbl_metadata_names), vrbl_metadata_col ]
284 corcov_tsv_action(tsv)
285 did_plot <- TRUE
286 }
287 }
288 }
289 } else { # tesC == "none"
290 utils::combn(
291 x = unique(sort(smpl_metadata_facC))
292 , m = 2
293 , FUN = function(x) {
294 fctr_lvl_1 <- x[1]
295 fctr_lvl_2 <- x[2]
296 progress_action(sprintf("calculating/plotting contrast of %s vs. %s", fctr_lvl_1, fctr_lvl_2))
297 chosen_samples <- smpl_metadata_facC %in% c(fctr_lvl_1, fctr_lvl_2)
298 if (length(unique(chosen_samples)) < 1) {
299 progress_action("NOTHING TO PLOT...")
300 } else {
301 predictor <- smpl_metadata_facC[chosen_samples]
302 my_matrix <- scdm[ chosen_samples, , drop = FALSE ]
303 # only process this column if both factors are members of lvlCSV
304 is_match <- isLevelSelected(fctr_lvl_1) && isLevelSelected(fctr_lvl_2)
305 my_cor_cov <- do_detail_plot(
306 x_dataMatrix = my_matrix
307 , x_predictor = predictor
308 , x_is_match = is_match
309 , x_algorithm = algoC
310 , x_prefix = "Features"
311 , x_show_labels = labelFeatures
312 , x_progress = progress_action
313 , x_env = calc_env
314 )
315 if ( is.null(my_cor_cov) ) {
316 progress_action("NOTHING TO PLOT")
317 } else {
318 tsv <- my_cor_cov$tsv1
319 tsv$salientLevel <- salient_level_lookup(tsv$featureID)
320 tsv$salientRCV <- salient_rcv_lookup(tsv$featureID)
321 tsv$salience <- salience_lookup(tsv$featureID)
322 corcov_tsv_action(tsv)
323 did_plot <<- TRUE
324 }
325 }
326 #print("baz")
327 "dummy" # need to return a value; otherwise combn fails with an error
328 }
329 )
330 }
331 if (!did_plot) {
332 failure_action(sprintf("bad parameter! sampleMetadata must have at least two levels of factor '%s' matching '%s'", facC, originalLevCSV))
333 return ( FALSE )
334 }
335 return ( TRUE )
336 }
337
338 # Calculate data for correlation-versus-covariance plot
339 # Adapted from:
340 # Wiklund_2008 doi:10.1021/ac0713510
341 # Galindo_Prieto_2014 doi:10.1002/cem.2627
342 # https://github.com/HegemanLab/extra_tools/blob/master/generic_PCA.R
343 cor_vs_cov <- function(matrix_x, ropls_x) {
344 x_class <- class(ropls_x)
345 if ( !( as.character(x_class) == "opls" ) ) { # || !( attr(class(x_class),"package") == "ropls" ) )
346 stop( "cor_vs_cov: Expected ropls_x to be of class ropls::opls but instead it was of class ", as.character(x_class) )
347 }
348 result <- list()
349 # suppLs$algoC - Character: algorithm used - "svd" for singular value decomposition; "nipals" for NIPALS
350 if ( ropls_x@suppLs$algoC == "nipals") {
351 # Equations (1) and (2) from *Supplement to* Wiklund 2008, doi:10.1021/ac0713510
352 mag <- function(one_dimensional) sqrt(sum(one_dimensional * one_dimensional))
353 mag_xi <- sapply(X = 1:ncol(matrix_x), FUN = function(x) mag(matrix_x[,x]))
354 score_matrix <- ropls_x@scoreMN
355 score_matrix_transposed <- t(score_matrix)
356 score_matrix_magnitude <- mag(score_matrix)
357 result$covariance <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * score_matrix_magnitude )
358 result$correlation <- score_matrix_transposed %*% matrix_x / ( score_matrix_magnitude * mag_xi )
359 } else {
360 # WARNING - untested code - I don't have test data to exercise this branch
361 # Equations (1) and (2) from Wiklund 2008, doi:10.1021/ac0713510
362 # scoreMN - Numerical matrix of x scores (T; dimensions: nrow(x) x predI) X = TP' + E; Y = TC' + F
363 score_matrix <- ropls_x@scoreMN
364 score_matrix_transposed <- t(score_matrix)
365 cov_divisor <- nrow(matrix_x) - 1
366 result$covariance <- sapply(
367 X = 1:ncol(matrix_x)
368 , FUN = function(x) score_matrix_transposed %*% matrix_x[,x] / cov_divisor
369 )
370 score_sd <- sapply(
371 X = 1:ncol(score_matrix)
372 , FUN = function(x) sd(score_matrix[,x])
373 )
374 # xSdVn - Numerical vector: variable standard deviations of the 'x' matrix
375 xSdVn <- ropls_x@xSdVn
376 result$correlation <- sapply(
377 X = 1:ncol(matrix_x)
378 , FUN = function(x) {
379 ( score_matrix_transposed / score_sd ) %*% ( matrix_x[,x] / (xSdVn[x] * cov_divisor) )
380 }
381 )
382 }
383 result$correlation <- result$correlation[1,,drop = TRUE]
384 result$covariance <- result$covariance[1,,drop = TRUE]
385
386 # Variant 4 of Variable Influence on Projection for OPLS from Galindo_Prieto_2014
387 # Length = number of features; labels = feature identifiers. (The same is true for $correlation and $covariance.)
388 result$vip4p <- as.numeric(ropls_x@vipVn)
389 result$vip4o <- as.numeric(ropls_x@orthoVipVn)
390 # get the level names
391 level_names <- sort(levels(as.factor(ropls_x@suppLs$y)))
392 feature_count <- length(ropls_x@vipVn)
393 result$level1 <- rep.int(x = level_names[1], times = feature_count)
394 result$level2 <- rep.int(x = level_names[2], times = feature_count)
395 # print(sprintf("sd(covariance) = %f; sd(correlation) = %f", sd(result$covariance), sd(result$correlation)))
396 superresult <- list()
397 if (length(result$vip4o) == 0) result$vip4o <- NA
398 superresult$tsv1 <- data.frame(
399 featureID = names(ropls_x@vipVn)
400 , factorLevel1 = result$level1
401 , factorLevel2 = result$level2
402 , correlation = result$correlation
403 , covariance = result$covariance
404 , vip4p = result$vip4p
405 , vip4o = result$vip4o
406 , row.names = NULL
407 )
408 rownames(superresult$tsv1) <- superresult$tsv1$featureID
409 superresult$covariance <- result$covariance
410 superresult$correlation <- result$correlation
411 superresult$vip4p <- result$vip4p
412 superresult$vip4o <- result$vip4o
413 superresult$details <- result
414 # #print(superresult$tsv1)
415 result$superresult <- superresult
416 # Include thise in case future consumers of this routine want to use it in currently unanticipated ways
417 result$oplsda <- ropls_x
418 result$predictor <- ropls_x@suppLs$y # in case future consumers of this routine want to use it in currently unanticipated ways
419 return (superresult)
420 }
421
422