comparison w4mkmeans_routines.R @ 6:3f72a635a075 draft default tip

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 2799299a221358b648334c3f890c4024155af73b
author eschen42
date Fri, 02 Mar 2018 08:32:17 -0500
parents 6817b036b06e
children
comparison
equal deleted inserted replaced
5:6817b036b06e 6:3f72a635a075
2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool 2 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool
3 ##------------------------------------------------------------------------------------------------------ 3 ##------------------------------------------------------------------------------------------------------
4 4
5 library(parallel) 5 library(parallel)
6 6
7 w4kmeans_usage <- function() { 7 w4mkmeans_usage <- function() {
8 return ( 8 return (
9 c( 9 c(
10 "w4mkmeans: bad input.", 10 "w4mkmeans: bad input.",
11 "# contract:", 11 " contract:",
12 " required - caller will provide an environment comprising:", 12 " required - caller will provide an environment comprising:",
13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...", 13 " log_print - a logging function with the signature function(x, ...) expecting strings as x and ...",
14 " variableMetadata - the corresponding W4M data.frame having feature metadata", 14 " variableMetadata - the corresponding W4M data.frame having feature metadata",
15 " sampleMetdata - the corresponding W4M data.frame having sample metadata", 15 " sampleMetdata - the corresponding W4M data.frame having sample metadata",
16 " dataMatrix - the corresponding W4M matrix", 16 " dataMatrix - the corresponding W4M matrix",
17 " slots - the number of parallel slots for calculating kmeans", 17 " slots - the number of parallel slots for calculating kmeans",
18 " optional - environment may comprise:", 18 " optional - environment may comprise:",
19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)", 19 " kfeatures - an array of integers, the k's to apply for clustering by feature (default, empty array)",
20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)", 20 " ksamples - an array of integers, the k's to apply for clustering by sample (default, empty array)",
21 " iter.max - the maximum number of iterations when calculating a cluster (default = 10)", 21 " iter_max - the maximum number of iterations when calculating a cluster (default = 20)",
22 " nstart - how many random sets of centers should be chosen (default = 1)", 22 " nstart - how many random sets of centers should be chosen (default = 20)",
23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", 23 " algorithm - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)", 24 " categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
25 " ", 25 " ",
26 " this routine will return a list comprising:", 26 " this routine will return a list comprising:",
27 " variableMetadata - the input variableMetadata data.frame with updates, if any", 27 " variableMetadata - the input variableMetadata data.frame with updates, if any",
33 } 33 }
34 34
35 w4mkmeans <- function(env) { 35 w4mkmeans <- function(env) {
36 # abort if 'env' is null or is not an environment 36 # abort if 'env' is null or is not an environment
37 if ( is.null(env) || ! is.environment(env) ) { 37 if ( is.null(env) || ! is.environment(env) ) {
38 lapply(w4kmeans_usage(),print) 38 lapply(w4mkmeans_usage(),print)
39 } 39 }
40 # extract parameters from 'env'
41 log_action <- env$log_print
40 # supply default arguments 42 # supply default arguments
41 if ( ! exists("iter.max" , env) ) env$iter.max <- 10 43 if ( ! exists("iter_max" , env) ) env$iter_max <- 20
42 if ( ! exists("nstart" , env) ) env$nstart <- 1 44 if ( ! exists("nstart" , env) ) env$nstart <- 20
43 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong' 45 if ( ! exists("algorithm" , env) ) env$algorithm <- 'Hartigan-Wong'
44 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k' 46 if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'c'
45 if ( ! exists("ksamples" , env) ) env$ksamples <- c() 47 if ( ! exists("ksamples" , env) ) env$ksamples <- c()
46 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c() 48 if ( ! exists("kfeatures" , env) ) env$kfeatures <- c()
47 # check mandatory arguments 49 # check mandatory arguments
48 expected <- c( 50 expected <- c(
49 "log_print" 51 "log_print"
53 , "slots" 55 , "slots"
54 ) 56 )
55 missing_from_env <- setdiff(expected, (ls(env))) 57 missing_from_env <- setdiff(expected, (ls(env)))
56 if ( length(missing_from_env) > 0 ) { 58 if ( length(missing_from_env) > 0 ) {
57 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", ")) 59 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", "))
58 lapply(w4kmeans_usage(),print) 60 lapply(w4mkmeans_usage(),log_action)
59 stop("w4mkmeans: contract has been broken") 61 stop("w4mkmeans: contract has been broken")
60 } 62 }
61 # extract parameters from 'env' 63 # extract parameters from 'env'
62 log_action <- env$log_print 64 log_action <- env$log_print
63 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" ) 65 scores <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
64 sampleMetadata <- env$sampleMetadata 66 sampleMetadata <- env$sampleMetadata
65 featureMetadata <- env$variableMetadata 67 featureMetadata <- env$variableMetadata
75 return (i) # return results, if any 77 return (i) # return results, if any
76 } 78 }
77 ksamples <- positive_ints(env$ksamples , "ksamples") 79 ksamples <- positive_ints(env$ksamples , "ksamples")
78 kfeatures <- positive_ints(env$kfeatures, "kfeatures") 80 kfeatures <- positive_ints(env$kfeatures, "kfeatures")
79 81
82 log_action("w4mkmeans: preparing data matrix")
83 # prepare data matrix (normalize, eliminate zero-variance rows, etc.; no transformation)
84 dm_en <- new.env()
85 dm_en$log <- c()
86 preparation_result <- tryCatchFunc(function(){
87 dm <- prepare.data.matrix(
88 x.matrix = env$dataMatrix
89 , data.transformation = function(x) { x }
90 , en = dm_en
91 )
92 my_log <- dm_en$log
93 for ( i in 1:length(my_log) ) {
94 log_action(my_log[i])
95 }
96 dm
97 })
98 if ( !preparation_result$success ) {
99 postmortem <- paste("prepare.data.matrix failed:", preparation_result$msg)
100 log_action(postmortem)
101 stop(postmortem)
102 }
103
104 env$preparedDataMatrix <- preparation_result$value
105
106 log_action("w4mkmeans: determining evaluation mode")
107
80 myLapply <- parLapply 108 myLapply <- parLapply
81 cl <- NULL 109 cl <- NULL
82 tryCatch( 110 tryCatch(
83 expr = { 111 expr = {
84 cl <- makePSOCKcluster(names = slots) 112 outfile <- ""
113 # outfile: Where to direct the stdout and stderr connection output from the workers.
114 # - "" indicates no redirection (which may only be useful for workers on the local machine).
115 # - Defaults to ‘/dev/null’ (‘nul:’ on Windows).
116 # - The other possibility is a file path on the worker's host.
117 # - Files will be opened in append mode, as all workers log to the same file.
118 cl <- makePSOCKcluster( names = slots, outfile = outfile )
85 } 119 }
86 , error = function(e) { 120 , error = function(e) {
87 log_action(sprintf("w4kmeans: falling back to serial evaluation because makePSOCKcluster(names = %d) threw an exception", slots)) 121 log_action(sprintf("w4kmeans: falling back to serial evaluation because makePSOCKcluster(names = %d) threw an exception", slots))
88 # mimic parLapply, but without parallelization (as a last resort) 122 # mimic parLapply, but without parallelization (as a last resort)
89 myLapply <<- function(cl, ...) lapply(...) 123 myLapply <<- function(cl, ...) lapply(...)
90 } 124 }
91 ) 125 )
92 if ( identical(myLapply, parLapply) ) { 126 if ( identical(myLapply, parLapply) ) {
93 log_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots)) 127 log_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots))
94 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." 128 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
129 # note varlist: names of references must be passed to the cluster so that they can be resolved
95 clusterExport( 130 clusterExport(
96 cl = cl 131 cl = cl
97 , varlist = c( 132 , varlist = c(
98 "tryCatchFunc" 133 "tryCatchFunc" # required by calc_kmeans_one_dimension_one_k
99 , "calc_kmeans_one_dimension_one_k" 134 , "format_error" # required by tryCatchFunc when errors are caught
100 , "prepare.data.matrix" 135 , "iso_date" # required by log_print
136 , "log_print" # required by calc_kmeans_one_dimension_one_k
101 ) 137 )
102 ) 138 )
103 final <- function(cl) { 139 final <- function(cl) {
104 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster." 140 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
105 if ( !is.null(cl) ) { 141 if ( !is.null(cl) ) {
119 # - The inner list has keys "clusters" and "scores" 155 # - The inner list has keys "clusters" and "scores"
120 156
121 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata 157 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata
122 ksamples_length <- length(ksamples) 158 ksamples_length <- length(ksamples)
123 if ( ksamples_length > 0 ) { 159 if ( ksamples_length > 0 ) {
124 smpl_result_list <- myLapply( 160 smpl_result_list <- myLapply(
125 cl = cl 161 cl = cl
126 , ksamples 162 , ksamples
127 , calc_kmeans_one_dimension_one_k 163 , calc_kmeans_one_dimension_one_k
128 , env = env 164 , env = env
129 , dimension = "samples" 165 , dimension = "samples"
138 } 174 }
139 175
140 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata 176 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata
141 kfeatures_length <- length(kfeatures) 177 kfeatures_length <- length(kfeatures)
142 if ( kfeatures_length > 0 ) { 178 if ( kfeatures_length > 0 ) {
143 feat_result_list <- myLapply( 179 feat_result_list <- myLapply(
144 cl = cl 180 cl = cl
145 , kfeatures 181 , kfeatures
146 , calc_kmeans_one_dimension_one_k 182 , calc_kmeans_one_dimension_one_k
147 , env = env 183 , env = env
148 , dimension = "features" 184 , dimension = "features"
154 scores <- c(scores, result$value$scores) 190 scores <- c(scores, result$value$scores)
155 } 191 }
156 } 192 }
157 } 193 }
158 194
159 return ( 195 return (
160 list( 196 list(
161 variableMetadata = featureMetadata 197 variableMetadata = featureMetadata
162 , sampleMetadata = sampleMetadata 198 , sampleMetadata = sampleMetadata
163 , scores = scores 199 , scores = scores
164 ) 200 )
165 ) 201 )
166 } 202 }
167 , finally = final(cl) 203 , finally = {
204 final(cl)
205 }
168 ) 206 )
169 } 207 }
170 208
171 # calculate k-means for features or samples 209 # calculate k-means for features or samples
172 # - recall that the dataMatrix has features in rows and samples in columns 210 # - recall that the dataMatrix has features in rows and samples in columns
173 # return value: 211 # return value:
174 # list(clusters = km$cluster, scores = scores) 212 # list(clusters = km$cluster, scores = scores)
175 # arguments: 213 # arguments:
176 # env: 214 # env:
177 # environment having dataMatrix 215 # environment having dataMatrix
178 # dimension: 216 # dimension:
179 # - "samples": produce clusters column to add to the sampleMetadata table 217 # - "samples": produce clusters column to add to the sampleMetadata table
183 # integer, the number of clusters to make 221 # integer, the number of clusters to make
184 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") { 222 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") {
185 # abort if environment is not as expected 223 # abort if environment is not as expected
186 if ( is.null(env) || ! is.environment(env) ) { 224 if ( is.null(env) || ! is.environment(env) ) {
187 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment") 225 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment")
188 } 226 }
189 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) { 227 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) {
190 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function") 228 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function")
191 } 229 }
230 log_action <- env$log_print
192 # abort if k is not as expected 231 # abort if k is not as expected
193 if ( ! is.numeric(k) ) { 232 if ( ! is.numeric(k) ) {
194 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k))) 233 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k)))
195 } 234 }
196 k <- as.integer(k) 235 k <- as.integer(k)
197 # abort if dimension is not as expected 236 # abort if dimension is not as expected
198 if ( ! is.character(dimension) 237 if ( ! is.character(dimension)
199 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) { 238 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) {
200 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'") 239 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'")
201 } 240 }
202 dm <- env$dataMatrix 241 dm <- env$preparedDataMatrix
203 iter.max <- env$iter.max 242 iter_max <- env$iter_max
204 nstart <- env$nstart 243 nstart <- env$nstart
205 algorithm <- env$algorithm 244 algorithm <- env$algorithm
206 dim_features <- dimension == "features" 245 dim_features <- dimension == "features"
246
207 # tryCatchFunc produces a list 247 # tryCatchFunc produces a list
208 # On success of expr(), tryCatchFunc produces 248 # On success of func(), tryCatchFunc produces
209 # list(success TRUE, value = expr(), msg = "") 249 # list(success = TRUE, value = func(), msg = "")
210 # On failure of expr(), tryCatchFunc produces 250 # On failure of func(), tryCatchFunc produces
211 # list(success = FALSE, value = NA, msg = "the error message") 251 # list(success = FALSE, value = NA, msg = "the error message")
212 result_list <- tryCatchFunc( expr = function() { 252 result_list <- tryCatchFunc( func = function() {
213 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows 253 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows
214 # - to calculate sample-clusters, no transposition is needed because samples are rows 254 # - to calculate sample-clusters, no transposition is needed because samples are rows
215 # - to calculate feature-clusters, transposition is needed so that features will be the rows 255 # - to calculate feature-clusters, transposition is needed so that features will be the rows
216 if ( ! dim_features ) dm <- t(dm) 256 if ( ! dim_features ) {
217 dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } ) 257 dm <- t(dm)
258 }
259
218 # need to set.seed to get reproducible results from kmeans 260 # need to set.seed to get reproducible results from kmeans
219 set.seed(4567) 261 set.seed(4567)
262
220 # do the k-means clustering 263 # do the k-means clustering
221 km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm ) 264 withCallingHandlers(
265 {
266 km <<- kmeans( x = dm, centers = k, iter.max = iter_max, nstart = nstart, algorithm = algorithm )
267 }
268 , warning = function(w) {
269 lw <- list(w)
270 smplwrn <- as.character(w[[1]])
271 log_print(
272 sprintf( "Warning for %s: center = %d, nstart = %d, iter_max = %d: %s"
273 , if (dim_features) "features" else "samples"
274 , k
275 , nstart
276 , iter_max
277 , smplwrn
278 )
279 )
280 }
281 )
282
283 # collect the scores
222 scores <- 284 scores <-
223 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f" 285 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f"
224 , dimension 286 , dimension
225 , k 287 , k
226 , km$totss 288 , km$totss
227 , km$betweenss 289 , km$betweenss
228 , km$betweenss/km$totss 290 , km$betweenss/km$totss
229 ) 291 )
292
293 # return list of results
230 list(clusters = km$cluster, scores = scores) 294 list(clusters = km$cluster, scores = scores)
231 }) 295 })
296
297 # return either
298 # list(success = TRUE, value = func(), msg = "")
299 # or
300 # list(success = FALSE, value = NA, msg = "the error message")
232 return ( result_list ) 301 return ( result_list )
233 } 302 }
234 303
304 # vim: sw=2 ts=2 et :