w4mkmeans: w4mkmeans_routines.R comparison

comparison w4mkmeans_routines.R @ 6:3f72a635a075 draft default tip

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 2799299a221358b648334c3f890c4024155af73b

author	eschen42
date	Fri, 02 Mar 2018 08:32:17 -0500
parents	6817b036b06e
children

comparison

equal deleted inserted replaced

-:6817b036b06e
+:3f72a635a075
 ## these are the batch-independent and file-structure-independent routines to support the w4mkmeans tool
 ##------------------------------------------------------------------------------------------------------
 library(parallel)
-w4kmeans_usage <- function() {
+w4mkmeans_usage <- function() {
 return (
 c(
 "w4mkmeans: bad input.",
-"# contract:",
+"  contract:",
 "    required - caller will provide an environment comprising:",
 "      log_print          - a logging function with the signature function(x, ...) expecting strings as x and ...",
 "      variableMetadata   - the corresponding W4M data.frame having feature metadata",
 "      sampleMetdata      - the corresponding W4M data.frame having sample metadata",
 "      dataMatrix         - the corresponding W4M matrix",
 "      slots              - the number of parallel slots for calculating kmeans",
 "    optional - environment may comprise:",
 "      kfeatures          - an array of integers, the k's to apply for clustering by feature (default, empty array)",
 "      ksamples           - an array of integers, the k's to apply for clustering by sample (default, empty array)",
-"      iter.max           - the maximum number of iterations when calculating a cluster (default = 10)",
+"      iter_max           - the maximum number of iterations when calculating a cluster (default = 20)",
-"      nstart             - how many random sets of centers should be chosen (default = 1)",
+"      nstart             - how many random sets of centers should be chosen (default = 20)",
 "      algorithm          - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
 "      categorical_prefix - string from c('Hartigan-Wong', 'Lloyd', 'Forgy', 'MacQueen') (default = Hartigan-Wong)",
 "      ",
 "    this routine will return a list comprising:",
 "      variableMetadata   - the input variableMetadata data.frame with updates, if any",
 }
 w4mkmeans <- function(env) {
 # abort if 'env' is null or is not an environment
 if ( is.null(env) || ! is.environment(env) ) {
-lapply(w4kmeans_usage(),print)
+lapply(w4mkmeans_usage(),print)
 }
+# extract parameters from 'env'
+log_action  <- env$log_print
 # supply default arguments
-if ( ! exists("iter.max"          , env) ) env$iter.max  <- 10
+if ( ! exists("iter_max"          , env) ) env$iter_max  <- 20
-if ( ! exists("nstart"            , env) ) env$nstart    <- 1
+if ( ! exists("nstart"            , env) ) env$nstart    <- 20
 if ( ! exists("algorithm"         , env) ) env$algorithm <- 'Hartigan-Wong'
-if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'k'
+if ( ! exists("categorical_prefix", env) ) env$categorical_prefix <- 'c'
 if ( ! exists("ksamples"          , env) ) env$ksamples  <- c()
 if ( ! exists("kfeatures"         , env) ) env$kfeatures <- c()
 # check mandatory arguments
 expected <- c(
 "log_print"
 , "slots"
 )
 missing_from_env <- setdiff(expected, (ls(env)))
 if ( length(missing_from_env) > 0 ) {
 print(paste(c('expected environment members not found: ', as.character(missing_from_env)), collapse = ", "))
-lapply(w4kmeans_usage(),print)
+lapply(w4mkmeans_usage(),log_action)
 stop("w4mkmeans: contract has been broken")
 }
 # extract parameters from 'env'
 log_action  <- env$log_print
 scores          <- c( "clusterOn\tk\ttotalSS\tbetweenSS\tproportion" )
 sampleMetadata  <- env$sampleMetadata
 featureMetadata <- env$variableMetadata
 return (i)            # return results, if any
 }
 ksamples        <- positive_ints(env$ksamples , "ksamples")
 kfeatures       <- positive_ints(env$kfeatures, "kfeatures")
+log_action("w4mkmeans: preparing data matrix")
+# prepare data matrix (normalize, eliminate zero-variance rows, etc.; no transformation)
+dm_en <- new.env()
+dm_en$log <- c()
+preparation_result <- tryCatchFunc(function(){
+dm <- prepare.data.matrix(
+x.matrix = env$dataMatrix
+, data.transformation = function(x) { x }
+, en = dm_en
+)
+my_log <- dm_en$log
+for ( i in 1:length(my_log) ) {
+log_action(my_log[i])
+}
+dm
+})
+if ( !preparation_result$success ) {
+postmortem <- paste("prepare.data.matrix failed:", preparation_result$msg)
+log_action(postmortem)
+stop(postmortem)
+}
+env$preparedDataMatrix <- preparation_result$value
+log_action("w4mkmeans: determining evaluation mode")
 myLapply <- parLapply
 cl <- NULL
 tryCatch(
 expr = {
-cl <- makePSOCKcluster(names = slots)
+outfile <- ""
+# outfile: Where to direct the stdout and stderr connection output from the workers.
+#   - "" indicates no redirection (which may only be useful for workers on the local machine).
+#   - Defaults to ‘/dev/null’ (‘nul:’ on Windows).
+#   - The other possibility is a file path on the worker's host.
+#     - Files will be opened in append mode, as all workers log to the same file.
+cl <- makePSOCKcluster( names = slots, outfile = outfile )
 }
 , error = function(e) {
 log_action(sprintf("w4kmeans: falling back to serial evaluation because makePSOCKcluster(names = %d) threw an exception", slots))
 # mimic parLapply, but without parallelization (as a last resort)
 myLapply <<- function(cl, ...) lapply(...)
 }
 )
 if ( identical(myLapply, parLapply) ) {
 log_action(sprintf("w4mkmeans: using parallel evaluation with %d slots", slots))
 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
+# note varlist: names of references must be passed to the cluster so that they can be resolved
 clusterExport(
 cl = cl
 , varlist = c(
-"tryCatchFunc"
+"tryCatchFunc"                     # required by calc_kmeans_one_dimension_one_k
-, "calc_kmeans_one_dimension_one_k"
+, "format_error"                     # required by tryCatchFunc when errors are caught
-, "prepare.data.matrix"
+, "iso_date"                         # required by log_print
+, "log_print"                        # required by calc_kmeans_one_dimension_one_k
 )
 )
 final <- function(cl) {
 # from ?makePSOCKcluster: "It is good practice to shut down the workers by calling stopCluster."
 if ( !is.null(cl) ) {
 #   - The inner list has keys "clusters" and "scores"
 # for each $i in ksamples, append column 'k$i' to data frame sampleMetadata
 ksamples_length <- length(ksamples)
 if ( ksamples_length > 0 ) {
 smpl_result_list <- myLapply(
 cl = cl
 , ksamples
 , calc_kmeans_one_dimension_one_k
 , env = env
 , dimension = "samples"
 }
 # for each $i in kfeatures, append column 'k$i' to data frame featureMetadata
 kfeatures_length <- length(kfeatures)
 if ( kfeatures_length > 0 ) {
 feat_result_list <- myLapply(
 cl = cl
 , kfeatures
 , calc_kmeans_one_dimension_one_k
 , env = env
 , dimension = "features"
 scores <- c(scores, result$value$scores)
 }
 }
 }
 return (
 list(
 variableMetadata = featureMetadata
 , sampleMetadata   = sampleMetadata
 , scores           = scores
 )
 )
 }
-, finally = final(cl)
+, finally = {
+final(cl)
+}
 )
 }
 # calculate k-means for features or samples
 #   - recall that the dataMatrix has features in rows and samples in columns
 # return value:
 #   list(clusters = km$cluster, scores = scores)
 # arguments:
 #   env:
 #     environment having dataMatrix
 #   dimension:
 #   - "samples":  produce clusters column to add to the sampleMetadata table
 #     integer, the number of clusters to make
 calc_kmeans_one_dimension_one_k <- function(k, env, dimension = "samples") {
 # abort if environment is not as expected
 if ( is.null(env) || ! is.environment(env) ) {
 stop("calc_kmeans_one_dimension_one_k - argument 'env' is not an environment")
 }
 if ( ! exists("log_print", env) || ! is.function(env$log_print) ) {
 stop("calc_kmeans_one_dimension_one_k - argument 'env' - environment does not include log_print or it is not a function")
 }
+log_action  <- env$log_print
 # abort if k is not as expected
 if ( ! is.numeric(k) ) {
 stop(sprintf("calc_kmeans_one_dimension_one_k - expected numeric argument 'k' but type is %s", typeof(k)))
 }
 k <- as.integer(k)
 # abort if dimension is not as expected
 if (   ! is.character(dimension)
 || ! Reduce( f =`|`, x = sapply(X = c("features","samples"), FUN = `==`, dimension), init = FALSE) ) {
 stop("calc_kmeans_one_dimension_one_k - argument 'dimension' is neither 'features' nor 'samples'")
 }
-dm           <- env$dataMatrix
+dm           <- env$preparedDataMatrix
-iter.max     <- env$iter.max
+iter_max     <- env$iter_max
 nstart       <- env$nstart
 algorithm    <- env$algorithm
 dim_features <- dimension == "features"
 # tryCatchFunc produces a list
-#   On success of expr(), tryCatchFunc produces
+#   On success of func(), tryCatchFunc produces
-#     list(success TRUE, value = expr(), msg = "")
+#     list(success = TRUE, value = func(), msg = "")
-#   On failure of expr(), tryCatchFunc produces
+#   On failure of func(), tryCatchFunc produces
 #     list(success = FALSE, value = NA, msg = "the error message")
-result_list <- tryCatchFunc( expr = function() {
+result_list <- tryCatchFunc( func = function() {
 # kmeans clusters the rows; features are the columns of args_env$dataMatrix; samples, the rows
 # - to calculate sample-clusters, no transposition is needed because samples are rows
 # - to calculate feature-clusters, transposition is needed so that features will be the rows
-if ( ! dim_features ) dm <- t(dm)
+if ( ! dim_features ) {
-dm <- prepare.data.matrix( x.matrix = dm, data.transformation = function(x) { x } )
+dm <- t(dm)
+}
 # need to set.seed to get reproducible results from kmeans
 set.seed(4567)
 # do the k-means clustering
-km <- kmeans( x = dm, centers = k, iter.max, nstart = nstart, algorithm = algorithm )
+withCallingHandlers(
+{
+km <<- kmeans( x = dm, centers = k, iter.max = iter_max, nstart = nstart, algorithm = algorithm )
+}
+, warning = function(w) {
+lw <- list(w)
+smplwrn <- as.character(w[[1]])
+log_print(
+sprintf( "Warning for %s: center = %d, nstart = %d, iter_max = %d: %s"
+, if (dim_features) "features" else "samples"
+, k
+, nstart
+, iter_max
+, smplwrn
+)
+)
+}
+)
+# collect the scores
 scores <-
 sprintf("%s\t%d\t%0.5e\t%0.5e\t%0.5f"
 , dimension
 , k
 , km$totss
 , km$betweenss
 , km$betweenss/km$totss
 )
+# return list of results
 list(clusters = km$cluster, scores = scores)
 })
+# return either
+#     list(success = TRUE, value = func(), msg = "")
+# or
+#     list(success = FALSE, value = NA, msg = "the error message")
 return ( result_list )
 }
+# vim: sw=2 ts=2 et :

Mercurial > repos > eschen42 > w4mkmeans

comparison w4mkmeans_routines.R @ 6:3f72a635a075 draft default tip