Mercurial > repos > ecology > claraguess
comparison claraguess.R @ 0:52d4151e00d8 draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
| author | ecology |
|---|---|
| date | Wed, 28 May 2025 10:12:06 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:52d4151e00d8 |
|---|---|
| 1 ##30/04/2025 | |
| 2 ##Jean Le Cras | |
| 3 ### Clustering with Clara algorithm with an option to determine the optimal number of clusters based on SIH index | |
| 4 | |
| 5 #load libraries | |
| 6 library(cluster) | |
| 7 library(dplyr) | |
| 8 library(tidyverse) | |
| 9 | |
| 10 #load arguments | |
| 11 args <- commandArgs(trailingOnly = TRUE) | |
| 12 if (length(args)==0) { | |
| 13 stop("This tool needs at least one argument") | |
| 14 } | |
| 15 | |
| 16 #load data | |
| 17 enviro_path <- args[1] | |
| 18 preds_path <- args[2] | |
| 19 taxas_path <- args[3] | |
| 20 type <- args[4] | |
| 21 k <- as.integer(args[5]) | |
| 22 metric <- args[6] | |
| 23 samples <- as.integer(args[7]) | |
| 24 env.data <- read.table(enviro_path, sep = "\t", header = TRUE, dec = ".", na.strings = "-9999") | |
| 25 | |
| 26 data_split = str_split(preds_path, ",") | |
| 27 preds.data = NULL | |
| 28 | |
| 29 for (i in 1:length(data_split[[1]])) { | |
| 30 df <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings="NA") | |
| 31 preds.data <- rbind(preds.data, df) | |
| 32 remove(df) | |
| 33 } | |
| 34 | |
| 35 names(preds.data) <- c("lat", "long", "pred", "taxa") | |
| 36 | |
| 37 development_traits <- str_split(readLines(taxas_path), "\t") | |
| 38 | |
| 39 #select the clara model with the optimal number of clusters | |
| 40 model <- NULL | |
| 41 | |
| 42 if (type == "auto") { | |
| 43 sih_scores <- c() | |
| 44 models <- list() | |
| 45 | |
| 46 for (i in 2:k) { | |
| 47 models[[i]] <- clara(preds.data$pred, i, metric = metric, samples = samples, stand = TRUE) | |
| 48 sih_scores[i] <- models[[i]]$silinfo$avg.width | |
| 49 } | |
| 50 png("sih_scores.png") | |
| 51 plot(2:k, sih_scores[2:k], type = "b", xlab = "Number of clusters", ylab = "SIH index") | |
| 52 dev.off() | |
| 53 | |
| 54 best_k <- which.max(sih_scores[3:k]) + 2 | |
| 55 model <- models[[best_k]] | |
| 56 k <- best_k | |
| 57 } else { | |
| 58 model <- clara(preds.data$pred, k, metric = metric, samples = samples, stand = TRUE) | |
| 59 } | |
| 60 | |
| 61 #saving results | |
| 62 png("silhouette_plot.png") | |
| 63 plot(silhouette(model), main = paste("Silhouette plot for k =", k)) | |
| 64 dev.off() | |
| 65 | |
| 66 data.test <- matrix(preds.data$pred, nrow = nrow(env.data), ncol = nrow(preds.data) / nrow(env.data)) | |
| 67 data.test <- data.frame(data.test) | |
| 68 names(data.test) <- unique(preds.data$development) | |
| 69 | |
| 70 full.data <- cbind(preds.data[1:nrow(data.test), 1:2], model$clustering) | |
| 71 names(full.data) <- c("lat", "long", "cluster") | |
| 72 full.data <- cbind(full.data, data.test, env.data[, 3:ncol(env.data)]) | |
| 73 | |
| 74 write.table(full.data[1:3], file = "data_cluster.tabular", quote = FALSE, sep = "\t", row.names = FALSE) | |
| 75 write.table(full.data, file = "clustered_taxas_env.tabular", quote = FALSE, sep = "\t", row.names = FALSE) |
