Mercurial > repos > ecology > ecoregion_clara_cluster
annotate claraguess.R @ 2:a7269e5eed27 draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
author | ecology |
---|---|
date | Wed, 28 May 2025 10:11:06 +0000 |
parents | |
children |
rev | line source |
---|---|
2
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
1 ##30/04/2025 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
2 ##Jean Le Cras |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
3 ### Clustering with Clara algorithm with an option to determine the optimal number of clusters based on SIH index |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
4 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
5 #load libraries |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
6 library(cluster) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
7 library(dplyr) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
8 library(tidyverse) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
9 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
10 #load arguments |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
11 args <- commandArgs(trailingOnly = TRUE) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
12 if (length(args)==0) { |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
13 stop("This tool needs at least one argument") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
14 } |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
15 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
16 #load data |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
17 enviro_path <- args[1] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
18 preds_path <- args[2] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
19 taxas_path <- args[3] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
20 type <- args[4] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
21 k <- as.integer(args[5]) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
22 metric <- args[6] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
23 samples <- as.integer(args[7]) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
24 env.data <- read.table(enviro_path, sep = "\t", header = TRUE, dec = ".", na.strings = "-9999") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
25 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
26 data_split = str_split(preds_path, ",") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
27 preds.data = NULL |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
28 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
29 for (i in 1:length(data_split[[1]])) { |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
30 df <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings="NA") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
31 preds.data <- rbind(preds.data, df) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
32 remove(df) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
33 } |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
34 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
35 names(preds.data) <- c("lat", "long", "pred", "taxa") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
36 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
37 development_traits <- str_split(readLines(taxas_path), "\t") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
38 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
39 #select the clara model with the optimal number of clusters |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
40 model <- NULL |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
41 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
42 if (type == "auto") { |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
43 sih_scores <- c() |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
44 models <- list() |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
45 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
46 for (i in 2:k) { |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
47 models[[i]] <- clara(preds.data$pred, i, metric = metric, samples = samples, stand = TRUE) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
48 sih_scores[i] <- models[[i]]$silinfo$avg.width |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
49 } |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
50 png("sih_scores.png") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
51 plot(2:k, sih_scores[2:k], type = "b", xlab = "Number of clusters", ylab = "SIH index") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
52 dev.off() |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
53 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
54 best_k <- which.max(sih_scores[3:k]) + 2 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
55 model <- models[[best_k]] |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
56 k <- best_k |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
57 } else { |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
58 model <- clara(preds.data$pred, k, metric = metric, samples = samples, stand = TRUE) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
59 } |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
60 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
61 #saving results |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
62 png("silhouette_plot.png") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
63 plot(silhouette(model), main = paste("Silhouette plot for k =", k)) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
64 dev.off() |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
65 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
66 data.test <- matrix(preds.data$pred, nrow = nrow(env.data), ncol = nrow(preds.data) / nrow(env.data)) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
67 data.test <- data.frame(data.test) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
68 names(data.test) <- unique(preds.data$development) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
69 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
70 full.data <- cbind(preds.data[1:nrow(data.test), 1:2], model$clustering) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
71 names(full.data) <- c("lat", "long", "cluster") |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
72 full.data <- cbind(full.data, data.test, env.data[, 3:ncol(env.data)]) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
73 |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
74 write.table(full.data[1:3], file = "data_cluster.tabular", quote = FALSE, sep = "\t", row.names = FALSE) |
a7269e5eed27
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
ecology
parents:
diff
changeset
|
75 write.table(full.data, file = "clustered_taxas_env.tabular", quote = FALSE, sep = "\t", row.names = FALSE) |