annotate nb_clust_G.R @ 1:a23e4fded433 draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit c41939f1cdc03331ec021d47495576a6b0c5fd14
author ecology
date Wed, 16 Oct 2024 11:45:06 +0000
parents 88964fcccfef
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
2
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
3 #load packages
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
4 library(cluster)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
5 library(dplyr)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
6 library(tidyverse)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
7
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
8 #load arguments
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
9 args = commandArgs(trailingOnly=TRUE)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
10 if (length(args)==0)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
11 {
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
12 stop("This tool needs at least one argument")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
13 }else{
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
14 enviro <- args[1]
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
15 taxa_list <- args[2]
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
16 preds <- args[3]
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
17 max_k <- as.numeric(args[4])
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
18 metric <- args[5]
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
19 sample <- as.numeric(args[6])
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
20 }
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
21
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
22 #load data
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
23
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
25
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
26 ##List of modelled taxa used for clustering
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
28 names(tv) <- c("a")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
29
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
30 ################Grouping of taxa if multiple prediction files entered ################
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
31
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
32 data_split = str_split(preds,",")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
33 data.bio = NULL
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
34
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
35 for (i in 1:length(data_split[[1]])) {
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
37 data.bio <- rbind(data.bio,data.bio1)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
38 remove(data.bio1)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
39 }
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
40
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
41 names(data.bio) <- c("lat", "long", "pred", "taxon")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
42
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
43 #keep selected taxa
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),]
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
45
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
47
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
48 #format data
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
49
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data))
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
51 test3 <- data.frame(test3)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
52 names(test3) <- unique(data.bio$taxon)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
53
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
55
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
56 #Max number of clusters to test
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
57 max_k <- max_k
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
58
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
59 # Initialization of vectors to store SIH indices
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
60 sih_values <- rep(0, max_k)
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
61
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
62 # Calculation of the SIH index for each number of clusters
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
63 for (k in 2:max_k) {
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
64 # Clara execution
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k))
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
66 # Calculation of the SIH index
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
67 sih_values[k] <- clara_res$silinfo$avg.width
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
68 }
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
69
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
70 # Plot SIH Index Chart by Number of Clusters
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
71 png("Indices_SIH.png")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
88964fcccfef planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
73 dev.off()