annotate nb_clust_G.R @ 1:c9109f90d229 draft

planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit c41939f1cdc03331ec021d47495576a6b0c5fd14
author ecology
date Wed, 16 Oct 2024 11:44:42 +0000
parents 92ca83b60089
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
2
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
3 #load packages
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
4 library(cluster)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
5 library(dplyr)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
6 library(tidyverse)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
7
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
8 #load arguments
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
9 args = commandArgs(trailingOnly=TRUE)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
10 if (length(args)==0)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
11 {
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
12 stop("This tool needs at least one argument")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
13 }else{
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
14 enviro <- args[1]
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
15 taxa_list <- args[2]
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
16 preds <- args[3]
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
17 max_k <- as.numeric(args[4])
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
18 metric <- args[5]
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
19 sample <- as.numeric(args[6])
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
20 }
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
21
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
22 #load data
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
23
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
25
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
26 ##List of modelled taxa used for clustering
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
28 names(tv) <- c("a")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
29
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
30 ################Grouping of taxa if multiple prediction files entered ################
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
31
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
32 data_split = str_split(preds,",")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
33 data.bio = NULL
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
34
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
35 for (i in 1:length(data_split[[1]])) {
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
37 data.bio <- rbind(data.bio,data.bio1)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
38 remove(data.bio1)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
39 }
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
40
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
41 names(data.bio) <- c("lat", "long", "pred", "taxon")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
42
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
43 #keep selected taxa
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),]
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
45
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
47
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
48 #format data
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
49
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data))
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
51 test3 <- data.frame(test3)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
52 names(test3) <- unique(data.bio$taxon)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
53
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
55
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
56 #Max number of clusters to test
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
57 max_k <- max_k
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
58
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
59 # Initialization of vectors to store SIH indices
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
60 sih_values <- rep(0, max_k)
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
61
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
62 # Calculation of the SIH index for each number of clusters
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
63 for (k in 2:max_k) {
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
64 # Clara execution
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k))
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
66 # Calculation of the SIH index
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
67 sih_values[k] <- clara_res$silinfo$avg.width
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
68 }
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
69
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
70 # Plot SIH Index Chart by Number of Clusters
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
71 png("Indices_SIH.png")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index")
92ca83b60089 planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit e03df85746a3b61a382a5ee7e3357a8bf42a5097
ecology
parents:
diff changeset
73 dev.off()