Mercurial > repos > ecology > claraguess
comparison nb_clust_G.R @ 0:52d4151e00d8 draft default tip
planemo upload for repository https://github.com/galaxyecology/tools-ecology/tree/master/tools/Ecoregionalization_workflow commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
| author | ecology |
|---|---|
| date | Wed, 28 May 2025 10:12:06 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:52d4151e00d8 |
|---|---|
| 1 # Script to determine the optimal number of clusters thanks to the optimization of the SIH index and to produce the files needed in the next step of clustering | |
| 2 | |
| 3 #load packages | |
| 4 library(cluster) | |
| 5 library(dplyr) | |
| 6 library(tidyverse) | |
| 7 | |
| 8 #load arguments | |
| 9 args = commandArgs(trailingOnly=TRUE) | |
| 10 if (length(args)==0) | |
| 11 { | |
| 12 stop("This tool needs at least one argument") | |
| 13 }else{ | |
| 14 enviro <- args[1] | |
| 15 taxa_list <- args[2] | |
| 16 preds <- args[3] | |
| 17 max_k <- as.numeric(args[4]) | |
| 18 metric <- args[5] | |
| 19 sample <- as.numeric(args[6]) | |
| 20 } | |
| 21 | |
| 22 #load data | |
| 23 | |
| 24 env.data <- read.table(enviro, sep="\t", header = TRUE, dec = ".", na.strings = "-9999") | |
| 25 | |
| 26 ##List of modelled taxa used for clustering | |
| 27 tv <- read.table(taxa_list, dec=".", sep=" ", header=F, na.strings = "NA") | |
| 28 names(tv) <- c("a") | |
| 29 | |
| 30 ################Grouping of taxa if multiple prediction files entered ################ | |
| 31 | |
| 32 data_split = str_split(preds,",") | |
| 33 data.bio = NULL | |
| 34 | |
| 35 for (i in 1:length(data_split[[1]])) { | |
| 36 data.bio1 <- read.table(data_split[[1]][i], dec=".", sep="\t", header=T, na.strings = "NA") | |
| 37 data.bio <- rbind(data.bio,data.bio1) | |
| 38 remove(data.bio1) | |
| 39 } | |
| 40 | |
| 41 names(data.bio) <- c("lat", "long", "pred", "taxon") | |
| 42 | |
| 43 #keep selected taxa | |
| 44 data.bio <- data.bio[which(data.bio$taxon %in% tv$a),] | |
| 45 | |
| 46 write.table(data.bio,file="data_bio.tsv",sep="\t",quote=F,row.names=F) | |
| 47 | |
| 48 #format data | |
| 49 | |
| 50 test3 <- matrix(data.bio$pred , nrow = nrow(env.data), ncol = nrow(data.bio)/nrow(env.data)) | |
| 51 test3 <- data.frame(test3) | |
| 52 names(test3) <- unique(data.bio$taxon) | |
| 53 | |
| 54 write.table(test3, file="data_to_clus.tsv", sep="\t",quote=F,row.names=F) | |
| 55 | |
| 56 #Max number of clusters to test | |
| 57 max_k <- max_k | |
| 58 | |
| 59 # Initialization of vectors to store SIH indices | |
| 60 sih_values <- rep(0, max_k) | |
| 61 | |
| 62 # Calculation of the SIH index for each number of clusters | |
| 63 for (k in 2:max_k) { | |
| 64 # Clara execution | |
| 65 clara_res <- clara(test3, k, metric =metric, samples = sample, sampsize = min(nrow(test3), (nrow(data.bio)/nrow(test3))+2*k)) | |
| 66 # Calculation of the SIH index | |
| 67 sih_values[k] <- clara_res$silinfo$avg.width | |
| 68 } | |
| 69 | |
| 70 # Plot SIH Index Chart by Number of Clusters | |
| 71 png("Indices_SIH.png") | |
| 72 plot(2:max_k, sih_values[2:max_k], type = "b", xlab = "Number of clusters", ylab = "SIH index") | |
| 73 dev.off() |
