Mercurial > repos > ecology > wormsmeasurements
comparison wormsmeasurements.R @ 0:4da00cf02719 draft default tip
planemo upload for repository https://github.com/jeanlecras/tools-ecology/tree/master/tools/WormsMeasurements commit ced658540f05bb07e1e687af30a3fa4ea8e4803c
author | ecology |
---|---|
date | Wed, 28 May 2025 10:12:16 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4da00cf02719 |
---|---|
1 ##05/05/2025 | |
2 ##Jean Le Cras | |
3 ### Enrich dataset with data from WoRMS | |
4 | |
5 #load libraries | |
6 library(tidyverse) | |
7 library(worrms) | |
8 library(fastDummies) | |
9 | |
10 ### parameters | |
11 args <- commandArgs(trailingOnly = TRUE) | |
12 if (length(args) == 0) { | |
13 stop("This tool needs at least one argument") | |
14 } | |
15 | |
16 scientificName_name <- args[3] | |
17 occurrence <- read.csv(args[1], header=T, sep="\t") %>% | |
18 arrange(.[[scientificName_name]]) | |
19 measurement_types <- unlist(str_split(args[2], ",")) | |
20 include_inherited <- ifelse(args[4]=="true", T, F) | |
21 pivot_wider <- ifelse(args[5]=="true", T, F) | |
22 exclude_NA <- ifelse(args[6]=="true", T, F) | |
23 | |
24 # regex to only keep genus and specific epithet from scientific names | |
25 regex_find <- "^([A-Z][^A-Z(]+)(.*)$" | |
26 regex_replace <- "\\1" | |
27 | |
28 | |
29 # function to extract the measurement values from the attributes data tibble | |
30 extract_traits_values <- function(traits_data) { | |
31 result <- setNames(rep(NA, length(measurement_types)), measurement_types) | |
32 | |
33 if (is.null(traits_data) || nrow(traits_data) == 0) { | |
34 return(result) | |
35 } | |
36 | |
37 traits_filtered <- traits_data %>% | |
38 filter(measurementType %in% measurement_types) %>% | |
39 filter(!is.na(measurementValue)) | |
40 | |
41 if (nrow(traits_filtered) == 0) { | |
42 return(result) | |
43 } | |
44 | |
45 for (i in 1:nrow(traits_filtered)) { | |
46 result[traits_filtered$measurementType[i]] <- traits_filtered$measurementValue[i] | |
47 } | |
48 return(result) | |
49 } | |
50 | |
51 # function to call the call the WoRMS API and get the measurement values | |
52 get_life_history_traits <- function(scientific_name) { | |
53 clean_scientific_name <- trimws(gsub(regex_find, regex_replace, scientific_name)) | |
54 | |
55 if (clean_scientific_name %in% names(cache)) { | |
56 return(cache[[clean_scientific_name]]) | |
57 } | |
58 | |
59 worms_id <- tryCatch( | |
60 wm_name2id(name = clean_scientific_name), | |
61 error = function(e) NA | |
62 ) | |
63 | |
64 if (is.na(worms_id) || length(worms_id) == 0) { | |
65 cache[[clean_scientific_name]] <<- NULL | |
66 return(NULL) | |
67 } | |
68 | |
69 data_attr <- tryCatch( | |
70 wm_attr_data(worms_id, include_inherited=include_inherited), | |
71 error = function(e) NULL | |
72 ) | |
73 | |
74 if (is.null(data_attr)) { | |
75 cache[[clean_scientific_name]] <<- NULL | |
76 return(NULL) | |
77 } | |
78 | |
79 traits <- extract_traits_values(data_attr) | |
80 cache[[clean_scientific_name]] <<- traits | |
81 return(traits) | |
82 } | |
83 | |
84 # a cache to limit API calls | |
85 cache <- list() | |
86 | |
87 # add a columns conataining the lists of values of the measurments requested | |
88 trait_data <- occurrence %>% | |
89 mutate(life_history_traits = map(.data[[scientificName_name]], ~ get_life_history_traits(.x))) | |
90 | |
91 # convert the column of lists to multiple columns of unique values | |
92 trait_data <- trait_data %>% | |
93 unnest_wider(life_history_traits) | |
94 | |
95 # make sur each measurement type has a column | |
96 for (col in measurement_types) { | |
97 if (!(col %in% names(trait_data))) { | |
98 trait_data[[col]] <- NA | |
99 } | |
100 } | |
101 | |
102 # list of quantitativ measurements | |
103 numeric_cols <- c() | |
104 | |
105 # try to convert columns to numeric and remember them | |
106 trait_data <- trait_data %>% | |
107 mutate(across(all_of(measurement_types), ~ { | |
108 numeric_col <- suppressWarnings(as.numeric(.)) | |
109 if (all(is.na(.) == is.na(numeric_col))) { | |
110 numeric_cols <<- c(numeric_cols, cur_column()) | |
111 numeric_col | |
112 } else { | |
113 . | |
114 } | |
115 })) | |
116 | |
117 # filter NA but only in the added columns | |
118 if (exclude_NA) { | |
119 trait_data <- trait_data[complete.cases(trait_data[, measurement_types]),] | |
120 } | |
121 | |
122 # determine what are the qualitativ columns to be one hot encoded | |
123 factor_cols <- setdiff(measurement_types, numeric_cols) | |
124 | |
125 # one hot encode quantitativ columns | |
126 if (pivot_wider & length(factor_cols) > 0) { | |
127 trait_data <- dummy_cols(trait_data, select_columns = factor_cols, remove_selected_columns=T, ignore_na=T) | |
128 } | |
129 | |
130 # write the enriched dataset as tabular | |
131 write.table(trait_data, "enriched_data.tabular", sep="\t", row.names = FALSE) |