Mercurial > repos > recetox > recetox_aplcms_align_features
comparison utils.R @ 2:4d1583e15960 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
| author | recetox |
|---|---|
| date | Mon, 13 Feb 2023 10:24:40 +0000 |
| parents | e867e0c90d29 |
| children | d0b77b35819f |
comparison
equal
deleted
inserted
replaced
| 1:148327f1e1be | 2:4d1583e15960 |
|---|---|
| 1 library(recetox.aplcms) | 1 library(recetox.aplcms) |
| 2 | 2 |
| 3 align_features <- function(sample_names, ...) { | 3 get_env_sample_name <- function() { |
| 4 aligned <- feature.align(...) | 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) |
| 5 feature_names <- seq_len(nrow(aligned$pk.times)) | 5 if (nchar(sample_name) == 0) { |
| 6 | 6 sample_name <- NA |
| 7 list( | 7 } |
| 8 mz_tolerance = as.numeric(aligned$mz.tol), | 8 if (is.na(sample_name)) { |
| 9 rt_tolerance = as.numeric(aligned$chr.tol), | 9 message("The mzML file does not contain run ID.") |
| 10 rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times), | 10 } |
| 11 int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs) | 11 return(sample_name) |
| 12 ) | |
| 13 } | 12 } |
| 14 | 13 |
| 15 get_sample_name <- function(filename) { | 14 save_sample_name <- function(df, sample_name) { |
| 16 tools::file_path_sans_ext(basename(filename)) | 15 attr(df, "sample_name") <- sample_name |
| 16 return(df) | |
| 17 } | 17 } |
| 18 | 18 |
| 19 as_feature_crosstab <- function(feature_names, sample_names, data) { | 19 load_sample_name <- function(df) { |
| 20 colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names) | 20 sample_name <- attr(df, "sample_name") |
| 21 rownames(data) <- feature_names | 21 if (is.null(sample_name)) { |
| 22 as.data.frame(data) | 22 return(NA) |
| 23 } else { | |
| 24 return(sample_name) | |
| 25 } | |
| 23 } | 26 } |
| 24 | 27 |
| 25 as_feature_sample_table <- function(rt_crosstab, int_crosstab) { | 28 save_data_as_parquet_file <- function(data, filename) { |
| 26 feature_names <- rownames(rt_crosstab) | 29 arrow::write_parquet(data, filename) |
| 27 sample_names <- colnames(rt_crosstab)[- (1:4)] | |
| 28 | |
| 29 feature_table <- data.frame( | |
| 30 feature = feature_names, | |
| 31 mz = rt_crosstab[, 1], | |
| 32 rt = rt_crosstab[, 2] | |
| 33 ) | |
| 34 | |
| 35 # series of conversions to produce a table type from data.frame | |
| 36 rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)])) | |
| 37 int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)])) | |
| 38 | |
| 39 crosstab_axes <- list(feature = feature_names, sample = sample_names) | |
| 40 dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes | |
| 41 | |
| 42 x <- as.data.frame(rt_crosstab, responseName = "sample_rt") | |
| 43 y <- as.data.frame(int_crosstab, responseName = "sample_intensity") | |
| 44 | |
| 45 data <- merge(x, y, by = c("feature", "sample")) | |
| 46 data <- merge(feature_table, data, by = "feature") | |
| 47 data | |
| 48 } | 30 } |
| 49 | 31 |
| 50 load_features <- function(files) { | 32 load_data_from_parquet_file <- function(filename) { |
| 51 files_list <- sort_samples_by_acquisition_number(files) | 33 return(arrow::read_parquet(filename)) |
| 52 features <- lapply(files_list, arrow::read_parquet) | 34 } |
| 53 features <- lapply(features, as.matrix) | 35 |
| 36 load_parquet_collection <- function(files) { | |
| 37 features <- lapply(files, arrow::read_parquet) | |
| 38 features <- lapply(features, tibble::as_tibble) | |
| 54 return(features) | 39 return(features) |
| 55 } | 40 } |
| 56 | 41 |
| 57 save_data_as_parquet_files <- function(data, subdir) { | 42 save_parquet_collection <- function(table, sample_names, subdir) { |
| 58 dir.create(subdir) | 43 dir.create(subdir) |
| 59 for (i in 0:(length(data) - 1)) { | 44 for (i in seq_len(length(table$feature_tables))) { |
| 60 filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet")) | 45 filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet")) |
| 61 arrow::write_parquet(as.data.frame(data[i + 1]), filename) | 46 feature_table <- as.data.frame(table$feature_tables[[i]]) |
| 62 } | 47 feature_table <- save_sample_name(feature_table, sample_names[i]) |
| 48 arrow::write_parquet(feature_table, filename) | |
| 49 } | |
| 63 } | 50 } |
| 64 | 51 |
| 65 save_aligned_features <- function(aligned, rt_file, int_file, tol_file) { | 52 sort_by_sample_name <- function(tables, sample_names) { |
| 66 arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file) | 53 return(tables[order(sample_names)]) |
| 67 arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file) | |
| 68 | |
| 69 mz_tolerance <- c(aligned$mz_tolerance) | |
| 70 rt_tolerance <- c(aligned$rt_tolerance) | |
| 71 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) | |
| 72 } | 54 } |
| 73 | 55 |
| 74 load_aligned_features <- function(rt_file, int_file, tol_file) { | 56 save_tolerances <- function(table, tol_file) { |
| 75 rt_cross_table <- arrow::read_parquet(rt_file) | 57 mz_tolerance <- c(table$mz_tol_relative) |
| 76 int_cross_table <- arrow::read_parquet(int_file) | 58 rt_tolerance <- c(table$rt_tol_relative) |
| 77 tolerances_table <- arrow::read_parquet(tol_file) | 59 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) |
| 78 | |
| 79 result <- list() | |
| 80 result$mz_tolerance <- tolerances_table$mz_tolerance | |
| 81 result$rt_tolerance <- tolerances_table$rt_tolerance | |
| 82 result$rt_crosstab <- rt_cross_table | |
| 83 result$int_crosstab <- int_cross_table | |
| 84 return(result) | |
| 85 } | 60 } |
| 86 | 61 |
| 87 recover_signals <- function(cluster, | 62 get_mz_tol <- function(tolerances) { |
| 88 filenames, | 63 return(tolerances$mz_tolerance) |
| 89 extracted, | |
| 90 corrected, | |
| 91 aligned, | |
| 92 mz_tol = 1e-05, | |
| 93 mz_range = NA, | |
| 94 rt_range = NA, | |
| 95 use_observed_range = TRUE, | |
| 96 min_bandwidth = NA, | |
| 97 max_bandwidth = NA, | |
| 98 recover_min_count = 3) { | |
| 99 if (!is(cluster, "cluster")) { | |
| 100 cluster <- parallel::makeCluster(cluster) | |
| 101 on.exit(parallel::stopCluster(cluster)) | |
| 102 } | |
| 103 | |
| 104 clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker")) | |
| 105 clusterEvalQ(cluster, library("splines")) | |
| 106 | |
| 107 recovered <- parLapply(cluster, seq_along(filenames), function(i) { | |
| 108 recover.weaker( | |
| 109 loc = i, | |
| 110 filename = filenames[[i]], | |
| 111 this.f1 = extracted[[i]], | |
| 112 this.f2 = corrected[[i]], | |
| 113 pk.times = aligned$rt_crosstab, | |
| 114 aligned.ftrs = aligned$int_crosstab, | |
| 115 orig.tol = mz_tol, | |
| 116 align.mz.tol = aligned$mz_tolerance, | |
| 117 align.chr.tol = aligned$rt_tolerance, | |
| 118 mz.range = mz_range, | |
| 119 chr.range = rt_range, | |
| 120 use.observed.range = use_observed_range, | |
| 121 bandwidth = 0.5, | |
| 122 min.bw = min_bandwidth, | |
| 123 max.bw = max_bandwidth, | |
| 124 recover.min.count = recover_min_count | |
| 125 ) | |
| 126 }) | |
| 127 | |
| 128 feature_table <- aligned$rt_crosstab[, 1:4] | |
| 129 rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times)) | |
| 130 int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs)) | |
| 131 | |
| 132 feature_names <- rownames(feature_table) | |
| 133 sample_names <- colnames(aligned$rt_crosstab[, - (1:4)]) | |
| 134 | |
| 135 list( | |
| 136 extracted_features = lapply(recovered, function(x) x$this.f1), | |
| 137 corrected_features = lapply(recovered, function(x) x$this.f2), | |
| 138 rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab), | |
| 139 int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab) | |
| 140 ) | |
| 141 } | 64 } |
| 142 | 65 |
| 143 create_feature_sample_table <- function(features) { | 66 get_rt_tol <- function(tolerances) { |
| 144 table <- as_feature_sample_table( | 67 return(tolerances$rt_tolerance) |
| 145 rt_crosstab = features$rt_crosstab, | |
| 146 int_crosstab = features$int_crosstab | |
| 147 ) | |
| 148 return(table) | |
| 149 } | 68 } |
| 69 | |
| 70 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { | |
| 71 save_data_as_parquet_file(aligned_features$metadata, metadata_file) | |
| 72 save_data_as_parquet_file(aligned_features$rt, rt_file) | |
| 73 save_data_as_parquet_file(aligned_features$intensity, intensity_file) | |
| 74 } | |
| 75 | |
| 76 select_table_with_sample_name <- function(tables, sample_name) { | |
| 77 sample_names <- lapply(tables, load_sample_name) | |
| 78 index <- which(sample_names == sample_name) | |
| 79 if (length(index) > 0) { | |
| 80 return(tables[[index]]) | |
| 81 } else { | |
| 82 stop(sprintf("Mismatch - sample name '%s' not present in %s", | |
| 83 sample_name, paste(sample_names, collapse = ", "))) | |
| 84 } | |
| 85 } | |
| 86 | |
| 87 select_adjusted <- function(recovered_features) { | |
| 88 return(recovered_features$adjusted_features) | |
| 89 } | |
| 90 | |
| 91 known_table_columns <- function() { | |
| 92 c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", | |
| 93 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", | |
| 94 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", | |
| 95 "int_min(log)", "int_max(log)") | |
| 96 } | |
| 97 | |
| 98 save_known_table <- function(table, filename) { | |
| 99 columns <- known_table_columns() | |
| 100 arrow::write_parquet(table$known_table[columns], filename) | |
| 101 } | |
| 102 | |
| 103 read_known_table <- function(filename) { | |
| 104 arrow::read_parquet(filename, col_select = known_table_columns()) | |
| 105 } | |
| 106 | |
| 107 save_pairing <- function(table, filename) { | |
| 108 df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old")) | |
| 109 arrow::write_parquet(df, filename) | |
| 110 } | |
| 111 | |
| 112 join_tables_to_list <- function(metadata, rt_table, intensity_table) { | |
| 113 features <- new("list") | |
| 114 features$metadata <- metadata | |
| 115 features$intensity <- intensity_table | |
| 116 features$rt <- rt_table | |
| 117 return(features) | |
| 118 } | |
| 119 | |
| 120 validate_sample_names <- function(sample_names) { | |
| 121 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { | |
| 122 stop(sprintf("Sample names absent or not unique - provided sample names: %s", | |
| 123 paste(sample_names, collapse = ", "))) | |
| 124 } | |
| 125 } |
