Mercurial > repos > eschen42 > w4mcorcov
comparison w4mcorcov_salience.R @ 7:ca9938f2eb6a draft default tip
"planemo upload for repository https://github.com/HegemanLab/w4mcorcov_galaxy_wrapper/tree/master commit 5fd9687d543a48a715b1180caf93abebebd58b0e"
| author | eschen42 |
|---|---|
| date | Tue, 17 Nov 2020 23:29:59 +0000 |
| parents | 0b49916c5c52 |
| children |
comparison
equal
deleted
inserted
replaced
| 6:0b49916c5c52 | 7:ca9938f2eb6a |
|---|---|
| 17 t_data_matrix <- t(data_matrix) | 17 t_data_matrix <- t(data_matrix) |
| 18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { | 18 if ( !is.matrix(t_data_matrix) || !is.numeric(t_data_matrix) ) { |
| 19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") | 19 failure_action("w4msalience: Expected data_matrix to be a matrix (or data.frame) of numeric") |
| 20 return (NULL) | 20 return (NULL) |
| 21 } | 21 } |
| 22 | |
| 23 feature_names <- colnames(t_data_matrix) | |
| 24 | |
| 22 n_features <- ncol(t_data_matrix) | 25 n_features <- ncol(t_data_matrix) |
| 23 n_features_plus_1 <- 1 + n_features | |
| 24 features <- colnames(t_data_matrix) | |
| 25 n_samples <- nrow(t_data_matrix) | 26 n_samples <- nrow(t_data_matrix) |
| 26 if ( length(sample_class) != n_samples ) { | 27 if ( length(sample_class) != n_samples ) { |
| 27 strF(data_matrix) | 28 strF(data_matrix) |
| 28 strF(sample_class) | 29 strF(sample_class) |
| 29 failure_action(sprintf("w4msalience: The data_matrix has %d samples but sample_class has %d", n_samples, length(sample_class))) | 30 failure_action( |
| 31 sprintf( | |
| 32 "w4msalience: The data_matrix has %d samples but sample_class has %d" | |
| 33 , n_samples | |
| 34 , length(sample_class) | |
| 35 ) | |
| 36 ) | |
| 30 return (NULL) | 37 return (NULL) |
| 31 } | 38 } |
| 32 # end sanity checks | 39 # end sanity checks |
| 33 | 40 |
| 34 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." | 41 # "For each feature, 'select sample_class, median(intensity) from feature group by sample_class'." |
| 37 x = as.data.frame(t_data_matrix) | 44 x = as.data.frame(t_data_matrix) |
| 38 , by = list(sample_class) | 45 , by = list(sample_class) |
| 39 , FUN = "median" | 46 , FUN = "median" |
| 40 ) | 47 ) |
| 41 | 48 |
| 42 # "For each feature, 'select sample_class, max(intensity) from feature group by sample_class'." | |
| 43 maxOfFeatureBySampleClassLevel <- aggregate( | |
| 44 x = as.data.frame(t_data_matrix) | |
| 45 , by = list(sample_class) | |
| 46 , FUN = "max" | |
| 47 ) | |
| 48 | |
| 49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." | 49 # "For each feature, 'select sample_class, rcv(intensity) from feature group by sample_class'." |
| 50 # cv is less robust; deviation from normality degrades performance | 50 # cv is less robust; deviation from normality degrades performance |
| 51 # cv(x) == sd(x) / mean(x) | 51 # cv(x) == sd(x) / mean(x) |
| 52 # rcv is a "robust" coefficient of variation, expressed as a proportion | 52 # rcv is a "robust" coefficient of variation, expressed as a proportion |
| 53 # rcv(x) == mad(x) / median(x) | 53 # rcv(x) == mad(x) / median(x) |
| 54 madOfFeatureBySampleClassLevel <- aggregate( | 54 madOfFeatureBySampleClassLevel <- aggregate( |
| 55 x = as.data.frame(t_data_matrix) | 55 x = as.data.frame(t_data_matrix) |
| 56 , by = list(sample_class) | 56 , by = list(sample_class) |
| 57 , FUN = "mad" | 57 , FUN = "mad" |
| 58 ) | 58 ) |
| 59 rcvOfFeatureBySampleClassLevel <- as.matrix( | 59 |
| 60 madOfFeatureBySampleClassLevel[,2:n_features_plus_1] / medianOfFeatureBySampleClassLevel[,2:n_features_plus_1] | 60 # Note that `apply(X=array(1:10), MARGIN = 1, FUN = function(x) return(c(x,x^2)))` |
| 61 # produces a matrix with two rows and ten columns | |
| 62 | |
| 63 my_list <- apply( | |
| 64 X = array(1:n_features) | |
| 65 , MARGIN = 1 | |
| 66 , FUN = function(x) { | |
| 67 my_df <- data.frame( | |
| 68 median = medianOfFeatureBySampleClassLevel[ , 1 + x] | |
| 69 , mad = madOfFeatureBySampleClassLevel[ , 1 + x] | |
| 70 ) | |
| 71 my_df$salient_level <- medianOfFeatureBySampleClassLevel[ , 1] | |
| 72 my_df <- my_df[ order(my_df$median, decreasing = TRUE), ] | |
| 73 my_dist_df <- my_df[ 1:2, ] | |
| 74 # "robust coefficient of variation", i.e., | |
| 75 # mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) | |
| 76 rcv_result <- my_dist_df$mad[1] / my_dist_df$median[1] | |
| 77 dist_result <- | |
| 78 ( my_dist_df$median[1] - my_dist_df$median[2] ) / | |
| 79 sqrt( my_dist_df$mad[1] * my_dist_df$mad[2] ) | |
| 80 if (is.infinite(dist_result) || is.nan(dist_result)) | |
| 81 dist_result <- 0 | |
| 82 mean_median <- mean(my_df$median) | |
| 83 salience_result <- if (mean_median > 0) my_df$median[1] / mean_median else 0 | |
| 84 return ( | |
| 85 data.frame( | |
| 86 dist_result = dist_result | |
| 87 , max_median = my_df$median[1] | |
| 88 , mean_median = mean_median | |
| 89 , salience_result = salience_result | |
| 90 , salient_level = my_df$salient_level[1] | |
| 91 , rcv_result = rcv_result | |
| 92 ) | |
| 93 ) | |
| 94 } | |
| 61 ) | 95 ) |
| 62 rcvOfFeatureBySampleClassLevel[is.nan(rcvOfFeatureBySampleClassLevel)] <- max(9999,max(rcvOfFeatureBySampleClassLevel, na.rm = TRUE)) | 96 results_matrix <- sapply(X = 1:n_features, FUN = function(i) my_list[[i]]) |
| 97 results_df <- as.data.frame(t(results_matrix)) | |
| 63 | 98 |
| 64 # "For each feature, 'select max(max_feature_intensity) from feature'." | 99 relative_salient_distance <- unlist(results_df$dist_result) |
| 65 maxApplyMaxOfFeatureBySampleClassLevel <- sapply( | 100 salience <- unlist(results_df$salience_result) |
| 66 X = 1:n_features | 101 salient_level <- unlist(results_df$salient_level) |
| 67 , FUN = function(i) { | 102 max_median <- unlist(results_df$max_median) |
| 68 match( | 103 mean_median <- unlist(results_df$mean_median) |
| 69 max(maxOfFeatureBySampleClassLevel[, i + 1]) | 104 rcv_result <- unlist(results_df$rcv_result) |
| 70 , maxOfFeatureBySampleClassLevel[, i + 1] | 105 |
| 71 ) | 106 salience_df <- |
| 72 } | 107 data.frame( |
| 108 # the feature name | |
| 109 feature = feature_names | |
| 110 # the name (or factor-level) of the class-level with the highest median intensity for the feature | |
| 111 , max_level = salient_level | |
| 112 # the median intensity for the feature and the level max_level | |
| 113 , max_median = max_median | |
| 114 # the distance between the maximum intensities for the feature at the two highest levels | |
| 115 , relative_salient_distance = relative_salient_distance | |
| 116 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level | |
| 117 , salience_rcv = rcv_result | |
| 118 # the mean of the medians of intensity for all class-levels for the feature | |
| 119 , mean_median = mean_median | |
| 120 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature | |
| 121 , salience = salience | |
| 122 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) | |
| 123 , stringsAsFactors = FALSE | |
| 73 ) | 124 ) |
| 74 | |
| 75 # "For each feature, 'select mean(median_feature_intensity) from feature'." | |
| 76 meanApplyMedianOfFeatureBySampleClassLevel <- sapply( | |
| 77 X = 1:n_features | |
| 78 , FUN = function(i) mean(medianOfFeatureBySampleClassLevel[, i + 1]) | |
| 79 ) | |
| 80 | |
| 81 # Compute the 'salience' for each feature, i.e., how salient the intensity of a feature | |
| 82 # is for one particular class-level relative to the intensity across all class-levels. | |
| 83 salience_df <- data.frame( | |
| 84 # the feature name | |
| 85 feature = features | |
| 86 # the name (or factor-level) of the class-level with the highest median intensity for the feature | |
| 87 , max_level = medianOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel,1] | |
| 88 # the median intensity for the feature and the level max_level | |
| 89 , max_median = sapply( | |
| 90 X = 1:n_features | |
| 91 , FUN = function(i) { | |
| 92 maxOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], 1 + i] | |
| 93 } | |
| 94 ) | |
| 95 # the coefficient of variation (expressed as a proportion) for the intensity for the feature and the level max_level | |
| 96 , max_rcv = sapply( | |
| 97 X = 1:n_features | |
| 98 , FUN = function(i) { | |
| 99 rcvOfFeatureBySampleClassLevel[maxApplyMaxOfFeatureBySampleClassLevel[i], i] | |
| 100 } | |
| 101 ) | |
| 102 # the mean of the medians of intensity for all class-levels for the feature | |
| 103 , mean_median = meanApplyMedianOfFeatureBySampleClassLevel | |
| 104 # don't coerce strings to factors (this is a parameter for the data.frame constructor, not a column of the data.frame) | |
| 105 , stringsAsFactors = FALSE | |
| 106 ) | |
| 107 # raw salience is the ratio of the most-prominent level to the mean of all levels for the feature | |
| 108 salience_df$salience <- sapply( | |
| 109 X = 1:nrow(salience_df) | |
| 110 , FUN = function(i) with(salience_df[i,], if (mean_median > 0) { max_median / mean_median } else { 0 } ) | |
| 111 ) | |
| 112 # "robust coefficient of variation, i.e., mad(feature-intensity for class-level max_level) / median(feature-intensity for class-level max_level) | |
| 113 salience_df$salient_rcv <- salience_df$max_rcv | |
| 114 | 125 |
| 115 return (salience_df) | 126 return (salience_df) |
| 116 } | 127 } |
| 117 | 128 |
