Mercurial > repos > bgruening > music_deconvolution
comparison scripts/estimateprops.R @ 3:064056f47ff2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/music/ commit 20f8561478535013e111d982b99639f48f1bea79"
| author | bgruening |
|---|---|
| date | Sat, 29 Jan 2022 12:49:59 +0000 |
| parents | f7ade72ba78c |
| children | 46fb28053b12 |
comparison
equal
deleted
inserted
replaced
| 2:1eb4f2f83579 | 3:064056f47ff2 |
|---|---|
| 15 samples = samples_label, select.ct = celltypes, verbose = T) | 15 samples = samples_label, select.ct = celltypes, verbose = T) |
| 16 | 16 |
| 17 estimated_music_props <- est_prop$Est.prop.weighted | 17 estimated_music_props <- est_prop$Est.prop.weighted |
| 18 estimated_nnls_props <- est_prop$Est.prop.allgene | 18 estimated_nnls_props <- est_prop$Est.prop.allgene |
| 19 | 19 |
| 20 scale_yaxes <- function(gplot, value) { | |
| 21 if (is.na(value)) { | |
| 22 gplot | |
| 23 } else { | |
| 24 gplot + scale_y_continuous(lim = c(0, value)) | |
| 25 } | |
| 26 } | |
| 27 | |
| 20 ## Show different in estimation methods | 28 ## Show different in estimation methods |
| 21 ## Jitter plot of estimated cell type proportions | 29 ## Jitter plot of estimated cell type proportions |
| 22 jitter_fig <- Jitter_Est( | 30 jitter_fig <- scale_yaxes(Jitter_Est( |
| 23 list(data.matrix(estimated_music_props), | 31 list(data.matrix(estimated_music_props), |
| 24 data.matrix(estimated_nnls_props)), | 32 data.matrix(estimated_nnls_props)), |
| 25 method.name = methods, title = "Jitter plot of Est Proportions", | 33 method.name = methods, title = "Jitter plot of Est Proportions", |
| 26 size = 2, alpha = 0.7) + theme_minimal() | 34 size = 2, alpha = 0.7) + theme_minimal(), maxyscale) |
| 27 | 35 |
| 28 | 36 |
| 29 ## Make a Plot | 37 ## Make a Plot |
| 30 ## A more sophisticated jitter plot is provided as below. We separated | 38 ## A more sophisticated jitter plot is provided as below. We separated |
| 31 ## the T2D subjects and normal subjects by their disease factor levels. | 39 ## the T2D subjects and normal subjects by their disease factor levels. |
| 40 celltypes <- levels(m_prop$CellType) | 48 celltypes <- levels(m_prop$CellType) |
| 41 message("No celltypes declared, using:") | 49 message("No celltypes declared, using:") |
| 42 message(celltypes) | 50 message(celltypes) |
| 43 } | 51 } |
| 44 | 52 |
| 45 if (phenotype_target_threshold == -99) { | |
| 46 phenotype_target_threshold <- -Inf | |
| 47 message("phenotype target threshold set to -Inf") | |
| 48 } | |
| 49 | |
| 50 if (is.null(phenotype_factors)) { | 53 if (is.null(phenotype_factors)) { |
| 51 phenotype_factors <- colnames(pData(bulk_eset)) | 54 phenotype_factors <- colnames(pData(bulk_eset)) |
| 52 } | 55 } |
| 53 ## filter out unwanted factors like "sampleID" and "subjectName" | 56 ## filter out unwanted factors like "sampleID" and "subjectName" |
| 54 phenotype_factors <- phenotype_factors[ | 57 phenotype_factors <- phenotype_factors[ |
| 55 !(phenotype_factors %in% phenotype_factors_always_exclude)] | 58 !(phenotype_factors %in% phenotype_factors_always_exclude)] |
| 56 message("Phenotype Factors to use:") | 59 message("Phenotype Factors to use:") |
| 57 message(phenotype_factors) | 60 message(paste0(phenotype_factors, collapse = ", ")) |
| 58 | |
| 59 | 61 |
| 60 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint | 62 m_prop$CellType <- factor(m_prop$CellType, levels = celltypes) # nolint |
| 61 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint | 63 m_prop$Method <- factor(rep(methods, each = nrow(estimated_music_props_flat)), # nolint |
| 62 levels = methods) | 64 levels = methods) |
| 63 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint | 65 |
| 64 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] | 66 if (use_disease_factor) { |
| 65 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 | 67 |
| 66 sample_groups <- c("Normal", sample_disease_group) | 68 if (phenotype_target_threshold == -99) { |
| 67 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint | 69 phenotype_target_threshold <- -Inf |
| 68 levels = sample_groups) | 70 message("phenotype target threshold set to -Inf") |
| 69 | 71 } |
| 70 ## Binary to scale: e.g. TRUE / 5 = 0.2 | 72 |
| 71 m_prop$D <- (m_prop$Disease == # nolint | 73 m_prop$Disease_factor <- rep(bulk_eset[[phenotype_target]], 2 * length(celltypes)) # nolint |
| 72 sample_disease_group) / sample_disease_group_scale | 74 m_prop <- m_prop[!is.na(m_prop$Disease_factor), ] |
| 73 ## NA's are not included in the comparison below | 75 ## Generate a TRUE/FALSE table of Normal == 1 and Disease == 2 |
| 74 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), | 76 sample_groups <- c("Normal", sample_disease_group) |
| 75 subset(m_prop, Disease == sample_disease_group)) | 77 m_prop$Disease <- factor(sample_groups[(m_prop$Disease_factor > phenotype_target_threshold) + 1], # nolint |
| 76 | 78 levels = sample_groups) |
| 77 jitter_new <- ggplot(m_prop, aes(Method, Prop)) + | 79 |
| 78 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), | 80 ## Binary to scale: e.g. TRUE / 5 = 0.2 |
| 79 size = 2, alpha = 0.7, | 81 m_prop$D <- (m_prop$Disease == # nolint |
| 80 position = position_jitter(width = 0.25, height = 0)) + | |
| 81 facet_wrap(~ CellType, scales = "free") + | |
| 82 scale_colour_manual(values = c("white", "gray20")) + | |
| 83 scale_shape_manual(values = c(21, 24)) + theme_minimal() | |
| 84 | |
| 85 ## Plot to compare method effectiveness | |
| 86 ## Create dataframe for beta cell proportions and Disease_factor levels | |
| 87 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint | |
| 88 phenotype_factors], | |
| 89 ct.prop = c(estimated_music_props[, 2], | |
| 90 estimated_nnls_props[, 2]), | |
| 91 Method = factor(rep(methods, | |
| 92 each = nrow(estimated_music_props)), | |
| 93 levels = methods)) | |
| 94 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint | |
| 95 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) | |
| 96 m_prop_ana$Disease <- factor(sample_groups[( # nolint | |
| 97 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1], | |
| 98 sample_groups) | |
| 99 m_prop_ana$D <- (m_prop_ana$Disease == # nolint | |
| 100 sample_disease_group) / sample_disease_group_scale | 82 sample_disease_group) / sample_disease_group_scale |
| 101 | 83 ## NA's are not included in the comparison below |
| 102 jitt_compare <- ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + | 84 m_prop <- rbind(subset(m_prop, Disease != sample_disease_group), |
| 103 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + | 85 subset(m_prop, Disease == sample_disease_group)) |
| 104 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), | 86 |
| 105 size = 2, alpha = 0.7) + facet_wrap(~ Method) + | 87 jitter_new <- scale_yaxes(ggplot(m_prop, aes(Method, Prop)) + |
| 106 ggtitle(compare_title) + theme_minimal() + | 88 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), |
| 107 scale_colour_manual(values = c("white", "gray20")) + | 89 size = 2, alpha = 0.7, |
| 108 scale_shape_manual(values = c(21, 24)) | 90 position = position_jitter(width = 0.25, height = 0)) + |
| 91 facet_wrap(~ CellType, scales = "free") + | |
| 92 scale_colour_manual(values = c("white", "gray20")) + | |
| 93 scale_shape_manual(values = c(21, 24)) + theme_minimal(), maxyscale) | |
| 94 | |
| 95 } | |
| 96 | |
| 97 if (use_disease_factor) { | |
| 98 | |
| 99 ## Plot to compare method effectiveness | |
| 100 ## Create dataframe for beta cell proportions and Disease_factor levels | |
| 101 ## - Ugly code. Essentially, doubles the cell type proportions for each | |
| 102 ## set of MuSiC and NNLS methods | |
| 103 m_prop_ana <- data.frame(pData(bulk_eset)[rep(1:nrow(estimated_music_props), 2), #nolint | |
| 104 phenotype_factors], | |
| 105 ## get proportions of target cell type | |
| 106 ct.prop = c(estimated_music_props[, phenotype_scrna_target], | |
| 107 estimated_nnls_props[, phenotype_scrna_target]), | |
| 108 ## | |
| 109 Method = factor(rep(methods, | |
| 110 each = nrow(estimated_music_props)), | |
| 111 levels = methods)) | |
| 112 ## - fix headers | |
| 113 colnames(m_prop_ana)[1:length(phenotype_factors)] <- phenotype_factors #nolint | |
| 114 ## - drop NA for target phenotype (e.g. hba1c) | |
| 115 m_prop_ana <- subset(m_prop_ana, !is.na(m_prop_ana[phenotype_target])) | |
| 116 m_prop_ana$Disease <- factor( # nolint | |
| 117 ## - Here we set Normal/Disease assignments across the two MuSiC and NNLS methods | |
| 118 sample_groups[( | |
| 119 m_prop_ana[phenotype_target] > phenotype_target_threshold) + 1 | |
| 120 ], | |
| 121 sample_groups) | |
| 122 ## - Then we scale this binary assignment to a plotable factor | |
| 123 m_prop_ana$D <- (m_prop_ana$Disease == # nolint | |
| 124 sample_disease_group) / sample_disease_group_scale | |
| 125 | |
| 126 jitt_compare <- scale_yaxes(ggplot(m_prop_ana, aes_string(phenotype_target, "ct.prop")) + | |
| 127 geom_smooth(method = "lm", se = FALSE, col = "black", lwd = 0.25) + | |
| 128 geom_point(aes(fill = Method, color = Disease, stroke = D, shape = Disease), | |
| 129 size = 2, alpha = 0.7) + facet_wrap(~ Method) + | |
| 130 ggtitle(paste0(toupper(phenotype_target), " vs. ", | |
| 131 toupper(phenotype_scrna_target), " Cell Type Proportion")) + | |
| 132 theme_minimal() + | |
| 133 ylab(paste0("Proportion of ", | |
| 134 phenotype_scrna_target, " cells")) + | |
| 135 xlab(paste0("Level of bulk factor (", phenotype_target, ")")) + | |
| 136 scale_colour_manual(values = c("white", "gray20")) + | |
| 137 scale_shape_manual(values = c(21, 24)), maxyscale) | |
| 138 } | |
| 109 | 139 |
| 110 ## BoxPlot | 140 ## BoxPlot |
| 111 plot_box <- Boxplot_Est(list( | 141 plot_box <- scale_yaxes(Boxplot_Est(list( |
| 112 data.matrix(estimated_music_props), | 142 data.matrix(estimated_music_props), |
| 113 data.matrix(estimated_nnls_props)), | 143 data.matrix(estimated_nnls_props)), |
| 114 method.name = c("MuSiC", "NNLS")) + | 144 method.name = c("MuSiC", "NNLS")) + |
| 115 theme(axis.text.x = element_text(angle = -90), | 145 theme(axis.text.x = element_text(angle = -90), |
| 116 axis.text.y = element_text(size = 8)) + | 146 axis.text.y = element_text(size = 8)) + |
| 117 ggtitle(element_blank()) + theme_minimal() | 147 ggtitle(element_blank()) + theme_minimal(), maxyscale) |
| 118 | 148 |
| 119 ## Heatmap | 149 ## Heatmap |
| 120 plot_hmap <- Prop_heat_Est(list( | 150 plot_hmap <- Prop_heat_Est(list( |
| 121 data.matrix(estimated_music_props), | 151 data.matrix(estimated_music_props), |
| 122 data.matrix(estimated_nnls_props)), | 152 data.matrix(estimated_nnls_props)), |
| 123 method.name = c("MuSiC", "NNLS")) + | 153 method.name = c("MuSiC", "NNLS")) + |
| 124 theme(axis.text.x = element_text(angle = -90), | 154 theme(axis.text.x = element_text(angle = -90), |
| 125 axis.text.y = element_text(size = 6)) | 155 axis.text.y = element_text(size = 6)) |
| 126 | 156 |
| 127 pdf(file = outfile_pdf, width = 8, height = 8) | 157 pdf(file = outfile_pdf, width = 8, height = 8) |
| 128 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) | 158 if (length(celltypes) <= 8) { |
| 129 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) | 159 plot_grid(jitter_fig, plot_box, labels = "auto", ncol = 1, nrow = 2) |
| 160 } else { | |
| 161 print(jitter_fig) | |
| 162 plot_box | |
| 163 } | |
| 164 if (use_disease_factor) { | |
| 165 plot_grid(jitter_new, jitt_compare, labels = "auto", ncol = 1, nrow = 2) | |
| 166 } | |
| 130 plot_hmap | 167 plot_hmap |
| 131 message(dev.off()) | 168 message(dev.off()) |
| 132 | 169 |
| 133 ## Output Proportions | 170 ## Output Proportions |
| 134 | 171 |
| 157 "Matrix of Variance of MuSiC Estimates", | 194 "Matrix of Variance of MuSiC Estimates", |
| 158 ".tabular"), | 195 ".tabular"), |
| 159 quote = F, sep = "\t", col.names = NA) | 196 quote = F, sep = "\t", col.names = NA) |
| 160 | 197 |
| 161 | 198 |
| 162 ## Summary table | 199 if (use_disease_factor) { |
| 163 for (meth in methods) { | 200 ## Summary table of linear regressions of disease factors |
| 164 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = | 201 for (meth in methods) { |
| 165 sub_data <- subset(m_prop_ana, Method == meth) | 202 ##lm_beta_meth = lm(ct.prop ~ age + bmi + hba1c + gender, data = |
| 166 ## We can only do regression where there are more than 1 factors | 203 sub_data <- subset(m_prop_ana, Method == meth) |
| 167 ## so we must find and exclude the ones which are not | 204 |
| 168 gt1_facts <- sapply(phenotype_factors, function(facname) { | 205 ## We can only do regression where there are more than 1 factors |
| 169 return(length(unique(sort(sub_data[[facname]]))) == 1) | 206 ## so we must find and exclude the ones which are not |
| 170 }) | 207 gt1_facts <- sapply(phenotype_factors, function(facname) { |
| 171 form_factors <- phenotype_factors | 208 return(length(unique(sort(sub_data[[facname]]))) == 1) |
| 172 exclude_facts <- names(gt1_facts)[gt1_facts] | 209 }) |
| 173 if (length(exclude_facts) > 0) { | 210 form_factors <- phenotype_factors |
| 174 message("Factors with only one level will be excluded:") | 211 exclude_facts <- names(gt1_facts)[gt1_facts] |
| 175 message(exclude_facts) | 212 if (length(exclude_facts) > 0) { |
| 176 form_factors <- phenotype_factors[ | 213 message("Factors with only one level will be excluded:") |
| 177 !(phenotype_factors %in% exclude_facts)] | 214 message(exclude_facts) |
| 215 form_factors <- phenotype_factors[ | |
| 216 !(phenotype_factors %in% exclude_facts)] | |
| 217 } | |
| 218 lm_beta_meth <- lm(as.formula( | |
| 219 paste("ct.prop", paste(form_factors, collapse = " + "), | |
| 220 sep = " ~ ")), data = sub_data) | |
| 221 message(paste0("Summary: ", meth)) | |
| 222 capture.output(summary(lm_beta_meth), | |
| 223 file = paste0("report_data/summ_Log of ", | |
| 224 meth, | |
| 225 " fitting.txt")) | |
| 178 } | 226 } |
| 179 lm_beta_meth <- lm(as.formula( | 227 } |
| 180 paste("ct.prop", paste(form_factors, collapse = " + "), | |
| 181 sep = " ~ ")), data = sub_data) | |
| 182 message(paste0("Summary: ", meth)) | |
| 183 capture.output(summary(lm_beta_meth), | |
| 184 file = paste0("report_data/summ_Log of ", | |
| 185 meth, | |
| 186 " fitting.txt")) | |
| 187 } |
